1#pragma once
2
3#include "llama.h"
4
5#include <cstdint>
6
7#define LLAMA_MAX_SEQ 256
8
// Per-context runtime parameters.
// NOTE(review): presumably resolved from the public llama_context_params at
// context creation time (cf. llama_hparams for model-trained parameters) — confirm.
struct llama_cparams {
    uint32_t n_ctx;           // context size used during inference
    uint32_t n_ctx_seq;       // context for a single sequence
    uint32_t n_batch;         // logical maximum batch size (tokens per llama_decode call)
    uint32_t n_ubatch;        // physical maximum batch size (micro-batch)
    uint32_t n_seq_max;       // max number of distinct sequence ids (upper-bounded by LLAMA_MAX_SEQ)
    int32_t  n_threads;       // number of threads to use for generation
    int32_t  n_threads_batch; // number of threads to use for batch processing

    float rope_freq_base;     // RoPE base frequency
    float rope_freq_scale;    // RoPE frequency scaling factor

    uint32_t n_ctx_orig_yarn; // original context size the model was trained on (YaRN scaling reference)
    // These hyperparameters are not exposed in GGUF, because all
    // existing YaRN models use the same values for them.
    float yarn_ext_factor;    // YaRN extrapolation mix factor
    float yarn_attn_factor;   // YaRN attention magnitude scaling factor
    float yarn_beta_fast;     // YaRN low-correction dim
    float yarn_beta_slow;     // YaRN high-correction dim

    bool embeddings;          // extract embeddings (together with logits)
    bool causal_attn;         // use a causal attention mask
    bool offload_kqv;         // offload the KQV ops (including the KV cache) to GPU
    bool flash_attn;          // use flash attention
    bool auto_fa;             // NOTE(review): presumably auto-select flash attention when not set explicitly — confirm
    bool no_perf;             // disable performance timing measurements
    bool warmup;              // NOTE(review): presumably set during the initial warmup decode — confirm
    bool op_offload;          // offload host tensor operations to device
    bool kv_unified;          // use a single unified KV buffer shared across sequences
    bool pipeline_parallel;   // NOTE(review): presumably enables pipeline parallelism across devices — confirm

    enum llama_pooling_type pooling_type; // pooling applied to embeddings (none/mean/cls/...)

    ggml_backend_sched_eval_callback cb_eval; // optional callback invoked by the backend scheduler during graph eval
    void * cb_eval_user_data;                 // opaque user data passed to cb_eval
};