#pragma once

#include "llama.h"

#include <cstdint>

 7#define LLAMA_MAX_SEQ 256
 8
 9struct llama_cparams {
10    uint32_t n_ctx;           // context size used during inference
11    uint32_t n_ctx_seq;       // context for a single sequence
12    uint32_t n_batch;
13    uint32_t n_ubatch;
14    uint32_t n_seq_max;
15    int32_t  n_threads;       // number of threads to use for generation
16    int32_t  n_threads_batch; // number of threads to use for batch processing
17
18    float rope_freq_base;
19    float rope_freq_scale;
20
21    uint32_t n_ctx_orig_yarn;
22    // These hyperparameters are not exposed in GGUF, because all
23    // existing YaRN models use the same values for them.
24    float yarn_ext_factor;
25    float yarn_attn_factor;
26    float yarn_beta_fast;
27    float yarn_beta_slow;
28
29    bool embeddings;
30    bool causal_attn;
31    bool offload_kqv;
32    bool flash_attn;
33    bool auto_fa;
34    bool no_perf;
35    bool warmup;
36    bool op_offload;
37    bool kv_unified;
38    bool pipeline_parallel;
39
40    enum llama_pooling_type pooling_type;
41
42    ggml_backend_sched_eval_callback cb_eval;
43    void * cb_eval_user_data;
44};