Diffstat (limited to 'llama.cpp/src/llama-cparams.h')
 llama.cpp/src/llama-cparams.h | 44 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+), 0 deletions(-)
diff --git a/llama.cpp/src/llama-cparams.h b/llama.cpp/src/llama-cparams.h
new file mode 100644
index 0000000..2da3bbd
--- /dev/null
+++ b/llama.cpp/src/llama-cparams.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include "llama.h"
+
+#include <cstdint>
+
+#define LLAMA_MAX_SEQ 256 // compile-time cap on the number of parallel sequences
+
+struct llama_cparams {
+ uint32_t n_ctx; // context size used during inference
+ uint32_t n_ctx_seq; // context size for a single sequence
+ uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
+ uint32_t n_ubatch; // physical maximum batch size (micro-batch)
+ uint32_t n_seq_max; // maximum number of sequences (distinct sequence ids)
+ int32_t n_threads; // number of threads to use for generation
+ int32_t n_threads_batch; // number of threads to use for batch processing
+
+ float rope_freq_base; // RoPE base frequency
+ float rope_freq_scale; // RoPE frequency scaling factor
+
+ uint32_t n_ctx_orig_yarn; // YaRN original context size
+ // These hyperparameters are not exposed in GGUF, because all
+ // existing YaRN models use the same values for them.
+ float yarn_ext_factor; // YaRN extrapolation mix factor
+ float yarn_attn_factor; // YaRN magnitude scaling factor
+ float yarn_beta_fast; // YaRN low correction dim
+ float yarn_beta_slow; // YaRN high correction dim
+
+ bool embeddings; // extract embeddings (together with logits)
+ bool causal_attn; // use causal attention
+ bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
+ bool flash_attn; // use flash attention
+ bool auto_fa; // resolve flash attention support automatically
+ bool no_perf; // disable internal performance timings
+ bool warmup; // this is a warmup run (e.g. to preload weights)
+ bool op_offload; // offload host tensor operations to device
+ bool kv_unified; // use a single unified KV buffer across the input sequences
+ bool pipeline_parallel; // split the computation across devices in a pipeline
+
+ enum llama_pooling_type pooling_type; // pooling type used for embeddings
+
+ ggml_backend_sched_eval_callback cb_eval; // callback invoked during graph evaluation (e.g. to inspect tensors)
+ void * cb_eval_user_data; // opaque user data passed to cb_eval
+};
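
For context, here is a minimal sketch of how the public llama_context_params from llama.h could be resolved into this internal llama_cparams struct. The helper name make_cparams and the exact clamping details are assumptions for illustration; upstream performs this mapping inside the llama_context constructor, and only a representative subset of fields is shown.

// Sketch only: make_cparams is a hypothetical helper, not the upstream code.
#include <algorithm>

#include "llama-cparams.h"

static llama_cparams make_cparams(const llama_context_params & params, uint32_t n_ctx_train) {
    llama_cparams cparams = {};

    // assumption: 0 means "use the model's training context size"
    cparams.n_ctx     = params.n_ctx == 0 ? n_ctx_train : params.n_ctx;
    cparams.n_batch   = params.n_batch;
    // the physical micro-batch can never exceed the logical batch
    cparams.n_ubatch  = std::min(params.n_batch, params.n_ubatch);
    // the compile-time cap from this header bounds the sequence count
    cparams.n_seq_max = std::min<uint32_t>(params.n_seq_max, LLAMA_MAX_SEQ);

    cparams.n_threads       = params.n_threads;
    cparams.n_threads_batch = params.n_threads_batch;

    cparams.rope_freq_base  = params.rope_freq_base;
    cparams.rope_freq_scale = params.rope_freq_scale;

    cparams.yarn_ext_factor  = params.yarn_ext_factor;
    cparams.yarn_attn_factor = params.yarn_attn_factor;
    cparams.yarn_beta_fast   = params.yarn_beta_fast;
    cparams.yarn_beta_slow   = params.yarn_beta_slow;

    cparams.embeddings   = params.embeddings;
    cparams.offload_kqv  = params.offload_kqv;
    cparams.no_perf      = params.no_perf;
    cparams.pooling_type = params.pooling_type;

    cparams.cb_eval           = params.cb_eval;
    cparams.cb_eval_user_data = params.cb_eval_user_data;

    return cparams;
}

Fields with no direct public counterpart (e.g. n_ctx_seq, auto_fa, pipeline_parallel) are presumably derived internally from the other parameters and the model/device configuration rather than set by the caller.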