// Various helper functions and utilities

#pragma once

#include "ggml-opt.h"
#include "llama-cpp.h"

#include <map>
#include <set>
#include <sstream>
#include <string>
#include <string_view>
#include <type_traits>
#include <vector>

#if defined(_WIN32) && !defined(_WIN32_WINNT)
#define _WIN32_WINNT 0x0A00
#endif

#ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\'
#else
#define DIRECTORY_SEPARATOR '/'
#endif // _WIN32

#define die(msg)          do { fputs("error: " msg "\n", stderr);                exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
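
// Example (illustrative sketch only): aborting on an unrecoverable error.
// `path` is a hypothetical variable, not part of this header.
//
//     FILE * f = fopen(path, "rb");
//     if (f == NULL) {
//         die_fmt("failed to open '%s'", path);
//     }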

#define print_build_info() do {                                                                     \
    fprintf(stderr, "%s: build = %d (%s)\n",      __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);      \
    fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);    \
} while(0)

struct common_time_meas {
    common_time_meas(int64_t & t_acc, bool disable = false);
    ~common_time_meas();

    const int64_t t_start_us;

    int64_t & t_acc;
};
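
// Example (illustrative sketch only): accumulating wall-clock time with RAII.
// `t_total_us` is a hypothetical accumulator; the destructor presumably adds
// the elapsed microseconds since construction to it.
//
//     int64_t t_total_us = 0;
//     {
//         common_time_meas tm(t_total_us);
//         // ... timed work ...
//     } // elapsed time is accumulated into t_total_us here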

struct common_adapter_lora_info {
    std::string path;
    float scale;

    std::string task_name;
    std::string prompt_prefix;

    struct llama_adapter_lora * ptr;
};

using llama_tokens = std::vector<llama_token>;

// build info
extern int LLAMA_BUILD_NUMBER;
extern const char * LLAMA_COMMIT;
extern const char * LLAMA_COMPILER;
extern const char * LLAMA_BUILD_TARGET;

const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);

struct common_control_vector_load_info;

//
// CPU utils
//

struct cpu_params {
    int      n_threads                   = -1;
    bool     cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask
    bool     mask_valid                  = false;   // Default: any CPU
    enum ggml_sched_priority priority    = GGML_SCHED_PRIO_NORMAL; // Scheduling prio: (0 - normal, 1 - medium, 2 - high, 3 - realtime)
    bool     strict_cpu                  = false;   // Use strict CPU placement
    uint32_t poll                        = 50;      // Polling (busywait) level (0 - no polling, 100 - mostly polling)
};
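
// Example (illustrative sketch only): pinning work to the first 4 logical CPUs.
//
//     cpu_params cp;
//     cp.n_threads = 4;
//     for (int i = 0; i < 4; ++i) {
//         cp.cpumask[i] = true;
//     }
//     cp.mask_valid = true;
//     cp.strict_cpu = true;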

int32_t cpu_get_num_physical_cores();
int32_t cpu_get_num_math();

//
// Common params
//

enum llama_example {
    LLAMA_EXAMPLE_BATCHED,
    LLAMA_EXAMPLE_DEBUG,
    LLAMA_EXAMPLE_COMMON,
    LLAMA_EXAMPLE_SPECULATIVE,
    LLAMA_EXAMPLE_COMPLETION,
    LLAMA_EXAMPLE_CLI,
    LLAMA_EXAMPLE_EMBEDDING,
    LLAMA_EXAMPLE_PERPLEXITY,
    LLAMA_EXAMPLE_RETRIEVAL,
    LLAMA_EXAMPLE_PASSKEY,
    LLAMA_EXAMPLE_IMATRIX,
    LLAMA_EXAMPLE_BENCH,
    LLAMA_EXAMPLE_SERVER,
    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
    LLAMA_EXAMPLE_EXPORT_LORA,
    LLAMA_EXAMPLE_MTMD,
    LLAMA_EXAMPLE_LOOKUP,
    LLAMA_EXAMPLE_PARALLEL,
    LLAMA_EXAMPLE_TTS,
    LLAMA_EXAMPLE_DIFFUSION,
    LLAMA_EXAMPLE_FINETUNE,
    LLAMA_EXAMPLE_FIT_PARAMS,

    LLAMA_EXAMPLE_COUNT,
};

enum common_sampler_type {
    COMMON_SAMPLER_TYPE_NONE        = 0,
    COMMON_SAMPLER_TYPE_DRY         = 1,
    COMMON_SAMPLER_TYPE_TOP_K       = 2,
    COMMON_SAMPLER_TYPE_TOP_P       = 3,
    COMMON_SAMPLER_TYPE_MIN_P       = 4,
  //COMMON_SAMPLER_TYPE_TFS_Z       = 5,
    COMMON_SAMPLER_TYPE_TYPICAL_P   = 6,
    COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
    COMMON_SAMPLER_TYPE_XTC         = 8,
    COMMON_SAMPLER_TYPE_INFILL      = 9,
    COMMON_SAMPLER_TYPE_PENALTIES   = 10,
    COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
    COMMON_SAMPLER_TYPE_ADAPTIVE_P  = 12,
};

// dimensionality reduction methods, used by cvector-generator
enum dimre_method {
    DIMRE_METHOD_PCA,
    DIMRE_METHOD_MEAN,
};

enum common_conversation_mode {
    COMMON_CONVERSATION_MODE_DISABLED = 0,
    COMMON_CONVERSATION_MODE_ENABLED  = 1,
    COMMON_CONVERSATION_MODE_AUTO     = 2,
};

enum common_grammar_trigger_type {
    COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
    COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
};

struct common_grammar_trigger {
    common_grammar_trigger_type type;
    std::string value;
    llama_token token = LLAMA_TOKEN_NULL;
};

enum common_params_sampling_config : uint64_t {
    COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS        = 1 << 0,
    COMMON_PARAMS_SAMPLING_CONFIG_TOP_K           = 1 << 1,
    COMMON_PARAMS_SAMPLING_CONFIG_TOP_P           = 1 << 2,
    COMMON_PARAMS_SAMPLING_CONFIG_MIN_P           = 1 << 3,
    COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY = 1 << 4,
    COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD   = 1 << 5,
    COMMON_PARAMS_SAMPLING_CONFIG_TEMP            = 1 << 6,
    COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N  = 1 << 7,
    COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT  = 1 << 8,
    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT        = 1 << 9,
    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU    = 1 << 10,
    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA    = 1 << 11,
};
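
// Example (illustrative sketch only): recording and testing whether the user
// explicitly set a sampling parameter, using the bitfield above.
//
//     uint64_t cfg = 0;
//     cfg |= COMMON_PARAMS_SAMPLING_CONFIG_TOP_K; // e.g. the user passed --top-k
//     if (cfg & COMMON_PARAMS_SAMPLING_CONFIG_TOP_K) {
//         // keep the user's top_k instead of a model-suggested default
//     }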

enum common_speculative_type {
    COMMON_SPECULATIVE_TYPE_NONE,          // no speculative decoding
    COMMON_SPECULATIVE_TYPE_DRAFT,         // draft model
    COMMON_SPECULATIVE_TYPE_EAGLE3,        // eagle draft model
    COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,  // simple self-speculative decoding
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K,   // self-speculative decoding with n-gram keys only
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
    COMMON_SPECULATIVE_TYPE_NGRAM_MOD,
    COMMON_SPECULATIVE_TYPE_NGRAM_CACHE,   // self-speculative decoding with 3-level n-gram cache
    COMMON_SPECULATIVE_TYPE_COUNT          // number of types, unknown type
};

// sampling parameters
struct common_params_sampling {
    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler

    int32_t n_prev             = 64;    // number of previous tokens to remember
    int32_t n_probs            = 0;     // if greater than 0, output the probabilities of top n_probs tokens
    int32_t min_keep           = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
    int32_t top_k              = 40;    // <= 0 to use vocab size
    float   top_p              = 0.95f; // 1.0 = disabled
    float   min_p              = 0.05f; // 0.0 = disabled
    float   xtc_probability    = 0.00f; // 0.0 = disabled
    float   xtc_threshold      = 0.10f; // > 0.5 disables XTC
    float   typ_p              = 1.00f; // typical_p, 1.0 = disabled
    float   temp               = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
    float   dynatemp_range     = 0.00f; // 0.0 = disabled
    float   dynatemp_exponent  = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
    int32_t penalty_last_n     = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
    float   penalty_repeat     = 1.00f; // 1.0 = disabled
    float   penalty_freq       = 0.00f; // 0.0 = disabled
    float   penalty_present    = 0.00f; // 0.0 = disabled
    float   dry_multiplier     = 0.0f;  // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
    float   dry_base           = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
    int32_t dry_allowed_length = 2;     // tokens extending repetitions beyond this receive penalty
    int32_t dry_penalty_last_n = -1;    // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
    float   adaptive_target    = -1.0f; // select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
    float   adaptive_decay     = 0.90f; // EMA decay for adaptation; history ≈ 1/(1-decay) tokens (0.0 - 0.99)
    int32_t mirostat           = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
    float   top_n_sigma        = -1.00f; // -1.0 = disabled
    float   mirostat_tau       = 5.00f; // target entropy
    float   mirostat_eta       = 0.10f; // learning rate
    bool    ignore_eos         = false;
    bool    no_perf            = false; // disable performance metrics
    bool    timing_per_token   = false;

    uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers

    std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY

    std::vector<enum common_sampler_type> samplers = {
        COMMON_SAMPLER_TYPE_PENALTIES,
        COMMON_SAMPLER_TYPE_DRY,
        COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
        COMMON_SAMPLER_TYPE_TOP_K,
        COMMON_SAMPLER_TYPE_TYPICAL_P,
        COMMON_SAMPLER_TYPE_TOP_P,
        COMMON_SAMPLER_TYPE_MIN_P,
        COMMON_SAMPLER_TYPE_XTC,
        COMMON_SAMPLER_TYPE_TEMPERATURE,
    };

    std::string grammar; // optional BNF-like grammar to constrain sampling
    bool grammar_lazy = false;
    std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
    std::set<llama_token> preserved_tokens;

    std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
    std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens

    bool backend_sampling = false;

    bool has_logit_bias() const {
        return !logit_bias.empty();
    }

    // print the parameters into a string
    std::string print() const;
};
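
// Example (illustrative sketch only): near-greedy decoding with a mild
// repetition penalty.
//
//     common_params_sampling sparams;
//     sparams.temp           = 0.0f; // <= 0.0 samples greedily
//     sparams.penalty_repeat = 1.1f;
//     sparams.penalty_last_n = 256;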

struct common_params_model {
    std::string path        = ""; // model local path // NOLINT
    std::string url         = ""; // model url to download // NOLINT
    std::string hf_repo     = ""; // HF repo // NOLINT
    std::string hf_file     = ""; // HF file // NOLINT
    std::string docker_repo = ""; // Docker repo // NOLINT
    std::string name        = ""; // in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
};

struct common_ngram_mod;

struct common_params_speculative {
    common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding

    // general-purpose speculative decoding parameters

    int32_t n_max   = 16;    // maximum number of tokens to draft during speculative decoding
    int32_t n_min   = 0;     // minimum number of draft tokens to use for speculative decoding
    float   p_split = 0.1f;  // speculative decoding split probability
    float   p_min   = 0.75f; // minimum speculative decoding probability (greedy)

    // ngram-based speculative decoding

    uint16_t ngram_size_n   = 12; // ngram size for lookup
    uint16_t ngram_size_m   = 48; // mgram size for speculative tokens
    uint16_t ngram_min_hits = 1;  // minimum hits at ngram/mgram lookup for mgram to be proposed

    std::shared_ptr<common_ngram_mod> ngram_mod;

    std::string lookup_cache_static;  // path of static ngram cache file for lookup decoding // NOLINT
    std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding // NOLINT

    // draft-model speculative decoding

    struct common_params_model mparams_dft;

    llama_model * model_dft = nullptr; // a llama_model that can be shared by multiple speculative contexts

    llama_context_params cparams_dft; // these are the parameters for the draft llama_context

    int32_t n_ctx        = 0;  // draft context size
    int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)

    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

    struct cpu_params cpuparams;
    struct cpu_params cpuparams_batch;

    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

    std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;

    bool has_dft() const {
        return !mparams_dft.path.empty() || !mparams_dft.hf_repo.empty();
    }
};
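
// Example (illustrative sketch only): enabling draft-model speculative decoding.
// The path is hypothetical.
//
//     common_params_speculative spec;
//     spec.type             = COMMON_SPECULATIVE_TYPE_DRAFT;
//     spec.mparams_dft.path = "models/draft.gguf";
//     spec.n_max            = 16;
//     GGML_ASSERT(spec.has_dft());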

struct common_params_vocoder {
    struct common_params_model model;

    std::string speaker_file = ""; // speaker file path // NOLINT

    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
};

struct common_params_diffusion {
    int32_t steps       = 128;
    bool    visual_mode = false;

    float   eps          = 0; // epsilon for timesteps
    int32_t block_length = 0; // block length for generation

    int32_t algorithm = 4;    // default algorithm: low-confidence
    float   alg_temp  = 0.0f; // algorithm temperature

    float   cfg_scale        = 0;     // classifier-free guidance scale
    bool    add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
};

// reasoning API response format (not to be confused with the chat template's reasoning format)
// only used by the server
enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,
    COMMON_REASONING_FORMAT_AUTO,            // Same as deepseek, using `message.reasoning_content`
    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
    COMMON_REASONING_FORMAT_DEEPSEEK,        // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
    // do not extend this enum unless you absolutely have to
    // in most cases, use COMMON_REASONING_FORMAT_AUTO
    // see: https://github.com/ggml-org/llama.cpp/pull/15408
};


struct lr_opt {
    float    lr0          = 1e-5f; // learning rate at first epoch
    float    lr_min       = -1;
    float    decay_epochs = -1;    // if > 0, the learning rate starts at lr0 and decays to lr_min after this many epochs
    float    scale_epoch  = 0;
    float    wd           = 0;
    unsigned epochs       = 2;

    unsigned epoch; // set by the optimizer's outer (epochs) loop
    // learning rate decay - constant LR per epoch only for now
    float get_lr(float e) const;
    float get_lr() const { return get_lr(epoch); }
    // must be called after arg parsing, before get_lr
    void init();
};
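
// Example (illustrative sketch only): typical lifecycle of lr_opt.
//
//     lr_opt lr;
//     lr.lr0          = 1e-4f;
//     lr.lr_min       = 1e-5f;
//     lr.decay_epochs = 10;
//     lr.init();                       // must precede get_lr()
//     for (unsigned e = 0; e < lr.epochs; ++e) {
//         lr.epoch = e;
//         const float cur_lr = lr.get_lr();
//         // ... run one training epoch with cur_lr ...
//     }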

struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);

struct common_params {
    int32_t n_predict       = -1;    // max. number of new tokens to predict, -1 == no limit
    int32_t n_ctx           = 0;     // context size, 0 == context the model was trained with
    int32_t n_batch         = 2048;  // logical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_ubatch        = 512;   // physical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep          = 0;     // number of tokens to keep from initial prompt
    int32_t n_chunks        = -1;    // max number of chunks to process (-1 = unlimited)
    int32_t n_parallel      = 1;     // number of parallel sequences to decode
    int32_t n_sequences     = 1;     // number of sequences to decode
    int32_t grp_attn_n      = 1;     // group-attention factor
    int32_t grp_attn_w      = 512;   // group-attention width
    int32_t n_print         = -1;    // print token count every n tokens (-1 = disabled)
    float   rope_freq_base  = 0.0f;  // RoPE base frequency
    float   rope_freq_scale = 0.0f;  // RoPE frequency scaling factor
    float   yarn_ext_factor  = -1.0f; // YaRN extrapolation mix factor
    float   yarn_attn_factor = -1.0f; // YaRN magnitude scaling factor
    float   yarn_beta_fast   = -1.0f; // YaRN low correction dim
    float   yarn_beta_slow   = -1.0f; // YaRN high correction dim
    int32_t yarn_orig_ctx    = 0;     // YaRN original context length

    // offload params
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

    int32_t n_gpu_layers       = -1;   // number of layers to store in VRAM, -1 is auto, <= -2 is all
    int32_t main_gpu           = 0;    // the GPU that is used for scratch and small tensors
    float   tensor_split[128]  = {0};  // how split tensors should be distributed across GPUs
    bool    fit_params         = true; // whether to fit unset model/context parameters to free device memory
    int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use

    // margin per device in bytes for fitting parameters to free memory:
    std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024*1024*1024);

    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs

    struct cpu_params cpuparams;
    struct cpu_params cpuparams_batch;

    ggml_backend_sched_eval_callback cb_eval = nullptr;
    void * cb_eval_user_data                 = nullptr;

    ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

    enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
    enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED;   // pooling type for embeddings
    enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
    enum llama_flash_attn_type   flash_attn_type   = LLAMA_FLASH_ATTN_TYPE_AUTO;       // whether to use Flash Attention

    struct common_params_sampling    sampling;
    struct common_params_speculative speculative;
    struct common_params_vocoder     vocoder;
    struct common_params_diffusion   diffusion;

    struct common_params_model model;

    std::string model_alias       = ""; // model alias // NOLINT
    std::string hf_token          = ""; // HF token // NOLINT
    std::string prompt            = ""; // NOLINT
    std::string system_prompt     = ""; // NOLINT
    std::string prompt_file       = ""; // store the external prompt file name // NOLINT
    std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
    std::string input_prefix      = ""; // string to prefix user inputs with // NOLINT
    std::string input_suffix      = ""; // string to suffix user inputs with // NOLINT
    std::string logits_file       = ""; // file for saving *all* logits // NOLINT

    // llama-debug specific options
    std::string logits_output_dir = "data"; // directory for saving logits output files // NOLINT
    bool save_logits = false;               // whether to save logits to files // NOLINT
    std::vector<std::string> tensor_filter; // filter tensor names for debug output (regex) // NOLINT

    std::vector<std::string> in_files;   // all input files
    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
    std::vector<llama_model_kv_override> kv_overrides;
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;

    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
    std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale

    std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale

    int32_t verbosity                  = 3;  // LOG_LEVEL_INFO
    int32_t control_vector_layer_start = -1; // layer range for control vector
    int32_t control_vector_layer_end   = -1; // layer range for control vector
    bool    offline                    = false;

    int32_t ppl_stride      = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
    int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
                                 //   (which is more convenient to use for plotting)
                                 //
    bool   hellaswag       = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
    size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score

    bool   winogrande       = false; // compute Winogrande score over random tasks from datafile supplied in prompt
    size_t winogrande_tasks = 0;     // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed

    bool   multiple_choice       = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
    size_t multiple_choice_tasks = 0;     // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed

    bool kl_divergence = false; // compute KL divergence

    bool usage             = false; // print usage
    bool completion        = false; // print source-able completion script
    bool use_color         = false; // use color to distinguish generations and inputs
    bool special           = false; // enable special token output
    bool interactive       = false; // interactive mode
    bool interactive_first = false; // wait for user input immediately
    bool prompt_cache_all  = false; // save user input and generations to prompt cache
    bool prompt_cache_ro   = false; // open the prompt cache read-only and do not update it

    bool escape          = true;  // escape "\n", "\r", "\t", "\'", "\"", and "\\"
    bool multiline_input = false; // reverse the usage of `\`
    bool simple_io       = false; // improves compatibility with subprocesses and limited consoles
    bool cont_batching   = true;  // insert new sequences for decoding on-the-fly
    bool no_perf         = false; // disable performance metrics
    bool show_timings    = true;  // show timing information on CLI
    bool ctx_shift       = false; // context shift on infinite text generation
    bool swa_full        = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
    bool kv_unified      = false; // enable unified KV cache

    bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
    bool use_mmap         = true;  // enable mmap to use filesystem cache
    bool use_direct_io    = false; // read from disk without buffering
    bool use_mlock        = false; // use mlock to keep model in memory
    bool verbose_prompt   = false; // print prompt tokens before generation
    bool display_prompt   = true;  // print prompt before generation
    bool no_kv_offload    = false; // disable KV offloading
    bool warmup           = true;  // warmup run
    bool check_tensors    = false; // validate tensor data
    bool no_op_offload    = false; // globally disable offload host tensor operations to device
    bool no_extra_bufts   = false; // disable extra buffer types (used for weight repacking)
    bool no_host          = false; // bypass host buffer allowing extra buffers to be used

    bool single_turn = false; // single turn chat conversation

    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

    common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;

    // multimodal models (see tools/mtmd)
    struct common_params_model mmproj;
    bool mmproj_use_gpu = true;     // use GPU for multimodal model
    bool no_mmproj      = false;    // explicitly disable multimodal model
    std::vector<std::string> image; // path to image file(s)
    int image_min_tokens = -1;
    int image_max_tokens = -1;

    // finetune
    struct lr_opt lr;
    enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
    float val_split = 0.05f; // fraction of the data used for the validation set

    // embedding
    bool        embedding      = false; // get only sentence embedding
    int32_t     embd_normalize = 2;     // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
    std::string embd_out       = "";    // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
    std::string embd_sep       = "\n";  // separator of embeddings
    std::string cls_sep        = "\t";  // separator of classification sequences

    // server params
    int32_t port              = 8080;         // server listens on this network port
    int32_t timeout_read      = 600;          // http read timeout in seconds
    int32_t timeout_write     = timeout_read; // http write timeout in seconds
    int32_t n_threads_http    = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
    int32_t n_cache_reuse     = 0;            // min chunk size to reuse from the cache via KV shifting
    bool    cache_prompt      = true;         // whether to enable prompt caching
    int32_t n_ctx_checkpoints = 8;            // max number of context checkpoints per slot
    int32_t cache_ram_mib     = 8192;         // -1 = no limit, 0 = disable, 1 = 1 MiB, etc.

    std::string hostname      = "127.0.0.1";
    std::string public_path   = "";   // NOLINT
    std::string api_prefix    = "";   // NOLINT
    std::string chat_template = "";   // NOLINT
    bool use_jinja            = true; // NOLINT
    bool enable_chat_template = true;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
    int  reasoning_budget     = -1;
    bool prefill_assistant    = true; // if true, any trailing assistant message will be prefilled into the response
    int  sleep_idle_seconds   = -1;   // if > 0, the server will sleep after this many seconds of idle time

    std::vector<std::string> api_keys;

    std::string ssl_file_key  = ""; // NOLINT
    std::string ssl_file_cert = ""; // NOLINT

    std::map<std::string, std::string> default_template_kwargs;

    // webui configs
    bool webui = true;
    std::string webui_config_json;

    // "advanced" endpoints are disabled by default for better security
    bool endpoint_slots   = true;
    bool endpoint_props   = false; // only controls POST requests, not GET
    bool endpoint_metrics = false;

    // router server configs
    std::string models_dir    = "";   // directory containing models for the router server
    std::string models_preset = "";   // directory containing model presets for the router server
    int  models_max           = 4;    // maximum number of models to load simultaneously
    bool models_autoload      = true; // automatically load models when requested via the router server

    bool log_json = false;

    std::string slot_save_path;
    std::string media_path; // path to directory for loading media files

    float slot_prompt_similarity = 0.1f;

    // batched-bench params
    bool is_pp_shared   = false;
    bool is_tg_separate = false;

    std::vector<int32_t> n_pp;
    std::vector<int32_t> n_tg;
    std::vector<int32_t> n_pl;

    // retrieval params
    std::vector<std::string> context_files; // context files to embed

    int32_t chunk_size = 64; // chunk size for context embedding

    std::string chunk_separator = "\n"; // chunk separator for context embedding

    // passkey params
    int32_t n_junk = 250; // number of times to repeat the junk text
    int32_t i_pos  = -1;  // position of the passkey in the junk text

    // imatrix params
    int32_t n_out_freq  = 10; // output the imatrix every n_out_freq iterations
    int32_t n_save_freq = 0;  // save the imatrix every n_save_freq iterations
    int32_t i_chunk     = 0;  // start processing from this chunk
    int8_t  imat_dat    = 0;  // whether the legacy imatrix.dat format should be output (gguf <= 0 < dat)

    bool process_output  = false; // collect data for the output tensor
    bool compute_ppl     = true;  // whether to compute perplexity
    bool show_statistics = false; // show imatrix statistics per tensor
    bool parse_special   = false; // whether to parse special tokens during imatrix tokenization

    // cvector-generator params
    int n_pca_batch      = 100;
    int n_pca_iterations = 1000;
    dimre_method cvector_dimre_method  = DIMRE_METHOD_PCA;
    std::string  cvector_positive_file = "tools/cvector-generator/positive.txt";
    std::string  cvector_negative_file = "tools/cvector-generator/negative.txt";

    bool spm_infill = false; // suffix/prefix/middle pattern for infill

    // batched-bench params
    bool batched_bench_output_jsonl = false;

    // common params
    std::string out_file; // output filename for all example programs

    // optional callback for model loading progress and cancellation:
    // called with a progress value between 0.0 and 1.0;
    // return false from the callback to abort model loading, or true to continue
    llama_progress_callback load_progress_callback = NULL;
    void * load_progress_callback_user_data        = NULL;
};
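
// Example (illustrative sketch only): overriding a few defaults before
// initialization. The model path is hypothetical.
//
//     common_params params;
//     params.model.path    = "models/model.gguf";
//     params.n_ctx         = 8192;
//     params.n_gpu_layers  = 99;
//     params.sampling.temp = 0.7f;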

// call once at the start of a program if it uses libcommon
// initializes the logging system and prints info about the build
void common_init();

std::string common_params_get_system_info(const common_params & params);

bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]);
bool parse_cpu_mask (const std::string & mask,  bool (&boolmask)[GGML_MAX_N_THREADS]);
void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
bool set_process_priority(enum ggml_sched_priority prio);
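
// Example (illustrative sketch only): filling an affinity mask from a range
// string such as "0-3", then applying the usual post-processing.
//
//     cpu_params cp;
//     if (parse_cpu_range("0-3", cp.cpumask)) {
//         cp.mask_valid = true;
//     }
//     postprocess_cpu_params(cp);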

//
// String utils
//

#ifdef __GNUC__
# if defined(__MINGW32__) && !defined(__clang__)
#  define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
# else
#  define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
# endif
#else
# define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
#endif

LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
std::string string_format(const char * fmt, ...);

std::string string_strip(const std::string & str);
std::string string_get_sortable_timestamp();

std::string string_join(const std::vector<std::string> & values, const std::string & separator);
std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
std::string string_repeat(const std::string & str, size_t n);

void string_replace_all(std::string & s, const std::string & search, const std::string & replace);

std::string regex_escape(const std::string & s);

template<class T>
static std::vector<T> string_split(const std::string & str, char delim) {
    static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
    std::vector<T> values;
    std::istringstream str_stream(str);
    std::string token;
    while (std::getline(str_stream, token, delim)) {
        T value;
        std::istringstream token_stream(token);
        token_stream >> value;
        values.push_back(value);
    }
    return values;
}
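
// Example (illustrative sketch only): splitting a comma-separated list of
// integers, e.g. a command-line argument.
//
//     std::vector<int32_t> sizes = string_split<int32_t>("128,256,512", ',');
//     // sizes == {128, 256, 512}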

template<>
std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
{
    std::vector<std::string> parts;
    size_t begin_pos = 0;
    size_t separator_pos = input.find(separator);
    while (separator_pos != std::string::npos) {
        std::string part = input.substr(begin_pos, separator_pos - begin_pos);
        parts.emplace_back(part);
        begin_pos = separator_pos + 1;
        separator_pos = input.find(separator, begin_pos);
    }
    parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos));
    return parts;
}

static bool string_starts_with(const std::string & str,
                               const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
    return str.rfind(prefix, 0) == 0;
}

// While we wait for C++20's std::string::ends_with...
bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
bool string_remove_suffix(std::string & str, const std::string_view & suffix);
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);

bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);

std::string string_from(bool value);
std::string string_from(const std::vector<int> & values);
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);

//
// Filesystem utils
//

bool fs_validate_filename(const std::string & filename, bool allow_subdirs = false);
bool fs_create_directory_with_parents(const std::string & path);
bool fs_is_directory(const std::string & path);

std::string fs_get_cache_directory();
std::string fs_get_cache_file(const std::string & filename);

struct common_file_info {
    std::string path;
    std::string name;
    size_t size   = 0;     // in bytes
    bool   is_dir = false;
};
std::vector<common_file_info> fs_list(const std::string & path, bool include_directories);
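
// Example (illustrative sketch only): enumerating GGUF files in a directory.
// The directory name is hypothetical.
//
//     for (const common_file_info & info : fs_list("models", /*include_directories=*/false)) {
//         if (string_ends_with(info.name, ".gguf")) {
//             printf("%s (%zu bytes)\n", info.path.c_str(), info.size);
//         }
//     }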

//
// TTY utils
//

// Auto-detect if colors can be enabled based on terminal and environment
bool tty_can_use_colors();

//
// Model utils
//

struct common_sampler;

// note: defines the lifetimes of the model, context, samplers, etc.
struct common_init_result {
    common_init_result(common_params & params);
    ~common_init_result();

    llama_model   * model();
    llama_context * context();

    common_sampler * sampler(llama_seq_id seq_id);
    void reset_samplers();

    std::vector<llama_adapter_lora_ptr> & lora();

private:
    struct impl;
    std::unique_ptr<impl> pimpl;
};

using common_init_result_ptr = std::unique_ptr<common_init_result>;

common_init_result_ptr common_init_from_params(common_params & params);
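
// Example (illustrative sketch only; assumes the accessors return nullptr when
// loading failed): loading a model and grabbing the context.
//
//     common_params params;
//     params.model.path = "models/model.gguf"; // hypothetical path
//     common_init_result_ptr init = common_init_from_params(params);
//     if (init == nullptr || init->model() == nullptr) {
//         die("failed to load the model");
//     }
//     llama_context * lctx = init->context();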

struct llama_model_params     common_model_params_to_llama  (      common_params & params);
struct llama_context_params   common_context_params_to_llama(const common_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

// clear LoRA adapters from context, then apply new list of adapters
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);

std::string get_model_endpoint();

//
// Batch utils
//

void common_batch_clear(struct llama_batch & batch);

void common_batch_add(
    struct llama_batch & batch,
    llama_token id,
    llama_pos pos,
    const std::vector<llama_seq_id> & seq_ids,
    bool logits);
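
// Example (illustrative sketch only): building a batch of prompt tokens where
// only the last token requests logits. `prompt_tokens` is a hypothetical
// llama_tokens vector.
//
//     llama_batch batch = llama_batch_init(512, 0, 1);
//     for (size_t i = 0; i < prompt_tokens.size(); ++i) {
//         const bool need_logits = (i == prompt_tokens.size() - 1);
//         common_batch_add(batch, prompt_tokens[i], (llama_pos) i, { 0 }, need_logits);
//     }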

//
// Vocab utils
//

// tokenizes a string into a vector of tokens
// should work similar to Python's `tokenizer.encode`
std::vector<llama_token> common_tokenize(
    const struct llama_context * ctx,
    const std::string & text,
    bool add_special,
    bool parse_special = false);

std::vector<llama_token> common_tokenize(
    const struct llama_vocab * vocab,
    const std::string & text,
    bool add_special,
    bool parse_special = false);

// renders a single token into a piece, optionally rendering special/control tokens
// should work similar to Python's `tokenizer.id_to_piece`
std::string common_token_to_piece(
    const struct llama_context * ctx,
    llama_token token,
    bool special = true);

std::string common_token_to_piece(
    const struct llama_vocab * vocab,
    llama_token token,
    bool special = true);

// detokenizes a vector of tokens into a string
// should work similar to Python's `tokenizer.decode`
// optionally renders special/control tokens
std::string common_detokenize(
    const struct llama_context * ctx,
    const std::vector<llama_token> & tokens,
    bool special = true);

std::string common_detokenize(
    const struct llama_vocab * vocab,
    const std::vector<llama_token> & tokens,
    bool special = true);
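
// Example (illustrative sketch only): a tokenize/detokenize round trip.
// `lctx` is a hypothetical llama_context pointer.
//
//     const llama_tokens toks = common_tokenize(lctx, "Hello world", /*add_special=*/true);
//     const std::string  text = common_detokenize(lctx, toks, /*special=*/false);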

//
// Embedding utils
//

// TODO: replace embd_norm with an enum
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);

float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
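
// Example (illustrative sketch only): Euclidean-normalizing two embeddings and
// comparing them with cosine similarity. `n_embd`, `a`, and `b` are hypothetical.
//
//     std::vector<float> na(n_embd), nb(n_embd);
//     common_embd_normalize(a, na.data(), n_embd, /*embd_norm=*/2); // 2 = euclidean
//     common_embd_normalize(b, nb.data(), n_embd, /*embd_norm=*/2);
//     const float sim = common_embd_similarity_cos(na.data(), nb.data(), n_embd);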

//
// Control vector utils
//

struct common_control_vector_data {
    int n_embd;

    // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
    std::vector<float> data;
};

struct common_control_vector_load_info {
    float strength;

    std::string fname;
};

// Load control vectors, scale each by strength, and add them together.
// On error, returns {-1, empty}
common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);

//
// Split utils
//

namespace {

const char * const LLM_KV_SPLIT_NO            = "split.no";
const char * const LLM_KV_SPLIT_COUNT         = "split.count";
const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";

}

//
// MoE utils
//

const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";

static std::string llm_ffn_exps_block_regex(int idx) {
    return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
}

static llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
    return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
}
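
// Example (illustrative sketch only): keeping the expert FFN tensors of a MoE
// model on the CPU to save VRAM, via the override above.
//
//     common_params params;
//     params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());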

//
// training utils
//

ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);

// "adamw" or "sgd" (case insensitive)
enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);