Diffstat (limited to 'llama.cpp/include')
-rw-r--r--  llama.cpp/include/llama-cpp.h    32
-rw-r--r--  llama.cpp/include/llama.h      1568
2 files changed, 1600 insertions, 0 deletions
diff --git a/llama.cpp/include/llama-cpp.h b/llama.cpp/include/llama-cpp.h
new file mode 100644
index 0000000..807e77f
--- /dev/null
+++ b/llama.cpp/include/llama-cpp.h
@@ -0,0 +1,32 @@
1#pragma once
2
3#ifndef __cplusplus
4#error "This header is for C++ only"
5#endif
6
7#include <memory>
8
9#include "llama.h"
10
11struct llama_model_deleter {
12 void operator()(llama_model * model) { llama_model_free(model); }
13};
14
15struct llama_context_deleter {
16 void operator()(llama_context * context) { llama_free(context); }
17};
18
19struct llama_sampler_deleter {
20 void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
21};
22
23struct llama_adapter_lora_deleter {
24 void operator()(llama_adapter_lora *) {
25 // llama_adapter_lora_free is deprecated
26 }
27};
28
29typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
30typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
31typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;
32typedef std::unique_ptr<llama_adapter_lora, llama_adapter_lora_deleter> llama_adapter_lora_ptr;
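// Example (editor's sketch, not part of the upstream header): the deleters above let the
// C handles participate in RAII. A hypothetical loader using these typedefs could look like:
//
//    #include "llama-cpp.h"
//
//    llama_model_ptr load_model(const char * path) {           // "path" is an assumed input
//        llama_model_params mparams = llama_model_default_params();
//        return llama_model_ptr(llama_model_load_from_file(path, mparams));
//    }
//
//    // llama_model_ptr   model = load_model("model.gguf");    // hypothetical file name
//    // llama_context_ptr ctx(llama_init_from_model(model.get(), llama_context_default_params()));
//    // both handles are freed automatically when the smart pointers go out of scope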
diff --git a/llama.cpp/include/llama.h b/llama.cpp/include/llama.h
new file mode 100644
index 0000000..46c3672
--- /dev/null
+++ b/llama.cpp/include/llama.h
@@ -0,0 +1,1568 @@
1#ifndef LLAMA_H
2#define LLAMA_H
3
4#include "ggml.h"
5#include "ggml-cpu.h"
6#include "ggml-backend.h"
7#include "ggml-opt.h"
8
9#include <stddef.h>
10#include <stdint.h>
11#include <stdio.h>
12#include <stdbool.h>
13
14#ifdef LLAMA_SHARED
15# if defined(_WIN32) && !defined(__MINGW32__)
16# ifdef LLAMA_BUILD
17# define LLAMA_API __declspec(dllexport)
18# else
19# define LLAMA_API __declspec(dllimport)
20# endif
21# else
22# define LLAMA_API __attribute__ ((visibility ("default")))
23# endif
24#else
25# define LLAMA_API
26#endif
27
28#ifdef __GNUC__
29# define DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
30#elif defined(_MSC_VER)
31# define DEPRECATED(func, hint) __declspec(deprecated(hint)) func
32#else
33# define DEPRECATED(func, hint) func
34#endif
35
36#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
37
38#define LLAMA_TOKEN_NULL -1
39
40#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
41#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
42#define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'
43
44#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
45#define LLAMA_SESSION_VERSION 9
46
47#define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ
48#define LLAMA_STATE_SEQ_VERSION 2
49
50#ifdef __cplusplus
51extern "C" {
52#endif
53
54 //
55 // C interface
56 //
57 // TODO: show sample usage
58 //
59
60 struct llama_vocab;
61 struct llama_model;
62 struct llama_context;
63 struct llama_sampler;
64
65 typedef struct llama_memory_i * llama_memory_t;
66
67 typedef int32_t llama_pos;
68 typedef int32_t llama_token;
69 typedef int32_t llama_seq_id;
70
71 enum llama_vocab_type {
72 LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
73 LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
74 LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
75 LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
76 LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
77 LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
78 LLAMA_VOCAB_TYPE_PLAMO2 = 6, // PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming
79 };
80
81 enum llama_rope_type {
82 LLAMA_ROPE_TYPE_NONE = -1,
83 LLAMA_ROPE_TYPE_NORM = 0,
84 LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
85 LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE,
86 LLAMA_ROPE_TYPE_IMROPE = GGML_ROPE_TYPE_IMROPE,
87 LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION,
88 };
89
90 enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
91 LLAMA_TOKEN_TYPE_UNDEFINED = 0,
92 LLAMA_TOKEN_TYPE_NORMAL = 1,
93 LLAMA_TOKEN_TYPE_UNKNOWN = 2,
94 LLAMA_TOKEN_TYPE_CONTROL = 3,
95 LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
96 LLAMA_TOKEN_TYPE_UNUSED = 5,
97 LLAMA_TOKEN_TYPE_BYTE = 6,
98 };
99
100 enum llama_token_attr {
101 LLAMA_TOKEN_ATTR_UNDEFINED = 0,
102 LLAMA_TOKEN_ATTR_UNKNOWN = 1 << 0,
103 LLAMA_TOKEN_ATTR_UNUSED = 1 << 1,
104 LLAMA_TOKEN_ATTR_NORMAL = 1 << 2,
105 LLAMA_TOKEN_ATTR_CONTROL = 1 << 3, // SPECIAL?
106 LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 4,
107 LLAMA_TOKEN_ATTR_BYTE = 1 << 5,
108 LLAMA_TOKEN_ATTR_NORMALIZED = 1 << 6,
109 LLAMA_TOKEN_ATTR_LSTRIP = 1 << 7,
110 LLAMA_TOKEN_ATTR_RSTRIP = 1 << 8,
111 LLAMA_TOKEN_ATTR_SINGLE_WORD = 1 << 9,
112 };
113
114 // model file types
115 enum llama_ftype {
116 LLAMA_FTYPE_ALL_F32 = 0,
117 LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
118 LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
119 LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
120 // LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
121 // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
122 // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
123 LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
124 LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
125 LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
126 LLAMA_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
127 LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, // except 1d tensors
128 LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, // except 1d tensors
129 LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, // except 1d tensors
130 LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, // except 1d tensors
131 LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, // except 1d tensors
132 LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
133 LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
134 LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
135 LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
136 LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
137 LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
138 LLAMA_FTYPE_MOSTLY_IQ3_XS = 22, // except 1d tensors
139 LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
140 LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors
141 LLAMA_FTYPE_MOSTLY_IQ4_NL = 25, // except 1d tensors
142 LLAMA_FTYPE_MOSTLY_IQ3_S = 26, // except 1d tensors
143 LLAMA_FTYPE_MOSTLY_IQ3_M = 27, // except 1d tensors
144 LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors
145 LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
146 LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
147 LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
148 LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
149 //LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // removed from gguf files, use Q4_0 and runtime repack
150 //LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // removed from gguf files, use Q4_0 and runtime repack
151 //LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
152 LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
153 LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
154 LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
155
156 LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
157 };
158
159 enum llama_rope_scaling_type {
160 LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1,
161 LLAMA_ROPE_SCALING_TYPE_NONE = 0,
162 LLAMA_ROPE_SCALING_TYPE_LINEAR = 1,
163 LLAMA_ROPE_SCALING_TYPE_YARN = 2,
164 LLAMA_ROPE_SCALING_TYPE_LONGROPE = 3,
165 LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_LONGROPE,
166 };
167
168 enum llama_pooling_type {
169 LLAMA_POOLING_TYPE_UNSPECIFIED = -1,
170 LLAMA_POOLING_TYPE_NONE = 0,
171 LLAMA_POOLING_TYPE_MEAN = 1,
172 LLAMA_POOLING_TYPE_CLS = 2,
173 LLAMA_POOLING_TYPE_LAST = 3,
174 LLAMA_POOLING_TYPE_RANK = 4, // used by reranking models to attach the classification head to the graph
175 };
176
177 enum llama_attention_type {
178 LLAMA_ATTENTION_TYPE_UNSPECIFIED = -1,
179 LLAMA_ATTENTION_TYPE_CAUSAL = 0,
180 LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
181 };
182
183 enum llama_flash_attn_type {
184 LLAMA_FLASH_ATTN_TYPE_AUTO = -1,
185 LLAMA_FLASH_ATTN_TYPE_DISABLED = 0,
186 LLAMA_FLASH_ATTN_TYPE_ENABLED = 1,
187 };
188
189 LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type);
190
191 enum llama_split_mode {
192 LLAMA_SPLIT_MODE_NONE = 0, // single GPU
193 LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
194 LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported
195 };
196
197 // TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)
198 typedef struct llama_token_data {
199 llama_token id; // token id
200 float logit; // log-odds of the token
201 float p; // probability of the token
202 } llama_token_data;
203
204 typedef struct llama_token_data_array {
205 // TODO: consider SoA
206 // NOTE: this pointer can be modified by the samplers
207 llama_token_data * data;
208 size_t size;
209 int64_t selected; // this is the index in the data array (i.e. not the token id)
210 bool sorted; // note: do not assume the data is sorted - always check this flag
211 } llama_token_data_array;
212
213 typedef bool (*llama_progress_callback)(float progress, void * user_data);
214
215 // Input data for llama_encode/llama_decode
216 // A llama_batch object can contain input about one or many sequences
217 // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
218 //
219 // - token : the token ids of the input (used when embd is NULL)
220 // - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
221 // - pos : the positions of the respective token in the sequence
222 // (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode)
223 // - seq_id : the sequence to which the respective token belongs
224 // (if set to NULL, the sequence ID will be assumed to be 0)
225 // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
226 // (if set to NULL:
227 // - if embeddings: all tokens are output
228 // - if not: only the last token is output
229 // )
230 //
231 typedef struct llama_batch {
232 int32_t n_tokens;
233
234 llama_token * token;
235 float * embd;
236 llama_pos * pos;
237 int32_t * n_seq_id;
238 llama_seq_id ** seq_id;
239 int8_t * logits; // TODO: rename this to "output"
240 } llama_batch;
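// Example (editor's sketch, not part of the upstream header): filling a batch by hand for a
// single sequence, using llama_batch_init()/llama_batch_free() declared further below.
// "prompt_tokens" and "n_prompt" are assumed inputs.
//
//    llama_batch batch = llama_batch_init(n_prompt, 0, 1);
//    for (int32_t i = 0; i < n_prompt; i++) {
//        batch.token   [i]    = prompt_tokens[i];
//        batch.pos     [i]    = i;
//        batch.n_seq_id[i]    = 1;
//        batch.seq_id  [i][0] = 0;                    // every token belongs to sequence 0
//        batch.logits  [i]    = (i == n_prompt - 1);  // request output only for the last token
//    }
//    batch.n_tokens = n_prompt;
//    // ... llama_decode(ctx, batch); ...
//    llama_batch_free(batch);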
241
242 enum llama_model_kv_override_type {
243 LLAMA_KV_OVERRIDE_TYPE_INT,
244 LLAMA_KV_OVERRIDE_TYPE_FLOAT,
245 LLAMA_KV_OVERRIDE_TYPE_BOOL,
246 LLAMA_KV_OVERRIDE_TYPE_STR,
247 };
248
249 enum llama_model_meta_key {
250 LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE,
251 LLAMA_MODEL_META_KEY_SAMPLING_TOP_K,
252 LLAMA_MODEL_META_KEY_SAMPLING_TOP_P,
253 LLAMA_MODEL_META_KEY_SAMPLING_MIN_P,
254 LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY,
255 LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD,
256 LLAMA_MODEL_META_KEY_SAMPLING_TEMP,
257 LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N,
258 LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT,
259 LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT,
260 LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU,
261 LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA,
262 };
263
264 struct llama_model_kv_override {
265 enum llama_model_kv_override_type tag;
266
267 char key[128];
268
269 union {
270 int64_t val_i64;
271 double val_f64;
272 bool val_bool;
273 char val_str[128];
274 };
275 };
276
277 struct llama_model_tensor_buft_override {
278 const char * pattern;
279 ggml_backend_buffer_type_t buft;
280 };
281
282 struct llama_model_params {
283 // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
284 ggml_backend_dev_t * devices;
285
286 // NULL-terminated list of buffer types to use for tensors that match a pattern
287 const struct llama_model_tensor_buft_override * tensor_buft_overrides;
288
289 int32_t n_gpu_layers; // number of layers to store in VRAM, a negative value means all layers
290 enum llama_split_mode split_mode; // how to split the model across multiple GPUs
291
292 // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
293 int32_t main_gpu;
294
295 // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
296 const float * tensor_split;
297
298 // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
299 // If the provided progress_callback returns true, model loading continues.
300 // If it returns false, model loading is immediately aborted.
301 llama_progress_callback progress_callback;
302
303 // context pointer passed to the progress callback
304 void * progress_callback_user_data;
305
306 // override key-value pairs of the model meta data
307 const struct llama_model_kv_override * kv_overrides;
308
309 // Keep the booleans together to avoid misalignment during copy-by-value.
310 bool vocab_only; // only load the vocabulary, no weights
311 bool use_mmap; // use mmap if possible
312 bool use_direct_io; // use direct io, takes precedence over use_mmap when supported
313 bool use_mlock; // force system to keep model in RAM
314 bool check_tensors; // validate model tensor data
315 bool use_extra_bufts; // use extra buffer types (used for weight repacking)
316 bool no_host; // bypass host buffer allowing extra buffers to be used
317 bool no_alloc; // only load metadata and simulate memory allocations
318 };
319
320 struct llama_sampler_seq_config {
321 llama_seq_id seq_id;
322 struct llama_sampler * sampler;
323 };
324
325 // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
326 // https://github.com/ggml-org/llama.cpp/pull/7544
327 struct llama_context_params {
328 uint32_t n_ctx; // text context, 0 = from model
329 uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
330 uint32_t n_ubatch; // physical maximum batch size
331 uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
332 int32_t n_threads; // number of threads to use for generation
333 int32_t n_threads_batch; // number of threads to use for batch processing
334
335 enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
336 enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
337 enum llama_attention_type attention_type; // attention type to use for embeddings
338 enum llama_flash_attn_type flash_attn_type; // when to enable Flash Attention
339
340 // ref: https://github.com/ggml-org/llama.cpp/pull/2054
341 float rope_freq_base; // RoPE base frequency, 0 = from model
342 float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
343 float yarn_ext_factor; // YaRN extrapolation mix factor, negative = from model
344 float yarn_attn_factor; // YaRN magnitude scaling factor
345 float yarn_beta_fast; // YaRN low correction dim
346 float yarn_beta_slow; // YaRN high correction dim
347 uint32_t yarn_orig_ctx; // YaRN original context size
348 float defrag_thold; // [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default)
349
350 ggml_backend_sched_eval_callback cb_eval;
351 void * cb_eval_user_data;
352
353 enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
354 enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
355
356 // Abort callback
357 // if it returns true, execution of llama_decode() will be aborted
358 // currently works only with CPU execution
359 ggml_abort_callback abort_callback;
360 void * abort_callback_data;
361
362 // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
363 bool embeddings; // if true, extract embeddings (together with logits)
364 bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
 365 bool no_perf; // disable performance timing measurements
366 bool op_offload; // offload host tensor operations to device
367 bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
368 // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
369 // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
370 bool kv_unified; // use a unified buffer across the input sequences when computing the attention
371 // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
372 // ref: https://github.com/ggml-org/llama.cpp/pull/14363
373
374 // [EXPERIMENTAL]
375 // backend sampler chain configuration (make sure the caller keeps the sampler chains alive)
376 // note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init)
377 struct llama_sampler_seq_config * samplers;
378 size_t n_samplers;
379 };
380
381 // model quantization parameters
382 typedef struct llama_model_quantize_params {
383 int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
384 enum llama_ftype ftype; // quantize to this llama_ftype
385 enum ggml_type output_tensor_type; // output tensor type
386 enum ggml_type token_embedding_type; // token embeddings tensor type
387 bool allow_requantize; // allow quantizing non-f32/f16 tensors
388 bool quantize_output_tensor; // quantize output.weight
389 bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
390 bool pure; // quantize all tensors to the default type
391 bool keep_split; // quantize to the same number of shards
392 void * imatrix; // pointer to importance matrix data
393 void * kv_overrides; // pointer to vector containing overrides
394 void * tensor_types; // pointer to vector containing tensor types
395 void * prune_layers; // pointer to vector containing layer indices to prune
396 } llama_model_quantize_params;
397
398 typedef struct llama_logit_bias {
399 llama_token token;
400 float bias;
401 } llama_logit_bias;
402
403 typedef struct llama_sampler_chain_params {
 404 bool no_perf; // whether to disable performance timing measurements
405 } llama_sampler_chain_params;
406
407 // used in chat template
408 typedef struct llama_chat_message {
409 const char * role;
410 const char * content;
411 } llama_chat_message;
412
413 // lora adapter
414 struct llama_adapter_lora;
415
416 // Helpers for getting default parameters
417 // TODO: update API to start accepting pointers to params structs (https://github.com/ggml-org/llama.cpp/discussions/9172)
418 LLAMA_API struct llama_model_params llama_model_default_params(void);
419 LLAMA_API struct llama_context_params llama_context_default_params(void);
420 LLAMA_API struct llama_sampler_chain_params llama_sampler_chain_default_params(void);
421 LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
422
423 // Initialize the llama + ggml backend
424 // If numa is true, use NUMA optimizations
425 // Call once at the start of the program
426 LLAMA_API void llama_backend_init(void);
427
428 // Call once at the end of the program - currently only used for MPI
429 LLAMA_API void llama_backend_free(void);
430
 431 // optional:
432 LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
433
434 // Optional: an auto threadpool gets created in ggml if not passed explicitly
435 LLAMA_API void llama_attach_threadpool(
436 struct llama_context * ctx,
437 ggml_threadpool_t threadpool,
438 ggml_threadpool_t threadpool_batch);
439
440 LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
441
442 DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
443 const char * path_model,
444 struct llama_model_params params),
445 "use llama_model_load_from_file instead");
446
447 // Load the model from a file
448 // If the file is split into multiple parts, the file name must follow this pattern: <name>-%05d-of-%05d.gguf
449 // If the split file name does not follow this pattern, use llama_model_load_from_splits
450 LLAMA_API struct llama_model * llama_model_load_from_file(
451 const char * path_model,
452 struct llama_model_params params);
453
454 // Load the model from multiple splits (support custom naming scheme)
455 // The paths must be in the correct order
456 LLAMA_API struct llama_model * llama_model_load_from_splits(
457 const char ** paths,
458 size_t n_paths,
459 struct llama_model_params params);
460
461 LLAMA_API void llama_model_save_to_file(
462 const struct llama_model * model,
463 const char * path_model);
464
465 DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
466 "use llama_model_free instead");
467
468 LLAMA_API void llama_model_free(struct llama_model * model);
469
470 LLAMA_API struct llama_context * llama_init_from_model(
471 struct llama_model * model,
472 struct llama_context_params params);
473
474 DEPRECATED(LLAMA_API struct llama_context * llama_new_context_with_model(
475 struct llama_model * model,
476 struct llama_context_params params),
477 "use llama_init_from_model instead");
478
479 // Frees all allocated memory
480 LLAMA_API void llama_free(struct llama_context * ctx);
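// Example (editor's sketch, not part of the upstream header): a typical lifecycle with the
// C API. "path" is an assumed model path.
//
//    llama_backend_init();
//
//    llama_model_params   mparams = llama_model_default_params();
//    llama_context_params cparams = llama_context_default_params();
//
//    llama_model * model = llama_model_load_from_file(path, mparams);
//    if (model == NULL) { /* handle the load failure */ }
//
//    llama_context * ctx = llama_init_from_model(model, cparams);
//    if (ctx == NULL) { /* handle the context creation failure */ }
//
//    // ... encode / decode / sample ...
//
//    llama_free(ctx);
//    llama_model_free(model);
//    llama_backend_free();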
481
482 enum llama_params_fit_status {
483 LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
484 LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
485 LLAMA_PARAMS_FIT_STATUS_ERROR = 2, // a hard error occurred, e.g. because no model could be found at the specified path
486 };
487
 488 // fits mparams and cparams to the free device memory (assumes system memory is unlimited)
 489 // - returns LLAMA_PARAMS_FIT_STATUS_SUCCESS if the parameters could be modified to fit device memory
 490 // - this function is NOT thread safe because it modifies the global llama logger state
 491 // - only parameters that have the same value as in llama_model_default_params are modified,
 492 // with the exception of the context size, which is modified if and only if it is equal to 0
493 LLAMA_API enum llama_params_fit_status llama_params_fit(
494 const char * path_model,
495 struct llama_model_params * mparams,
496 struct llama_context_params * cparams,
497 float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements
498 struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
499 size_t * margins, // margins of memory to leave per device in bytes
500 uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use
501 enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log
502
503 LLAMA_API int64_t llama_time_us(void);
504
505 LLAMA_API size_t llama_max_devices(void);
506 LLAMA_API size_t llama_max_parallel_sequences(void);
507 LLAMA_API size_t llama_max_tensor_buft_overrides(void);
508
509 LLAMA_API bool llama_supports_mmap (void);
510 LLAMA_API bool llama_supports_mlock (void);
511 LLAMA_API bool llama_supports_gpu_offload(void);
512 LLAMA_API bool llama_supports_rpc (void);
513
514 // NOTE: After creating a llama_context, it is recommended to query the actual values using these functions
515 // In some cases the requested values via llama_context_params may differ from the actual values used by the context
516 // ref: https://github.com/ggml-org/llama.cpp/pull/17046#discussion_r2503085732
517 LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
518 LLAMA_API uint32_t llama_n_ctx_seq (const struct llama_context * ctx);
519 LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
520 LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
521 LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);
522
523 DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead");
524 DEPRECATED(LLAMA_API int32_t llama_n_embd (const struct llama_model * model), "use llama_model_n_embd instead");
525 DEPRECATED(LLAMA_API int32_t llama_n_layer (const struct llama_model * model), "use llama_model_n_layer instead");
526 DEPRECATED(LLAMA_API int32_t llama_n_head (const struct llama_model * model), "use llama_model_n_head instead");
527
528 DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
529
530 LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx);
531 LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx);
532 LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type
533
534 LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
535 LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);
536
537 LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
538 LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model);
539 LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
540 LLAMA_API int32_t llama_model_n_embd_out (const struct llama_model * model);
541 LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
542 LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
543 LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
544 LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model);
545
546 // Get the model's RoPE frequency scaling factor
547 LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
548
549 // Returns the number of classifier outputs (only valid for classifier models)
550 // Undefined behavior for non-classifier models
551 LLAMA_API uint32_t llama_model_n_cls_out(const struct llama_model * model);
552
553 // Returns label of classifier output by index (<n_cls_out). Returns nullptr if no label provided
554 LLAMA_API const char * llama_model_cls_label(const struct llama_model * model, uint32_t i);
555
556 LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab);
557
558 LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);
559
560 // Functions to access the model's GGUF metadata scalar values
561 // - The functions return the length of the string on success, or -1 on failure
562 // - The output string is always null-terminated and cleared on failure
563 // - When retrieving a string, an extra byte must be allocated to account for the null terminator
564 // - GGUF array values are not supported by these functions
565
566 // Get metadata value as a string by key name
567 LLAMA_API int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
568
569 // Get the number of metadata key/value pairs
570 LLAMA_API int32_t llama_model_meta_count(const struct llama_model * model);
571
572 // Get sampling metadata key name. Returns nullptr if the key is invalid
573 LLAMA_API const char * llama_model_meta_key_str(enum llama_model_meta_key key);
574
575 // Get metadata key name by index
576 LLAMA_API int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
577
578 // Get metadata value as a string by index
579 LLAMA_API int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
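// Example (editor's sketch, not part of the upstream header): reading a metadata value into a
// fixed buffer. "model" is an assumed handle; "general.architecture" is a standard GGUF key.
//
//    char arch[128];
//    if (llama_model_meta_val_str(model, "general.architecture", arch, sizeof(arch)) >= 0) {
//        // arch now holds the null-terminated value
//    }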
580
581 // Get a string describing the model type
582 LLAMA_API int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
583
584 // Returns the total size of all the tensors in the model in bytes
585 LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
586
587 // Get the default chat template. Returns nullptr if not available
588 // If name is NULL, returns the default chat template
589 LLAMA_API const char * llama_model_chat_template(const struct llama_model * model, const char * name);
590
591 // Returns the total number of parameters in the model
592 LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
593
594 // Returns true if the model contains an encoder that requires llama_encode() call
595 LLAMA_API bool llama_model_has_encoder(const struct llama_model * model);
596
597 // Returns true if the model contains a decoder that requires llama_decode() call
598 LLAMA_API bool llama_model_has_decoder(const struct llama_model * model);
599
 600 // For encoder-decoder models, this function returns the id of the token that must be provided
 601 // to the decoder to start generating the output sequence. For other models, it returns -1.
602 LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model);
603
604 // Returns true if the model is recurrent (like Mamba, RWKV, etc.)
605 LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);
606
607 // Returns true if the model is hybrid (like Jamba, Granite, etc.)
608 LLAMA_API bool llama_model_is_hybrid(const struct llama_model * model);
609
610 // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
611 LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);
612
613 // Returns 0 on success
614 LLAMA_API uint32_t llama_model_quantize(
615 const char * fname_inp,
616 const char * fname_out,
617 const llama_model_quantize_params * params);
618
619 //
620 // Adapters
621 //
622
623 // Load a LoRA adapter from file
624 // The adapter is valid as long as the associated model is not freed
625 // All adapters must be loaded before context creation
626 LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
627 struct llama_model * model,
628 const char * path_lora);
629
630 // Functions to access the adapter's GGUF metadata scalar values
631 // - The functions return the length of the string on success, or -1 on failure
632 // - The output string is always null-terminated and cleared on failure
633 // - When retrieving a string, an extra byte must be allocated to account for the null terminator
634 // - GGUF array values are not supported by these functions
635
636 // Get metadata value as a string by key name
637 LLAMA_API int32_t llama_adapter_meta_val_str(const struct llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size);
638
639 // Get the number of metadata key/value pairs
640 LLAMA_API int32_t llama_adapter_meta_count(const struct llama_adapter_lora * adapter);
641
642 // Get metadata key name by index
643 LLAMA_API int32_t llama_adapter_meta_key_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
644
645 // Get metadata value as a string by index
646 LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
647
648 // Manually free a LoRA adapter
 649 // NOTE: loaded adapters will be freed when the associated model is deleted
650 LLAMA_API DEPRECATED(void llama_adapter_lora_free(struct llama_adapter_lora * adapter),
651 "adapters are now freed together with the associated model");
652
653 // Get the invocation tokens if the current lora is an alora
654 LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
655 LLAMA_API const llama_token * llama_adapter_get_alora_invocation_tokens (const struct llama_adapter_lora * adapter);
656
657 // The following functions operate on a llama_context, hence the naming: llama_verb_...
658
659 // Add a loaded LoRA adapter to given context
 660 // This will not modify the model's weights
661 LLAMA_API int32_t llama_set_adapter_lora(
662 struct llama_context * ctx,
663 struct llama_adapter_lora * adapter,
664 float scale);
665
666 // Remove a specific LoRA adapter from given context
667 // Return -1 if the adapter is not present in the context
668 LLAMA_API int32_t llama_rm_adapter_lora(
669 struct llama_context * ctx,
670 struct llama_adapter_lora * adapter);
671
672 // Remove all LoRA adapters from given context
673 LLAMA_API void llama_clear_adapter_lora(struct llama_context * ctx);
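// Example (editor's sketch, not part of the upstream header): attaching a LoRA adapter to a
// context at half strength. "model", "ctx" and the adapter path are assumed; per the note
// above, the adapter itself is loaded before the context is created.
//
//    llama_adapter_lora * lora = llama_adapter_lora_init(model, "adapter.gguf");
//    // ... create ctx ...
//    if (lora != NULL) {
//        llama_set_adapter_lora(ctx, lora, 0.5f);   // scale 0.5
//        // ...
//        llama_rm_adapter_lora(ctx, lora);          // detach when no longer needed
//    }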
674
675 // Apply a loaded control vector to a llama_context, or if data is NULL, clear
676 // the currently loaded vector.
677 // n_embd should be the size of a single layer's control, and data should point
678 // to an n_embd x n_layers buffer starting from layer 1.
679 // il_start and il_end are the layer range the vector should apply to (both inclusive)
680 // See llama_control_vector_load in common to load a control vector.
681 LLAMA_API int32_t llama_apply_adapter_cvec(
682 struct llama_context * ctx,
683 const float * data,
684 size_t len,
685 int32_t n_embd,
686 int32_t il_start,
687 int32_t il_end);
688
689 //
690 // Memory
691 //
692
693 // Clear the memory contents
694 // If data == true, the data buffers will also be cleared together with the metadata
695 LLAMA_API void llama_memory_clear(
696 llama_memory_t mem,
697 bool data);
698
699 // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
700 // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
701 // seq_id < 0 : match any sequence
702 // p0 < 0 : [0, p1]
703 // p1 < 0 : [p0, inf)
704 LLAMA_API bool llama_memory_seq_rm(
705 llama_memory_t mem,
706 llama_seq_id seq_id,
707 llama_pos p0,
708 llama_pos p1);
709
710 // Copy all tokens that belong to the specified sequence to another sequence
711 // p0 < 0 : [0, p1]
712 // p1 < 0 : [p0, inf)
713 LLAMA_API void llama_memory_seq_cp(
714 llama_memory_t mem,
715 llama_seq_id seq_id_src,
716 llama_seq_id seq_id_dst,
717 llama_pos p0,
718 llama_pos p1);
719
720 // Removes all tokens that do not belong to the specified sequence
721 LLAMA_API void llama_memory_seq_keep(
722 llama_memory_t mem,
723 llama_seq_id seq_id);
724
725 // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
726 // p0 < 0 : [0, p1]
727 // p1 < 0 : [p0, inf)
728 LLAMA_API void llama_memory_seq_add(
729 llama_memory_t mem,
730 llama_seq_id seq_id,
731 llama_pos p0,
732 llama_pos p1,
733 llama_pos delta);
734
735 // Integer division of the positions by factor of `d > 1`
736 // p0 < 0 : [0, p1]
737 // p1 < 0 : [p0, inf)
738 LLAMA_API void llama_memory_seq_div(
739 llama_memory_t mem,
740 llama_seq_id seq_id,
741 llama_pos p0,
742 llama_pos p1,
743 int d);
744
745 // Returns the smallest position present in the memory for the specified sequence
746 // This is typically non-zero only for SWA caches
747 // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
748 // Return -1 if the sequence is empty
749 LLAMA_API llama_pos llama_memory_seq_pos_min(
750 llama_memory_t mem,
751 llama_seq_id seq_id);
752
753 // Returns the largest position present in the memory for the specified sequence
754 // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
755 // Return -1 if the sequence is empty
756 LLAMA_API llama_pos llama_memory_seq_pos_max(
757 llama_memory_t mem,
758 llama_seq_id seq_id);
759
760 // Check if the memory supports shifting
761 LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);
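// Example (editor's sketch, not part of the upstream header): dropping the most recent tokens
// of sequence 0, e.g. to rewind the context. "ctx" and "n_keep" are assumed.
//
//    llama_memory_t mem = llama_get_memory(ctx);
//    llama_memory_seq_rm(mem, 0, n_keep, -1);   // remove positions [n_keep, inf) of sequence 0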
762
763 //
764 // State / sessions
765 //
766
767 // Returns the *actual* size in bytes of the state
768 // (logits, embedding and memory)
769 // Only use when saving the state, not when restoring it, otherwise the size may be too small.
770 LLAMA_API size_t llama_state_get_size(struct llama_context * ctx);
771 LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
772 "use llama_state_get_size instead");
773
774 // Copies the state to the specified destination address.
775 // Destination needs to have allocated enough memory.
776 // Returns the number of bytes copied
777 LLAMA_API size_t llama_state_get_data(
778 struct llama_context * ctx,
779 uint8_t * dst,
780 size_t size);
781 LLAMA_API DEPRECATED(size_t llama_copy_state_data(
782 struct llama_context * ctx,
783 uint8_t * dst),
784 "use llama_state_get_data instead");
785
786 // Set the state reading from the specified address
787 // Returns the number of bytes read
788 LLAMA_API size_t llama_state_set_data(
789 struct llama_context * ctx,
790 const uint8_t * src,
791 size_t size);
792 LLAMA_API DEPRECATED(size_t llama_set_state_data(
793 struct llama_context * ctx,
794 const uint8_t * src),
795 "use llama_state_set_data instead");
796
797 // Save/load session file
798 LLAMA_API bool llama_state_load_file(
799 struct llama_context * ctx,
800 const char * path_session,
801 llama_token * tokens_out,
802 size_t n_token_capacity,
803 size_t * n_token_count_out);
804 LLAMA_API DEPRECATED(bool llama_load_session_file(
805 struct llama_context * ctx,
806 const char * path_session,
807 llama_token * tokens_out,
808 size_t n_token_capacity,
809 size_t * n_token_count_out),
810 "use llama_state_load_file instead");
811
812 LLAMA_API bool llama_state_save_file(
813 struct llama_context * ctx,
814 const char * path_session,
815 const llama_token * tokens,
816 size_t n_token_count);
817 LLAMA_API DEPRECATED(bool llama_save_session_file(
818 struct llama_context * ctx,
819 const char * path_session,
820 const llama_token * tokens,
821 size_t n_token_count),
822 "use llama_state_save_file instead");
823
824 // Get the exact size needed to copy the state of a single sequence
825 LLAMA_API size_t llama_state_seq_get_size(
826 struct llama_context * ctx,
827 llama_seq_id seq_id);
828
829 // Copy the state of a single sequence into the specified buffer
830 LLAMA_API size_t llama_state_seq_get_data(
831 struct llama_context * ctx,
832 uint8_t * dst,
833 size_t size,
834 llama_seq_id seq_id);
835
836 // Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence
837 // Returns:
838 // - Positive: Ok
839 // - Zero: Failed to load
840 LLAMA_API size_t llama_state_seq_set_data(
841 struct llama_context * ctx,
842 const uint8_t * src,
843 size_t size,
844 llama_seq_id dest_seq_id);
845
846 LLAMA_API size_t llama_state_seq_save_file(
847 struct llama_context * ctx,
848 const char * filepath,
849 llama_seq_id seq_id,
850 const llama_token * tokens,
851 size_t n_token_count);
852
853 LLAMA_API size_t llama_state_seq_load_file(
854 struct llama_context * ctx,
855 const char * filepath,
856 llama_seq_id dest_seq_id,
857 llama_token * tokens_out,
858 size_t n_token_capacity,
859 size_t * n_token_count_out);
860
861// for backwards-compat
862#define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
863
864// work only with partial states, such as SWA KV cache or recurrent cache (e.g. Mamba)
865#define LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY 1
866
867 typedef uint32_t llama_state_seq_flags;
868
869 LLAMA_API size_t llama_state_seq_get_size_ext(
870 struct llama_context * ctx,
871 llama_seq_id seq_id,
872 llama_state_seq_flags flags);
873
874 LLAMA_API size_t llama_state_seq_get_data_ext(
875 struct llama_context * ctx,
876 uint8_t * dst,
877 size_t size,
878 llama_seq_id seq_id,
879 llama_state_seq_flags flags);
880
881 LLAMA_API size_t llama_state_seq_set_data_ext(
882 struct llama_context * ctx,
883 const uint8_t * src,
884 size_t size,
885 llama_seq_id dest_seq_id,
886 llama_state_seq_flags flags);
887
888 //
889 // Decoding
890 //
891
892 // Return batch for single sequence of tokens
893 // The sequence ID will be fixed to 0
894 // The position of the tokens will be tracked automatically by llama_decode
895 //
896 // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
897 //
898 LLAMA_API struct llama_batch llama_batch_get_one(
899 llama_token * tokens,
900 int32_t n_tokens);
901
902 // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
903 // Each token can be assigned up to n_seq_max sequence ids
904 // The batch has to be freed with llama_batch_free()
905 // If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float)
906 // Otherwise, llama_batch.token will be allocated to store n_tokens llama_token
907 // The rest of the llama_batch members are allocated with size n_tokens
908 // All members are left uninitialized
909 LLAMA_API struct llama_batch llama_batch_init(
910 int32_t n_tokens,
911 int32_t embd,
912 int32_t n_seq_max);
913
914 // Frees a batch of tokens allocated with llama_batch_init()
915 LLAMA_API void llama_batch_free(struct llama_batch batch);
916
917 // Process a batch of tokens.
918 // In contrast to llama_decode() - this call does not use KV cache.
 919 // For encoder-decoder contexts, processes the batch using the encoder.
920 // Can store the encoder output internally for later use by the decoder's cross-attention layers.
921 // 0 - success
922 // < 0 - error. the memory state is restored to the state before this call
923 LLAMA_API int32_t llama_encode(
924 struct llama_context * ctx,
925 struct llama_batch batch);
926
927 // Process a batch of tokens.
928 // Requires the context to have a memory.
 929 // For encoder-decoder contexts, processes the batch using the decoder.
 930 // Positive return values do not mean a fatal error, but rather a warning.
 931 // Upon fatal error or abort, the ubatches that were processed will remain in the memory state of the context
932 // To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max()
933 // Upon other return values, the memory state is restored to the state before this call
934 // 0 - success
935 // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
936 // 2 - aborted (processed ubatches will remain in the context's memory)
937 // -1 - invalid input batch
938 // < -1 - fatal error (processed ubatches will remain in the context's memory)
939 LLAMA_API int32_t llama_decode(
940 struct llama_context * ctx,
941 struct llama_batch batch);
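// Example (editor's sketch, not part of the upstream header): handling the return codes
// documented above. "ctx" and "batch" are assumed.
//
//    const int32_t ret = llama_decode(ctx, batch);
//    if (ret == 1) {
//        // no KV slot was found: reduce the batch size or increase the context and retry
//    } else if (ret != 0) {
//        // aborted (2), invalid batch (-1) or fatal error (< -1)
//    }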
942
943 // Set the number of threads used for decoding
944 // n_threads is the number of threads used for generation (single token)
945 // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
946 LLAMA_API void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch);
947
948 // Get the number of threads used for generation of a single token.
949 LLAMA_API int32_t llama_n_threads(struct llama_context * ctx);
950
 951 // Get the number of threads used for prompt and batch processing (multiple tokens).
952 LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);
953
954 // Set whether the context outputs embeddings or not
955 // TODO: rename to avoid confusion with llama_get_embeddings()
956 LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
957
958 // Set whether to use causal attention or not
959 // If set to true, the model will only attend to the past tokens
960 LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
961
962 // Set whether the model is in warmup mode or not
963 // If true, all model tensors are activated during llama_decode() to load and cache their weights.
964 LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup);
965
966 // Set abort callback
967 LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
968
969 // Wait until all computations are finished
970 // This is automatically done when using one of the functions below to obtain the computation results
 971 // and it is not necessary to call it explicitly in most cases
972 LLAMA_API void llama_synchronize(struct llama_context * ctx);
973
974 // Token logits obtained from the last call to llama_decode()
975 // The logits for which llama_batch.logits[i] != 0 are stored contiguously
976 // in the order they have appeared in the batch.
977 // Rows: number of tokens for which llama_batch.logits[i] != 0
978 // Cols: n_vocab
979 // TODO: deprecate in favor of llama_get_logits_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
980 LLAMA_API float * llama_get_logits(struct llama_context * ctx);
981
 982 // Logits for the ith token. For positive indices, equivalent to:
 983 // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
 984 // Negative indices can be used to access logits in reverse order, -1 is the last logit.
 985 // Returns NULL for invalid ids.
986 LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
987
988 // Get all output token embeddings.
989 // when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
990 // the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
991 // in the order they have appeared in the batch.
992 // shape: [n_outputs*n_embd]
993 // Otherwise, returns NULL.
994 // TODO: deprecate in favor of llama_get_embeddings_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
995 LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
996
 997 // Get the embeddings for the ith token. For positive indices, equivalent to:
 998 // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
 999 // Negative indices can be used to access embeddings in reverse order, -1 is the last embedding.
 1000 // shape: [n_embd] (1-dimensional)
 1001 // Returns NULL for invalid ids.
1002 LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
1003
1004 // Get the embeddings for a sequence id
1005 // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
1006 // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[n_cls_out] with the rank(s) of the sequence
1007 // otherwise: float[n_embd] (1-dimensional)
1008 LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
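// Example (editor's sketch, not part of the upstream header): reading the pooled embedding of
// sequence 0 after llama_decode(), assuming the context was created with embeddings enabled
// and a pooling type other than LLAMA_POOLING_TYPE_NONE.
//
//    const int32_t n_embd = llama_model_n_embd(llama_get_model(ctx));
//    const float * emb    = llama_get_embeddings_seq(ctx, 0);
//    if (emb != NULL) {
//        // emb points to n_embd floats for sequence 0
//    }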
1009
1010 //
1011 // backend sampling API [EXPERIMENTAL]
1012 // note: use only if the llama_context was created with at least one llama_sampler_seq_config
1013 //
1014
1015 // Get the backend sampled token for the ith token.
1016 // Returns LLAMA_TOKEN_NULL if no token was sampled.
1017 LLAMA_API llama_token llama_get_sampled_token_ith(struct llama_context * ctx, int32_t i);
1018
 1019 // Get the backend sampled probabilities for the ith token
 1020 // The index matches llama_get_sampled_token_ith().
 1021 // Returns NULL if no probabilities were generated.
1022 LLAMA_API float * llama_get_sampled_probs_ith (struct llama_context * ctx, int32_t i);
1023 LLAMA_API uint32_t llama_get_sampled_probs_count_ith(struct llama_context * ctx, int32_t i);
1024
1025 // Get the backend sampled logits for the ith token
1026 // Returns NULL if no logits were sampled.
1027 LLAMA_API float * llama_get_sampled_logits_ith (struct llama_context * ctx, int32_t i);
1028 LLAMA_API uint32_t llama_get_sampled_logits_count_ith(struct llama_context * ctx, int32_t i);
1029
1030 // Get the backend sampled candidates (token ids) for the ith token
1031 // These are needed to map probability/logit indices to vocab token ids.
1032 // Returns NULL if no candidates were sampled.
1033 LLAMA_API llama_token * llama_get_sampled_candidates_ith (struct llama_context * ctx, int32_t i);
1034 LLAMA_API uint32_t llama_get_sampled_candidates_count_ith(struct llama_context * ctx, int32_t i);
1035
1036 //
1037 // Vocab
1038 //
1039
1040 LLAMA_API const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token);
1041
1042 LLAMA_API float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token);
1043
1044 LLAMA_API enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token);
1045
 1046 // Check if the token is supposed to end generation (end-of-generation, e.g. EOS, EOT, etc.)
1047 LLAMA_API bool llama_vocab_is_eog(const struct llama_vocab * vocab, llama_token token);
1048
 1049 // Identify if the token id is a control token or a renderable token
1050 LLAMA_API bool llama_vocab_is_control(const struct llama_vocab * vocab, llama_token token);
1051
1052 // Special tokens
1053 LLAMA_API llama_token llama_vocab_bos(const struct llama_vocab * vocab); // beginning-of-sentence
1054 LLAMA_API llama_token llama_vocab_eos(const struct llama_vocab * vocab); // end-of-sentence
1055 LLAMA_API llama_token llama_vocab_eot(const struct llama_vocab * vocab); // end-of-turn
1056 LLAMA_API llama_token llama_vocab_sep(const struct llama_vocab * vocab); // sentence separator
1057 LLAMA_API llama_token llama_vocab_nl (const struct llama_vocab * vocab); // next-line
1058 LLAMA_API llama_token llama_vocab_pad(const struct llama_vocab * vocab); // padding
1059 LLAMA_API llama_token llama_vocab_mask(const struct llama_vocab * vocab); // mask
1060
1061 LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
1062 LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
1063 LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab);
1064
1065 LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
1066 LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
1067 LLAMA_API llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab);
1068 LLAMA_API llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab);
1069 LLAMA_API llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab);
1070 LLAMA_API llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab);
1071
1072 DEPRECATED(LLAMA_API const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_text instead");
1073 DEPRECATED(LLAMA_API float llama_token_get_score(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_score instead");
1074 DEPRECATED(LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_attr instead");
1075 DEPRECATED(LLAMA_API bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_eog instead");
1076 DEPRECATED(LLAMA_API bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_control instead");
1077 DEPRECATED(LLAMA_API llama_token llama_token_bos(const struct llama_vocab * vocab), "use llama_vocab_bos instead");
1078 DEPRECATED(LLAMA_API llama_token llama_token_eos(const struct llama_vocab * vocab), "use llama_vocab_eos instead");
1079 DEPRECATED(LLAMA_API llama_token llama_token_eot(const struct llama_vocab * vocab), "use llama_vocab_eot instead");
1080 DEPRECATED(LLAMA_API llama_token llama_token_cls(const struct llama_vocab * vocab), "use llama_vocab_cls instead");
1081 DEPRECATED(LLAMA_API llama_token llama_token_sep(const struct llama_vocab * vocab), "use llama_vocab_sep instead");
1082 DEPRECATED(LLAMA_API llama_token llama_token_nl (const struct llama_vocab * vocab), "use llama_vocab_nl instead");
1083 DEPRECATED(LLAMA_API llama_token llama_token_pad(const struct llama_vocab * vocab), "use llama_vocab_pad instead");
1084 DEPRECATED(LLAMA_API bool llama_add_bos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_bos instead");
1085 DEPRECATED(LLAMA_API bool llama_add_eos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_eos instead");
1086 DEPRECATED(LLAMA_API llama_token llama_token_fim_pre(const struct llama_vocab * vocab), "use llama_vocab_fim_pre instead");
1087 DEPRECATED(LLAMA_API llama_token llama_token_fim_suf(const struct llama_vocab * vocab), "use llama_vocab_fim_suf instead");
1088 DEPRECATED(LLAMA_API llama_token llama_token_fim_mid(const struct llama_vocab * vocab), "use llama_vocab_fim_mid instead");
1089 DEPRECATED(LLAMA_API llama_token llama_token_fim_pad(const struct llama_vocab * vocab), "use llama_vocab_fim_pad instead");
1090 DEPRECATED(LLAMA_API llama_token llama_token_fim_rep(const struct llama_vocab * vocab), "use llama_vocab_fim_rep instead");
1091 DEPRECATED(LLAMA_API llama_token llama_token_fim_sep(const struct llama_vocab * vocab), "use llama_vocab_fim_sep instead");
1092
1093 // CLS is equivalent to BOS
1094 DEPRECATED(LLAMA_API llama_token llama_vocab_cls(const struct llama_vocab * vocab), // classification
1095 "use llama_vocab_bos instead");
1096
1097 //
1098 // Tokenization
1099 //
1100 // The API is thread-safe.
1101 //
1102
1103 /// @details Convert the provided text into tokens.
1104 /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
1105 /// @return Returns the number of tokens on success, no more than n_tokens_max
1106 /// @return Returns a negative number on failure - the number of tokens that would have been returned
1107 /// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit)
 1108 /// @param add_special Allow adding BOS and EOS tokens if the model is configured to do so.
1109 /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
1110 /// as plaintext. Does not insert a leading space.
1111 LLAMA_API int32_t llama_tokenize(
1112 const struct llama_vocab * vocab,
1113 const char * text,
1114 int32_t text_len,
1115 llama_token * tokens,
1116 int32_t n_tokens_max,
1117 bool add_special,
1118 bool parse_special);
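// Example (editor's sketch, not part of the upstream header): tokenizing a prompt with a retry
// when the initial buffer is too small (a negative return is minus the required token count).
// "vocab" and "text" are assumed inputs.
//
//    std::vector<llama_token> toks(32);
//    int32_t n = llama_tokenize(vocab, text, (int32_t) strlen(text),
//                               toks.data(), (int32_t) toks.size(),
//                               /*add_special*/ true, /*parse_special*/ false);
//    if (n < 0) {
//        toks.resize(-n);
//        n = llama_tokenize(vocab, text, (int32_t) strlen(text),
//                           toks.data(), (int32_t) toks.size(), true, false);
//    }
//    toks.resize(n);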
1119
1120 // Token Id -> Piece.
1121 // Uses the vocabulary in the provided context.
1122 // Does not write null terminator to the buffer.
1123 // User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')
1124 // @param special If true, special tokens are rendered in the output.
1125 LLAMA_API int32_t llama_token_to_piece(
1126 const struct llama_vocab * vocab,
1127 llama_token token,
1128 char * buf,
1129 int32_t length,
1130 int32_t lstrip,
1131 bool special);
1132
1133 /// @details Convert the provided tokens into text (inverse of llama_tokenize()).
1134 /// @param text The char pointer must be large enough to hold the resulting text.
1135 /// @return Returns the number of chars/bytes on success, no more than text_len_max.
1136 /// @return Returns a negative number on failure - the number of chars/bytes that would have been returned.
 1137 /// @param remove_special Allow removing BOS and EOS tokens if the model is configured to do so.
1138 /// @param unparse_special If true, special tokens are rendered in the output.
1139 LLAMA_API int32_t llama_detokenize(
1140 const struct llama_vocab * vocab,
1141 const llama_token * tokens,
1142 int32_t n_tokens,
1143 char * text,
1144 int32_t text_len_max,
1145 bool remove_special,
1146 bool unparse_special);
1147
1148 //
1149 // Chat templates
1150 //
1151
 1152 /// Apply chat template. Inspired by hf apply_chat_template() in Python.
 1153 /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
 1154 /// NOTE: This function does not use a jinja parser. It only supports a pre-defined list of templates. See more: https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
1155 /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
1156 /// @param chat Pointer to a list of multiple llama_chat_message
1157 /// @param n_msg Number of llama_chat_message in this chat
1158 /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message.
1159 /// @param buf A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages)
1160 /// @param length The size of the allocated buffer
 1161 /// @return The total number of bytes of the formatted prompt. If it is larger than the size of the buffer, you may need to re-alloc it and then re-apply the template.
1162 LLAMA_API int32_t llama_chat_apply_template(
1163 const char * tmpl,
1164 const struct llama_chat_message * chat,
1165 size_t n_msg,
1166 bool add_ass,
1167 char * buf,
1168 int32_t length);
1169
1170 // Get list of built-in chat templates
1171 LLAMA_API int32_t llama_chat_builtin_templates(const char ** output, size_t len);
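// Example (editor's sketch, not part of the upstream header): applying the model's default chat
// template with a grow-and-retry buffer. "model", "msgs" and "n_msgs" are assumed.
//
//    const char * tmpl = llama_model_chat_template(model, /*name*/ NULL);
//    std::vector<char> buf(1024);
//    int32_t n = llama_chat_apply_template(tmpl, msgs, n_msgs, /*add_ass*/ true,
//                                          buf.data(), (int32_t) buf.size());
//    if (n > (int32_t) buf.size()) {
//        buf.resize(n);
//        n = llama_chat_apply_template(tmpl, msgs, n_msgs, true, buf.data(), (int32_t) buf.size());
//    }
//    // buf now holds n bytes of formatted prompt (not necessarily null-terminated)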
1172
1173 //
1174 // Sampling API
1175 //
1176 // Sample usage:
1177 //
1178 // // prepare the sampling chain at the start
1179 // auto sparams = llama_sampler_chain_default_params();
1180 //
1181 // llama_sampler * smpl = llama_sampler_chain_init(sparams);
1182 //
1183 // llama_sampler_chain_add(smpl, llama_sampler_init_top_k(50));
1184 // llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9, 1));
1185 // llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.8));
1186 //
1187 // // typically, the chain should end with a sampler such as "greedy", "dist" or "mirostat"
1188 // // this sampler will be responsible for selecting the actual token
1189 // llama_sampler_chain_add(smpl, llama_sampler_init_dist(seed));
1190 //
1191 // ...
1192 //
1193 // // decoding loop:
1194 // while (...) {
1195 // ...
1196 //
1197 // llama_decode(ctx, batch);
1198 //
1199 // // sample from the logits of the last token in the batch
1200 // const llama_token id = llama_sampler_sample(smpl, ctx, -1);
1201 //
1202 // ...
1203 // }
1204 //
1205 // llama_sampler_free(smpl);
1206 //
1207
1208 typedef void * llama_sampler_context_t;
1209
1210 struct llama_sampler_data {
1211 struct ggml_tensor * logits;
1212 struct ggml_tensor * probs;
1213 struct ggml_tensor * sampled;
1214 struct ggml_tensor * candidates;
1215 };
1216
1217 // user code can implement the interface below in order to create custom llama_sampler
1218 struct llama_sampler_i {
1219 const char * (*name) (const struct llama_sampler * smpl); // can be NULL
1220 void (*accept)( struct llama_sampler * smpl, llama_token token); // can be NULL
1221 void (*apply) ( struct llama_sampler * smpl, llama_token_data_array * cur_p); // required
1222 void (*reset) ( struct llama_sampler * smpl); // can be NULL
1223 struct llama_sampler * (*clone) (const struct llama_sampler * smpl); // can be NULL if ctx is NULL
1224 void (*free) ( struct llama_sampler * smpl); // can be NULL if ctx is NULL
1225
1226 // [EXPERIMENTAL]
1227 // backend sampling interface:
1228
1229 // return true if the backend supports all ops needed by the sampler
1230 // note: call once per sampler
1231 bool (*backend_init)(struct llama_sampler * smpl, ggml_backend_buffer_type_t buft);
1232
1233 // call after .backend_apply()
1234 void (*backend_accept)(
1235 struct llama_sampler * smpl,
1236 struct ggml_context * ctx,
1237 struct ggml_cgraph * gf,
1238 struct ggml_tensor * selected_token);
1239
1240 // call after .backend_init()
1241 void (*backend_apply)(
1242 struct llama_sampler * smpl,
1243 struct ggml_context * ctx,
1244 struct ggml_cgraph * gf,
1245 struct llama_sampler_data * data);
1246
1247 // called before graph execution to set inputs for the current ubatch
1248 void (*backend_set_input)(struct llama_sampler * smpl);
1249 };
1250
1251 struct llama_sampler {
1252 struct llama_sampler_i * iface;
1253
1254 llama_sampler_context_t ctx;
1255 };
1256
1257 // [EXPERIMENTAL]
1258 // attach a sampler to the context
1259 // note: prefer initializing the context with llama_context_params.samplers when possible
1260 LLAMA_API bool llama_set_sampler(struct llama_context * ctx, llama_seq_id seq_id, struct llama_sampler * smpl);
1261
1262 // mirror of llama_sampler_i:
1263 LLAMA_API struct llama_sampler * llama_sampler_init ( struct llama_sampler_i * iface, llama_sampler_context_t ctx);
1264 LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl);
1265 LLAMA_API void llama_sampler_accept( struct llama_sampler * smpl, llama_token token);
1266 LLAMA_API void llama_sampler_apply ( struct llama_sampler * smpl, llama_token_data_array * cur_p);
1267 LLAMA_API void llama_sampler_reset ( struct llama_sampler * smpl);
1268 LLAMA_API struct llama_sampler * llama_sampler_clone (const struct llama_sampler * smpl);
1269 // important: do not free if the sampler has been added to a llama_sampler_chain (via llama_sampler_chain_add)
1270 LLAMA_API void llama_sampler_free ( struct llama_sampler * smpl);
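
    // Example: a minimal custom sampler built on llama_sampler_i and llama_sampler_init
    // (illustrative sketch; the "my_*" names are hypothetical). Only the required .apply
    // callback is implemented; the remaining optional callbacks are left NULL:
    //
    //     static const char * my_name(const struct llama_sampler * smpl) { (void) smpl; return "my-sampler"; }
    //
    //     static void my_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
    //         (void) smpl;
    //         if (cur_p->size > 1) { cur_p->size = 1; } // example transform: keep only the first candidate
    //     }
    //
    //     static struct llama_sampler_i my_iface = {
    //         /*.name   =*/ my_name,
    //         /*.accept =*/ NULL,
    //         /*.apply  =*/ my_apply,
    //         /*.reset  =*/ NULL,
    //         /*.clone  =*/ NULL,
    //         /*.free   =*/ NULL,
    //         // backend callbacks omitted (zero-initialized)
    //     };
    //
    //     struct llama_sampler * smpl = llama_sampler_init(&my_iface, /*ctx=*/ NULL);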
1271
1272 // llama_sampler_chain
1273 // a type of llama_sampler that can chain multiple samplers one after another
1274
1275 LLAMA_API struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params);
1276
1277 // important: takes ownership of the sampler object and will free it when llama_sampler_free is called
1278 LLAMA_API void llama_sampler_chain_add( struct llama_sampler * chain, struct llama_sampler * smpl);
1279
1280 // return NULL if:
1281 // - the sampler is NULL
1282 // - the sampler is not a llama_sampler_chain
1283 // - the index is out of bounds (unless i == -1)
1284 // note: if i == -1, the chain itself is returned (can be used to check if the sampler is a chain)
1285 LLAMA_API struct llama_sampler * llama_sampler_chain_get( struct llama_sampler * chain, int32_t i);
1286
1287 // the total number of samplers in the chain
1288 LLAMA_API int llama_sampler_chain_n (const struct llama_sampler * chain);
1289
1290 // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
1291 LLAMA_API struct llama_sampler * llama_sampler_chain_remove( struct llama_sampler * chain, int32_t i);
1292
1293 // available samplers:
1294
1295 LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
1296
1297 /// seed == LLAMA_DEFAULT_SEED to use a random seed.
1298 LLAMA_API struct llama_sampler * llama_sampler_init_dist(uint32_t seed);
1299
1300 /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
1301 /// Setting k <= 0 makes this a noop
1302 LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
1303
1304 /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
1305 LLAMA_API struct llama_sampler * llama_sampler_init_top_p (float p, size_t min_keep);
1306
1307 /// @details Minimum P sampling as described in https://github.com/ggml-org/llama.cpp/pull/3841
1308 LLAMA_API struct llama_sampler * llama_sampler_init_min_p (float p, size_t min_keep);
1309
1310 /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
1311 LLAMA_API struct llama_sampler * llama_sampler_init_typical (float p, size_t min_keep);
1312
1313 /// @details Updates the logits: l_i' = l_i/t. When t <= 0.0f, the maximum logit is kept at its original value, the rest are set to -inf
1314 LLAMA_API struct llama_sampler * llama_sampler_init_temp (float t);
1315
1316 /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
1317 LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent);
1318
1319 /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
1320 LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed);
1321
1322 /// @details Top n sigma sampling as described in academic paper "Top-nσ: Not All Logits Are You Need" https://arxiv.org/pdf/2411.07641
1323 LLAMA_API struct llama_sampler * llama_sampler_init_top_n_sigma(float n);
1324
1325 /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
1326 /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
1327 /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
1328 /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
1329 /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
1330 /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
1331 LLAMA_API struct llama_sampler * llama_sampler_init_mirostat(
1332 int32_t n_vocab,
1333 uint32_t seed,
1334 float tau,
1335 float eta,
1336 int32_t m);
1337
1338 /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
1339 /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
1340 /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
1341 /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
1342 /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
1343 LLAMA_API struct llama_sampler * llama_sampler_init_mirostat_v2(
1344 uint32_t seed,
1345 float tau,
1346 float eta);
1347
1348 /// @details Initializes a GBNF grammar, see grammars/README.md for details.
1349 /// @param vocab The vocabulary that this grammar will be used with.
1350 /// @param grammar_str The production rules for the grammar, encoded as a string. An empty string yields an empty grammar. Returns NULL if parsing of grammar_str fails.
1351 /// @param grammar_root The name of the start symbol for the grammar.
1352 LLAMA_API struct llama_sampler * llama_sampler_init_grammar(
1353 const struct llama_vocab * vocab,
1354 const char * grammar_str,
1355 const char * grammar_root);
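
    // Example: constrain generation with a small GBNF grammar (illustrative sketch;
    // the grammar string is a placeholder, see grammars/README.md for the syntax):
    //
    //     const char * grammar_str = "root ::= \"yes\" | \"no\"";
    //     struct llama_sampler * grmr = llama_sampler_init_grammar(vocab, grammar_str, "root");
    //     if (grmr == NULL) {
    //         // grammar_str failed to parse
    //     } else {
    //         llama_sampler_chain_add(chain, grmr); // chain: an existing llama_sampler_chain
    //     }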
1356
1357 DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy(
1358 const struct llama_vocab * vocab,
1359 const char * grammar_str,
1360 const char * grammar_root,
1361 const char ** trigger_words,
1362 size_t num_trigger_words,
1363 const llama_token * trigger_tokens,
1364 size_t num_trigger_tokens),
1365 "use llama_sampler_init_grammar_lazy_patterns instead");
1366
1367
1368 /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639
1369 /// @param trigger_patterns A list of patterns that will trigger the grammar sampler. Patterns are matched from the start of the generation output, and the grammar sampler is fed content starting from its first match group.
1370 /// @param trigger_tokens A list of tokens that will trigger the grammar sampler. The grammar sampler is fed content starting from the trigger token (inclusive).
1371 LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy_patterns(
1372 const struct llama_vocab * vocab,
1373 const char * grammar_str,
1374 const char * grammar_root,
1375 const char ** trigger_patterns,
1376 size_t num_trigger_patterns,
1377 const llama_token * trigger_tokens,
1378 size_t num_trigger_tokens);
1379
1380
1381 /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
1382 LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
1383 int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size)
1384 float penalty_repeat, // 1.0 = disabled
1385 float penalty_freq, // 0.0 = disabled
1386 float penalty_present); // 0.0 = disabled
1387
1388 /// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
1389 LLAMA_API struct llama_sampler * llama_sampler_init_dry(
1390 const struct llama_vocab * vocab,
1391 int32_t n_ctx_train,
1392 float dry_multiplier,
1393 float dry_base,
1394 int32_t dry_allowed_length,
1395 int32_t dry_penalty_last_n,
1396 const char ** seq_breakers,
1397 size_t num_breakers);
1398
1399 /// adaptive-p: select tokens near a configurable target probability over time.
1400 ///
1401 /// the adaptive-p sampler transforms the token probability distribution to favor tokens
1402 /// that fall near a user-configurable probability target.
1403 ///
1404 /// internally, the sampler maintains an exponential moving average of the *ORIGINAL*
1405 /// probabilities of selected tokens at each sampling step. it uses this EMA to compute an
1406 /// adapted target probability at each sampling step, thus maintaining the desired target
1407 /// probability over time.
1408 ///
1409 /// adaptive-p selects a token ID rather than just mutating candidates, so it must be last
1410 /// in the sampler chain (like mirostat, dist, greedy).
1411 ///
1412 /// only mild truncation before this sampler is recommended. we suggest applying min-p
1413 /// before adaptive-p as the only other active sampler in the chain.
1414 ///
1415 /// @param target select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
1416 /// @param decay EMA decay for adaptation; history ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99)
1417 /// @param seed RNG seed
1418 ///
1419 /// ref: https://github.com/ggml-org/llama.cpp/pull/17927
1420 ///
1421 LLAMA_API struct llama_sampler * llama_sampler_init_adaptive_p(
1422 float target,
1423 float decay,
1424 uint32_t seed);
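
    // Example: the chain layout suggested above - mild min-p truncation followed by
    // adaptive-p as the final, token-selecting sampler (illustrative sketch; the
    // parameter values are assumptions):
    //
    //     llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
    //
    //     llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.05f, 1));
    //     llama_sampler_chain_add(smpl, llama_sampler_init_adaptive_p(/*target=*/ 0.5f, /*decay=*/ 0.9f, LLAMA_DEFAULT_SEED));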
1425
1426 LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
1427 int32_t n_vocab,
1428 int32_t n_logit_bias,
1429 const llama_logit_bias * logit_bias);
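
    // Example: bias or ban specific tokens (illustrative sketch; the token ids are
    // placeholders and n_vocab stands for the vocabulary size):
    //
    //     llama_logit_bias biases[] = {
    //         { /*token=*/ 42, /*bias=*/  2.0f },       // make token 42 more likely
    //         { /*token=*/ 77, /*bias=*/ -INFINITY },   // never sample token 77
    //     };
    //     llama_sampler_chain_add(smpl, llama_sampler_init_logit_bias(n_vocab, 2, biases));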
1430
1431 // this sampler is meant to be used for fill-in-the-middle infilling
1432 // it's supposed to be used after top_k + top_p sampling
1433 //
1434 // 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG
1435 // 2. combine probs of tokens that have the same prefix
1436 //
1437 // example:
1438 //
1439 // - before:
1440 // "hel": 0.5
1441 // "hell": 0.2
1442 // "hello": 0.1
1443 // "dummy": 0.1
1444 //
1445 // - after:
1446 // "hel": 0.8
1447 // "dummy": 0.1
1448 //
1449 // 3. discard non-EOG tokens with low prob
1450 // 4. if no tokens are left -> pick EOT
1451 //
1452 LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab);
1453
1454 // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
1455 LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
1456
1457 /// @details Sample and accept a token from the idx-th output of the last evaluation
1458 //
1459 // Shorthand for:
1460 // const auto * logits = llama_get_logits_ith(ctx, idx);
1461 // llama_token_data_array cur_p = { ... init from logits ... };
1462 // llama_sampler_apply(smpl, &cur_p);
1463 // auto token = cur_p.data[cur_p.selected].id;
1464 // llama_sampler_accept(smpl, token);
1465 // return token;
1466 // Returns the sampled token
1467 LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx);
1468
1469 // TODO: extend in the future
1470 //LLAMA_API void llama_decode_with_sampler(struct llama_context * ctx, struct llama_sampler * smpl, struct llama_batch batch, ...);
1471
1472 //
1473 // Model split
1474 //
1475
1476 /// @details Build a split GGUF final path for this chunk.
1477 /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
1478 // Returns the split_path length.
1479 LLAMA_API int32_t llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int32_t split_no, int32_t split_count);
1480
1481 /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
1482 /// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
1483 // Returns the split_prefix length.
1484 LLAMA_API int32_t llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int32_t split_no, int32_t split_count);
1485
1486 // Print system information
1487 LLAMA_API const char * llama_print_system_info(void);
1488
1489 // Set callback for all future logging events.
1490 // If this is not called, or NULL is supplied, everything is output on stderr.
1491 // The logger state is global so these functions are NOT thread safe.
1492 LLAMA_API void llama_log_get(ggml_log_callback * log_callback, void ** user_data);
1493 LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
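
    // Example: route all llama.cpp log output through a user callback (illustrative
    // sketch; my_log is a hypothetical function matching the ggml_log_callback signature):
    //
    //     static void my_log(enum ggml_log_level level, const char * text, void * user_data) {
    //         (void) level; (void) user_data;
    //         fputs(text, stdout);
    //     }
    //
    //     llama_log_set(my_log, NULL);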
1494
1495 //
1496 // Performance utils
1497 //
1498 // NOTE: Used by llama.cpp examples/tools, avoid using in third-party apps. Instead, do your own performance measurements.
1499 //
1500
1501 struct llama_perf_context_data {
1502 // ms == milliseconds
1503 double t_start_ms; // absolute start time
1504 double t_load_ms; // time needed for loading the model
1505 double t_p_eval_ms; // time needed for processing the prompt
1506 double t_eval_ms; // time needed for generating tokens
1507
1508 int32_t n_p_eval; // number of prompt tokens
1509 int32_t n_eval; // number of generated tokens
1510 int32_t n_reused; // number of times a ggml compute graph has been reused
1511 };
1512
1513 struct llama_perf_sampler_data {
1514 double t_sample_ms; // time needed for sampling in ms
1515
1516 int32_t n_sample; // number of sampled tokens
1517 };
1518
1519 LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx);
1520 LLAMA_API void llama_perf_context_print(const struct llama_context * ctx);
1521 LLAMA_API void llama_perf_context_reset( struct llama_context * ctx);
1522
1523 // NOTE: the following work only with samplers constructed via llama_sampler_chain_init
1524 LLAMA_API struct llama_perf_sampler_data llama_perf_sampler (const struct llama_sampler * chain);
1525 LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
1526 LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
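
    // Example: report and reset timings at the end of a run (illustrative sketch;
    // smpl must be a chain created with llama_sampler_chain_init):
    //
    //     llama_perf_sampler_print(smpl);
    //     llama_perf_context_print(ctx);
    //     llama_perf_context_reset(ctx);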
1527
1528 // print a breakdown of per-device memory use via LLAMA_LOG:
1529 LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx);
1530
1531 //
1532 // training
1533 //
1534
1535 // function that returns whether or not a given tensor contains trainable parameters
1536 typedef bool (*llama_opt_param_filter)(const struct ggml_tensor * tensor, void * userdata);
1537
1538 // always returns true
1539 LLAMA_API bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata);
1540
1541 struct llama_opt_params {
1542 uint32_t n_ctx_train; // assumed context size after training; if 0, the context size specified in llama_context is used
1543
1544 llama_opt_param_filter param_filter; // callback for determining which tensors contain trainable parameters
1545 void * param_filter_ud; // userdata for determining which tensors contain trainable parameters
1546
1547 ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
1548 void * get_opt_pars_ud; // userdata for calculating optimizer parameters
1549
1550 enum ggml_opt_optimizer_type optimizer_type;
1551 };
1552
1553 LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
1554
1555 LLAMA_API void llama_opt_epoch(
1556 struct llama_context * lctx,
1557 ggml_opt_dataset_t dataset,
1558 ggml_opt_result_t result_train,
1559 ggml_opt_result_t result_eval,
1560 int64_t idata_split,
1561 ggml_opt_epoch_callback callback_train,
1562 ggml_opt_epoch_callback callback_eval);
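
    // Example: set up a context for training with every tensor trainable (illustrative
    // sketch; the ggml-opt names below are assumptions from ggml-opt.h and the dataset
    // setup is not shown):
    //
    //     struct llama_opt_params opt_params = {
    //         /*.n_ctx_train     =*/ 0, // use the context size of lctx
    //         /*.param_filter    =*/ llama_opt_param_filter_all,
    //         /*.param_filter_ud =*/ NULL,
    //         /*.get_opt_pars    =*/ ggml_opt_get_default_optimizer_params, // assumed ggml-opt helper
    //         /*.get_opt_pars_ud =*/ NULL,
    //         /*.optimizer_type  =*/ GGML_OPT_OPTIMIZER_TYPE_ADAMW,         // assumed ggml-opt enum value
    //     };
    //
    //     llama_opt_init(lctx, model, opt_params);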
1563
1564#ifdef __cplusplus
1565}
1566#endif
1567
1568#endif // LLAMA_H