#ifndef LLAMA_H
#define LLAMA_H

#include "ggml.h"
#include "ggml-cpu.h"
#include "ggml-backend.h"
#include "ggml-opt.h"

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

#ifdef LLAMA_SHARED
#    if defined(_WIN32) && !defined(__MINGW32__)
#        ifdef LLAMA_BUILD
#            define LLAMA_API __declspec(dllexport)
#        else
#            define LLAMA_API __declspec(dllimport)
#        endif
#    else
#        define LLAMA_API __attribute__ ((visibility ("default")))
#    endif
#else
#    define LLAMA_API
#endif

#ifdef __GNUC__
#    define DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
#elif defined(_MSC_VER)
#    define DEPRECATED(func, hint) __declspec(deprecated(hint)) func
#else
#    define DEPRECATED(func, hint) func
#endif

#define LLAMA_DEFAULT_SEED 0xFFFFFFFF

#define LLAMA_TOKEN_NULL -1

#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
#define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'

#define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
#define LLAMA_SESSION_VERSION 9

#define LLAMA_STATE_SEQ_MAGIC   LLAMA_FILE_MAGIC_GGSQ
#define LLAMA_STATE_SEQ_VERSION 2

#ifdef __cplusplus
extern "C" {
#endif

    //
    // C interface
    //
    // TODO: show sample usage
    //
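    // A minimal usage sketch (illustrative only; error handling is omitted and
    // "model.gguf" is a placeholder path):
    //
    //     llama_backend_init();
    //
    //     struct llama_model   * model = llama_model_load_from_file("model.gguf", llama_model_default_params());
    //     struct llama_context * ctx   = llama_init_from_model(model, llama_context_default_params());
    //
    //     // ... tokenize the prompt, call llama_decode(), read logits, sample ...
    //
    //     llama_free(ctx);
    //     llama_model_free(model);
    //     llama_backend_free();
    //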

    struct llama_vocab;
    struct llama_model;
    struct llama_context;
    struct llama_sampler;

    typedef struct llama_memory_i * llama_memory_t;

    typedef int32_t llama_pos;
    typedef int32_t llama_token;
    typedef int32_t llama_seq_id;

    enum llama_vocab_type {
        LLAMA_VOCAB_TYPE_NONE   = 0, // For models without vocab
        LLAMA_VOCAB_TYPE_SPM    = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
        LLAMA_VOCAB_TYPE_BPE    = 2, // GPT-2 tokenizer based on byte-level BPE
        LLAMA_VOCAB_TYPE_WPM    = 3, // BERT tokenizer based on WordPiece
        LLAMA_VOCAB_TYPE_UGM    = 4, // T5 tokenizer based on Unigram
        LLAMA_VOCAB_TYPE_RWKV   = 5, // RWKV tokenizer based on greedy tokenization
        LLAMA_VOCAB_TYPE_PLAMO2 = 6, // PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming
    };

    enum llama_rope_type {
        LLAMA_ROPE_TYPE_NONE   = -1,
        LLAMA_ROPE_TYPE_NORM   = 0,
        LLAMA_ROPE_TYPE_NEOX   = GGML_ROPE_TYPE_NEOX,
        LLAMA_ROPE_TYPE_MROPE  = GGML_ROPE_TYPE_MROPE,
        LLAMA_ROPE_TYPE_IMROPE = GGML_ROPE_TYPE_IMROPE,
        LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION,
    };

    enum llama_token_type { // TODO: remove, required until per token attributes are available from GGUF file
        LLAMA_TOKEN_TYPE_UNDEFINED    = 0,
        LLAMA_TOKEN_TYPE_NORMAL       = 1,
        LLAMA_TOKEN_TYPE_UNKNOWN      = 2,
        LLAMA_TOKEN_TYPE_CONTROL      = 3,
        LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
        LLAMA_TOKEN_TYPE_UNUSED       = 5,
        LLAMA_TOKEN_TYPE_BYTE         = 6,
    };

    enum llama_token_attr {
        LLAMA_TOKEN_ATTR_UNDEFINED    = 0,
        LLAMA_TOKEN_ATTR_UNKNOWN      = 1 << 0,
        LLAMA_TOKEN_ATTR_UNUSED       = 1 << 1,
        LLAMA_TOKEN_ATTR_NORMAL       = 1 << 2,
        LLAMA_TOKEN_ATTR_CONTROL      = 1 << 3,  // SPECIAL?
        LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 4,
        LLAMA_TOKEN_ATTR_BYTE         = 1 << 5,
        LLAMA_TOKEN_ATTR_NORMALIZED   = 1 << 6,
        LLAMA_TOKEN_ATTR_LSTRIP       = 1 << 7,
        LLAMA_TOKEN_ATTR_RSTRIP       = 1 << 8,
        LLAMA_TOKEN_ATTR_SINGLE_WORD  = 1 << 9,
    };

    // model file types
    enum llama_ftype {
        LLAMA_FTYPE_ALL_F32              = 0,
        LLAMA_FTYPE_MOSTLY_F16           = 1,  // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_0          = 2,  // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_1          = 3,  // except 1d tensors
        // LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4,  // tok_embeddings.weight and output.weight are F16
        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5,  // support has been removed
        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6,  // support has been removed
        LLAMA_FTYPE_MOSTLY_Q8_0          = 7,  // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q5_0          = 8,  // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q5_1          = 9,  // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q2_K          = 10, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q3_K_S        = 11, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q3_K_M        = 12, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q3_K_L        = 13, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_K_S        = 14, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_K_M        = 15, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q6_K          = 18, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ2_XXS       = 19, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ2_XS        = 20, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q2_K_S        = 21, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ3_XS        = 22, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ3_XXS       = 23, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ1_S         = 24, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ4_NL        = 25, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ3_S         = 26, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ3_M         = 27, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ2_S         = 28, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ2_M         = 29, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ4_XS        = 30, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ1_M         = 31, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_BF16          = 32, // except 1d tensors
        //LLAMA_FTYPE_MOSTLY_Q4_0_4_4      = 33, // removed from gguf files, use Q4_0 and runtime repack
        //LLAMA_FTYPE_MOSTLY_Q4_0_4_8      = 34, // removed from gguf files, use Q4_0 and runtime repack
        //LLAMA_FTYPE_MOSTLY_Q4_0_8_8      = 35, // removed from gguf files, use Q4_0 and runtime repack
        LLAMA_FTYPE_MOSTLY_TQ1_0         = 36, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_TQ2_0         = 37, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_MXFP4_MOE     = 38, // except 1d tensors

        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
    };

    enum llama_rope_scaling_type {
        LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1,
        LLAMA_ROPE_SCALING_TYPE_NONE        = 0,
        LLAMA_ROPE_SCALING_TYPE_LINEAR      = 1,
        LLAMA_ROPE_SCALING_TYPE_YARN        = 2,
        LLAMA_ROPE_SCALING_TYPE_LONGROPE    = 3,
        LLAMA_ROPE_SCALING_TYPE_MAX_VALUE   = LLAMA_ROPE_SCALING_TYPE_LONGROPE,
    };

    enum llama_pooling_type {
        LLAMA_POOLING_TYPE_UNSPECIFIED = -1,
        LLAMA_POOLING_TYPE_NONE = 0,
        LLAMA_POOLING_TYPE_MEAN = 1,
        LLAMA_POOLING_TYPE_CLS  = 2,
        LLAMA_POOLING_TYPE_LAST = 3,
        LLAMA_POOLING_TYPE_RANK = 4, // used by reranking models to attach the classification head to the graph
    };

    enum llama_attention_type {
        LLAMA_ATTENTION_TYPE_UNSPECIFIED = -1,
        LLAMA_ATTENTION_TYPE_CAUSAL      = 0,
        LLAMA_ATTENTION_TYPE_NON_CAUSAL  = 1,
    };

    enum llama_flash_attn_type {
        LLAMA_FLASH_ATTN_TYPE_AUTO     = -1,
        LLAMA_FLASH_ATTN_TYPE_DISABLED = 0,
        LLAMA_FLASH_ATTN_TYPE_ENABLED  = 1,
    };

    LLAMA_API const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_type);

    enum llama_split_mode {
        LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
        LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
        LLAMA_SPLIT_MODE_ROW   = 2, // split layers and KV across GPUs, use tensor parallelism if supported
    };

    // TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)
    typedef struct llama_token_data {
        llama_token id; // token id
        float logit;    // log-odds of the token
        float p;        // probability of the token
    } llama_token_data;

    typedef struct llama_token_data_array {
        // TODO: consider SoA
        // NOTE: this pointer can be modified by the samplers
        llama_token_data * data;
        size_t size;
        int64_t selected; // this is the index in the data array (i.e. not the token id)
        bool sorted;      // note: do not assume the data is sorted - always check this flag
    } llama_token_data_array;
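
    // Illustrative sketch: building a candidate array from raw logits before sampling
    // (`logits` and `n_vocab` are assumed to be provided by the caller):
    //
    //     llama_token_data * cand = malloc(n_vocab * sizeof(llama_token_data));
    //     for (llama_token id = 0; id < n_vocab; id++) {
    //         cand[id] = (llama_token_data) { id, logits[id], 0.0f };
    //     }
    //     llama_token_data_array arr = { cand, (size_t) n_vocab, /*selected =*/ -1, /*sorted =*/ false };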

    typedef bool (*llama_progress_callback)(float progress, void * user_data);

    // Input data for llama_encode/llama_decode
    // A llama_batch object can contain input about one or many sequences
    // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
    //
    // - token  : the token ids of the input (used when embd is NULL)
    // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
    // - pos    : the positions of the respective token in the sequence
    //            (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode)
    // - seq_id : the sequence to which the respective token belongs
    //            (if set to NULL, the sequence ID will be assumed to be 0)
    // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
    //            (if set to NULL:
    //               - if embeddings: all tokens are output
    //               - if not:        only the last token is output
    //            )
    //
    typedef struct llama_batch {
        int32_t n_tokens;

        llama_token  *  token;
        float        *  embd;
        llama_pos    *  pos;
        int32_t      *  n_seq_id;
        llama_seq_id ** seq_id;
        int8_t       *  logits;   // TODO: rename this to "output"
    } llama_batch;
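
    // Illustrative sketch: filling a batch for a single sequence, where `prompt_tokens`
    // and `n_prompt` are hypothetical caller-provided values; all tokens go to sequence 0
    // and only the last one requests logits (see llama_batch_init() below):
    //
    //     struct llama_batch batch = llama_batch_init(n_prompt, 0, 1);
    //     batch.n_tokens = n_prompt;
    //     for (int32_t i = 0; i < n_prompt; i++) {
    //         batch.token   [i]    = prompt_tokens[i];
    //         batch.pos     [i]    = i;
    //         batch.n_seq_id[i]    = 1;
    //         batch.seq_id  [i][0] = 0;
    //         batch.logits  [i]    = (i == n_prompt - 1);
    //     }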

    enum llama_model_kv_override_type {
        LLAMA_KV_OVERRIDE_TYPE_INT,
        LLAMA_KV_OVERRIDE_TYPE_FLOAT,
        LLAMA_KV_OVERRIDE_TYPE_BOOL,
        LLAMA_KV_OVERRIDE_TYPE_STR,
    };

    enum llama_model_meta_key {
        LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE,
        LLAMA_MODEL_META_KEY_SAMPLING_TOP_K,
        LLAMA_MODEL_META_KEY_SAMPLING_TOP_P,
        LLAMA_MODEL_META_KEY_SAMPLING_MIN_P,
        LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY,
        LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD,
        LLAMA_MODEL_META_KEY_SAMPLING_TEMP,
        LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N,
        LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT,
        LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT,
        LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU,
        LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA,
    };

    struct llama_model_kv_override {
        enum llama_model_kv_override_type tag;

        char key[128];

        union {
            int64_t val_i64;
            double  val_f64;
            bool    val_bool;
            char    val_str[128];
        };
    };
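
    // Illustrative sketch of a metadata override, assuming the list passed via
    // llama_model_params.kv_overrides is terminated by an element whose key is an
    // empty string:
    //
    //     struct llama_model_kv_override kvo[2] = {0};
    //     snprintf(kvo[0].key, sizeof(kvo[0].key), "tokenizer.ggml.add_bos_token");
    //     kvo[0].tag      = LLAMA_KV_OVERRIDE_TYPE_BOOL;
    //     kvo[0].val_bool = false;
    //     // kvo[1] is zeroed - its empty key terminates the list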

    struct llama_model_tensor_buft_override {
        const char * pattern;
        ggml_backend_buffer_type_t buft;
    };

    struct llama_model_params {
        // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
        ggml_backend_dev_t * devices;

        // NULL-terminated list of buffer types to use for tensors that match a pattern
        const struct llama_model_tensor_buft_override * tensor_buft_overrides;

        int32_t n_gpu_layers; // number of layers to store in VRAM, a negative value means all layers
        enum llama_split_mode split_mode; // how to split the model across multiple GPUs

        // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
        int32_t main_gpu;

        // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
        const float * tensor_split;

        // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
        // If the provided progress_callback returns true, model loading continues.
        // If it returns false, model loading is immediately aborted.
        llama_progress_callback progress_callback;

        // context pointer passed to the progress callback
        void * progress_callback_user_data;

        // override key-value pairs of the model meta data
        const struct llama_model_kv_override * kv_overrides;

        // Keep the booleans together to avoid misalignment during copy-by-value.
        bool vocab_only;      // only load the vocabulary, no weights
        bool use_mmap;        // use mmap if possible
        bool use_direct_io;   // use direct I/O, takes precedence over use_mmap when supported
        bool use_mlock;       // force system to keep model in RAM
        bool check_tensors;   // validate model tensor data
        bool use_extra_bufts; // use extra buffer types (used for weight repacking)
        bool no_host;         // bypass host buffer allowing extra buffers to be used
        bool no_alloc;        // only load metadata and simulate memory allocations
    };

    struct llama_sampler_seq_config {
        llama_seq_id           seq_id;
        struct llama_sampler * sampler;
    };

    // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
    //       https://github.com/ggml-org/llama.cpp/pull/7544
    struct llama_context_params {
        uint32_t n_ctx;             // text context, 0 = from model
        uint32_t n_batch;           // logical maximum batch size that can be submitted to llama_decode
        uint32_t n_ubatch;          // physical maximum batch size
        uint32_t n_seq_max;         // max number of sequences (i.e. distinct states for recurrent models)
        int32_t  n_threads;         // number of threads to use for generation
        int32_t  n_threads_batch;   // number of threads to use for batch processing

        enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
        enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
        enum llama_attention_type    attention_type;    // attention type to use for embeddings
        enum llama_flash_attn_type   flash_attn_type;   // when to enable Flash Attention

        // ref: https://github.com/ggml-org/llama.cpp/pull/2054
        float    rope_freq_base;   // RoPE base frequency, 0 = from model
        float    rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
        float    yarn_ext_factor;  // YaRN extrapolation mix factor, negative = from model
        float    yarn_attn_factor; // YaRN magnitude scaling factor
        float    yarn_beta_fast;   // YaRN low correction dim
        float    yarn_beta_slow;   // YaRN high correction dim
        uint32_t yarn_orig_ctx;    // YaRN original context size
        float    defrag_thold;     // [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default)

        ggml_backend_sched_eval_callback cb_eval;
        void * cb_eval_user_data;

        enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
        enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]

        // Abort callback
        // if it returns true, execution of llama_decode() will be aborted
        // currently works only with CPU execution
        ggml_abort_callback abort_callback;
        void *              abort_callback_data;

        // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
        bool embeddings;  // if true, extract embeddings (together with logits)
        bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
        bool no_perf;     // skip measuring performance timings
        bool op_offload;  // offload host tensor operations to device
        bool swa_full;    // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
                          // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
                          //       ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
        bool kv_unified;  // use a unified buffer across the input sequences when computing the attention
                          // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
                          // ref: https://github.com/ggml-org/llama.cpp/pull/14363

        // [EXPERIMENTAL]
        // backend sampler chain configuration (make sure the caller keeps the sampler chains alive)
        // note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init)
        struct llama_sampler_seq_config * samplers;
        size_t                            n_samplers;
    };
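
    // Illustrative sketch: start from the defaults and adjust a few fields
    // (the values below are arbitrary examples, not recommendations):
    //
    //     struct llama_context_params cparams = llama_context_default_params();
    //     cparams.n_ctx     = 8192;
    //     cparams.n_batch   = 2048;
    //     cparams.n_threads = 8;
    //     struct llama_context * ctx = llama_init_from_model(model, cparams);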

    // model quantization parameters
    typedef struct llama_model_quantize_params {
        int32_t nthread;                      // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
        enum llama_ftype ftype;               // quantize to this llama_ftype
        enum ggml_type output_tensor_type;    // output tensor type
        enum ggml_type token_embedding_type;  // token embeddings tensor type
        bool allow_requantize;                // allow quantizing non-f32/f16 tensors
        bool quantize_output_tensor;          // quantize output.weight
        bool only_copy;                       // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
        bool pure;                            // quantize all tensors to the default type
        bool keep_split;                      // quantize to the same number of shards
        void * imatrix;                       // pointer to importance matrix data
        void * kv_overrides;                  // pointer to vector containing overrides
        void * tensor_types;                  // pointer to vector containing tensor types
        void * prune_layers;                  // pointer to vector containing layer indices to prune
    } llama_model_quantize_params;

    typedef struct llama_logit_bias {
        llama_token token;
        float bias;
    } llama_logit_bias;

    typedef struct llama_sampler_chain_params {
        bool no_perf; // skip measuring performance timings
    } llama_sampler_chain_params;

    // used in chat template
    typedef struct llama_chat_message {
        const char * role;
        const char * content;
    } llama_chat_message;

    // lora adapter
    struct llama_adapter_lora;

    // Helpers for getting default parameters
    // TODO: update API to start accepting pointers to params structs (https://github.com/ggml-org/llama.cpp/discussions/9172)
    LLAMA_API struct llama_model_params          llama_model_default_params(void);
    LLAMA_API struct llama_context_params        llama_context_default_params(void);
    LLAMA_API struct llama_sampler_chain_params  llama_sampler_chain_default_params(void);
    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);

    // Initialize the llama + ggml backend
    // Call once at the start of the program
    // For NUMA optimizations, call llama_numa_init() below
    LLAMA_API void llama_backend_init(void);

    // Call once at the end of the program - currently only used for MPI
    LLAMA_API void llama_backend_free(void);

    // optional:
    LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);

    // Optional: an auto threadpool gets created in ggml if not passed explicitly
    LLAMA_API void llama_attach_threadpool(
            struct llama_context * ctx,
               ggml_threadpool_t   threadpool,
               ggml_threadpool_t   threadpool_batch);

    LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);

    DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
                             const char * path_model,
              struct llama_model_params   params),
            "use llama_model_load_from_file instead");

    // Load the model from a file
    // If the file is split into multiple parts, the file name must follow this pattern: <name>-%05d-of-%05d.gguf
    // If the split file name does not follow this pattern, use llama_model_load_from_splits
    LLAMA_API struct llama_model * llama_model_load_from_file(
                             const char * path_model,
              struct llama_model_params   params);

    // Load the model from multiple splits (support custom naming scheme)
    // The paths must be in the correct order
    LLAMA_API struct llama_model * llama_model_load_from_splits(
                             const char ** paths,
                                 size_t    n_paths,
              struct llama_model_params    params);
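
    // Illustrative sketch (the paths are placeholders and must be given in order):
    //
    //     const char * paths[] = { "model-part-a.gguf", "model-part-b.gguf" };
    //     struct llama_model * model = llama_model_load_from_splits(paths, 2, llama_model_default_params());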

    LLAMA_API void llama_model_save_to_file(
            const struct llama_model * model,
                        const char * path_model);

    DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
            "use llama_model_free instead");

    LLAMA_API void llama_model_free(struct llama_model * model);

    LLAMA_API struct llama_context * llama_init_from_model(
                     struct llama_model * model,
            struct llama_context_params   params);

    DEPRECATED(LLAMA_API struct llama_context * llama_new_context_with_model(
                     struct llama_model * model,
            struct llama_context_params   params),
            "use llama_init_from_model instead");

    // Frees all allocated memory
    LLAMA_API void llama_free(struct llama_context * ctx);

    enum llama_params_fit_status {
        LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
        LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
        LLAMA_PARAMS_FIT_STATUS_ERROR   = 2, // a hard error occurred, e.g. because no model could be found at the specified path
    };

    // fits mparams and cparams to free device memory (assumes system memory is unlimited)
    //   - returns LLAMA_PARAMS_FIT_STATUS_SUCCESS if the parameters could be successfully modified to fit device memory
    //   - this function is NOT thread safe because it modifies the global llama logger state
    //   - only parameters that have the same value as in llama_model_default_params are modified,
    //     with the exception of the context size, which is modified if and only if it is equal to 0
    LLAMA_API enum llama_params_fit_status llama_params_fit(
                                   const char   * path_model,
                    struct llama_model_params   * mparams,
                    struct llama_context_params * cparams,
                                          float * tensor_split,          // writable buffer for tensor split, needs at least llama_max_devices elements
        struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
                                         size_t * margins,               // margins of memory to leave per device in bytes
                                       uint32_t   n_ctx_min,             // minimum context size to set when trying to reduce memory use
                            enum ggml_log_level   log_level);            // minimum log level to print during fitting, lower levels go to debug log

    LLAMA_API int64_t llama_time_us(void);

    LLAMA_API size_t llama_max_devices(void);
    LLAMA_API size_t llama_max_parallel_sequences(void);
    LLAMA_API size_t llama_max_tensor_buft_overrides(void);

    LLAMA_API bool llama_supports_mmap       (void);
    LLAMA_API bool llama_supports_mlock      (void);
    LLAMA_API bool llama_supports_gpu_offload(void);
    LLAMA_API bool llama_supports_rpc        (void);

    // NOTE: After creating a llama_context, it is recommended to query the actual values using these functions
    //       In some cases the requested values via llama_context_params may differ from the actual values used by the context
    //       ref: https://github.com/ggml-org/llama.cpp/pull/17046#discussion_r2503085732
    LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
    LLAMA_API uint32_t llama_n_ctx_seq  (const struct llama_context * ctx);
    LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
    LLAMA_API uint32_t llama_n_ubatch   (const struct llama_context * ctx);
    LLAMA_API uint32_t llama_n_seq_max  (const struct llama_context * ctx);

    DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead");
    DEPRECATED(LLAMA_API int32_t llama_n_embd     (const struct llama_model * model), "use llama_model_n_embd instead");
    DEPRECATED(LLAMA_API int32_t llama_n_layer    (const struct llama_model * model), "use llama_model_n_layer instead");
    DEPRECATED(LLAMA_API int32_t llama_n_head     (const struct llama_model * model), "use llama_model_n_head instead");

    DEPRECATED(LLAMA_API int32_t llama_n_vocab    (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");

    LLAMA_API const struct llama_model * llama_get_model   (const struct llama_context * ctx);
    LLAMA_API           llama_memory_t   llama_get_memory  (const struct llama_context * ctx);
    LLAMA_API  enum llama_pooling_type   llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type

    LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
    LLAMA_API enum llama_rope_type       llama_model_rope_type(const struct llama_model * model);

    LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_embd     (const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_embd_out (const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_layer    (const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_head     (const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_head_kv  (const struct llama_model * model);
    LLAMA_API int32_t llama_model_n_swa      (const struct llama_model * model);

    // Get the model's RoPE frequency scaling factor
    LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);

    // Returns the number of classifier outputs (only valid for classifier models)
    // Undefined behavior for non-classifier models
    LLAMA_API uint32_t llama_model_n_cls_out(const struct llama_model * model);

    // Returns label of classifier output by index (<n_cls_out). Returns nullptr if no label provided
    LLAMA_API const char * llama_model_cls_label(const struct llama_model * model, uint32_t i);

    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab);

    LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);

    // Functions to access the model's GGUF metadata scalar values
    // - The functions return the length of the string on success, or -1 on failure
    // - The output string is always null-terminated and cleared on failure
    // - When retrieving a string, an extra byte must be allocated to account for the null terminator
    // - GGUF array values are not supported by these functions

    // Get metadata value as a string by key name
    LLAMA_API int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);

    // Get the number of metadata key/value pairs
    LLAMA_API int32_t llama_model_meta_count(const struct llama_model * model);

    // Get sampling metadata key name. Returns nullptr if the key is invalid
    LLAMA_API const char * llama_model_meta_key_str(enum llama_model_meta_key key);

    // Get metadata key name by index
    LLAMA_API int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);

    // Get metadata value as a string by index
    LLAMA_API int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
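
    // Illustrative sketch: enumerating all metadata key/value pairs, assuming the
    // fixed-size buffers below are large enough for the values of interest:
    //
    //     char key[256], val[256];
    //     const int32_t n_kv = llama_model_meta_count(model);
    //     for (int32_t i = 0; i < n_kv; i++) {
    //         llama_model_meta_key_by_index    (model, i, key, sizeof(key));
    //         llama_model_meta_val_str_by_index(model, i, val, sizeof(val));
    //         printf("%s = %s\n", key, val);
    //     }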

    // Get a string describing the model type
    LLAMA_API int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);

    // Returns the total size of all the tensors in the model in bytes
    LLAMA_API uint64_t llama_model_size(const struct llama_model * model);

    // Get a chat template from the model by name. Returns nullptr if not available
    // If name is NULL, returns the model's default chat template
    LLAMA_API const char * llama_model_chat_template(const struct llama_model * model, const char * name);

    // Returns the total number of parameters in the model
    LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);

    // Returns true if the model contains an encoder that requires llama_encode() call
    LLAMA_API bool llama_model_has_encoder(const struct llama_model * model);

    // Returns true if the model contains a decoder that requires llama_decode() call
    LLAMA_API bool llama_model_has_decoder(const struct llama_model * model);

    // For encoder-decoder models, this function returns id of the token that must be provided
    // to the decoder to start generating output sequence. For other models, it returns -1.
    LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model);

    // Returns true if the model is recurrent (like Mamba, RWKV, etc.)
    LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);

    // Returns true if the model is hybrid (like Jamba, Granite, etc.)
    LLAMA_API bool llama_model_is_hybrid(const struct llama_model * model);

    // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
    LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);

    // Returns 0 on success
    LLAMA_API uint32_t llama_model_quantize(
            const char * fname_inp,
            const char * fname_out,
            const llama_model_quantize_params * params);

    //
    // Adapters
    //

    // Load a LoRA adapter from file
    // The adapter is valid as long as the associated model is not freed
    // All adapters must be loaded before context creation
    LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
            struct llama_model * model,
            const char * path_lora);

    // Functions to access the adapter's GGUF metadata scalar values
    // - The functions return the length of the string on success, or -1 on failure
    // - The output string is always null-terminated and cleared on failure
    // - When retrieving a string, an extra byte must be allocated to account for the null terminator
    // - GGUF array values are not supported by these functions

    // Get metadata value as a string by key name
    LLAMA_API int32_t llama_adapter_meta_val_str(const struct llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size);

    // Get the number of metadata key/value pairs
    LLAMA_API int32_t llama_adapter_meta_count(const struct llama_adapter_lora * adapter);

    // Get metadata key name by index
    LLAMA_API int32_t llama_adapter_meta_key_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);

    // Get metadata value as a string by index
    LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);

    // Manually free a LoRA adapter
    // NOTE: loaded adapters will be freed when the associated model is deleted
    LLAMA_API DEPRECATED(void llama_adapter_lora_free(struct llama_adapter_lora * adapter),
            "adapters are now freed together with the associated model");

    // Get the invocation tokens if the current lora is an alora
    LLAMA_API uint64_t            llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
    LLAMA_API const llama_token * llama_adapter_get_alora_invocation_tokens  (const struct llama_adapter_lora * adapter);

    // The following functions operate on a llama_context, hence the naming: llama_verb_...

    // Add a loaded LoRA adapter to given context
    // This will not modify the model's weights
    LLAMA_API int32_t llama_set_adapter_lora(
            struct llama_context * ctx,
            struct llama_adapter_lora * adapter,
            float scale);

    // Remove a specific LoRA adapter from given context
    // Return -1 if the adapter is not present in the context
    LLAMA_API int32_t llama_rm_adapter_lora(
            struct llama_context * ctx,
            struct llama_adapter_lora * adapter);

    // Remove all LoRA adapters from given context
    LLAMA_API void llama_clear_adapter_lora(struct llama_context * ctx);

    // Apply a loaded control vector to a llama_context, or if data is NULL, clear
    // the currently loaded vector.
    // n_embd should be the size of a single layer's control, and data should point
    // to an n_embd x n_layers buffer starting from layer 1.
    // il_start and il_end are the layer range the vector should apply to (both inclusive)
    // See llama_control_vector_load in common to load a control vector.
    LLAMA_API int32_t llama_apply_adapter_cvec(
            struct llama_context * ctx,
                     const float * data,
                          size_t   len,
                         int32_t   n_embd,
                         int32_t   il_start,
                         int32_t   il_end);
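
    // Illustrative sketch, assuming `cvec_data` holds n_embd floats per layer starting
    // from layer 1 (a hypothetical caller-provided buffer, not part of the API);
    // passing NULL afterwards clears the currently loaded vector:
    //
    //     const int32_t n_embd  = llama_model_n_embd (model);
    //     const int32_t n_layer = llama_model_n_layer(model);
    //     llama_apply_adapter_cvec(ctx, cvec_data, (size_t) n_embd * n_layer, n_embd, 1, n_layer);
    //     llama_apply_adapter_cvec(ctx, NULL, 0, n_embd, 0, 0); // clear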

    //
    // Memory
    //

    // Clear the memory contents
    // If data == true, the data buffers will also be cleared together with the metadata
    LLAMA_API void llama_memory_clear(
            llama_memory_t mem,
                      bool data);

    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
    // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
    // seq_id < 0 : match any sequence
    // p0 < 0     : [0,  p1]
    // p1 < 0     : [p0, inf)
    LLAMA_API bool llama_memory_seq_rm(
            llama_memory_t mem,
              llama_seq_id seq_id,
                 llama_pos p0,
                 llama_pos p1);

    // Copy all tokens that belong to the specified sequence to another sequence
    // p0 < 0 : [0,  p1]
    // p1 < 0 : [p0, inf)
    LLAMA_API void llama_memory_seq_cp(
            llama_memory_t mem,
              llama_seq_id seq_id_src,
              llama_seq_id seq_id_dst,
                 llama_pos p0,
                 llama_pos p1);

    // Removes all tokens that do not belong to the specified sequence
    LLAMA_API void llama_memory_seq_keep(
            llama_memory_t mem,
              llama_seq_id seq_id);

    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
    // p0 < 0 : [0,  p1]
    // p1 < 0 : [p0, inf)
    LLAMA_API void llama_memory_seq_add(
            llama_memory_t mem,
              llama_seq_id seq_id,
                 llama_pos p0,
                 llama_pos p1,
                 llama_pos delta);

    // Integer division of the positions by factor of `d > 1`
    // p0 < 0 : [0,  p1]
    // p1 < 0 : [p0, inf)
    LLAMA_API void llama_memory_seq_div(
            llama_memory_t mem,
              llama_seq_id seq_id,
                 llama_pos p0,
                 llama_pos p1,
                       int d);

    // Returns the smallest position present in the memory for the specified sequence
    // This is typically non-zero only for SWA caches
    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
    // Return -1 if the sequence is empty
    LLAMA_API llama_pos llama_memory_seq_pos_min(
            llama_memory_t mem,
              llama_seq_id seq_id);

    // Returns the largest position present in the memory for the specified sequence
    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
    // Return -1 if the sequence is empty
    LLAMA_API llama_pos llama_memory_seq_pos_max(
            llama_memory_t mem,
              llama_seq_id seq_id);

    // Check if the memory supports shifting
    LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);
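
    // Illustrative "context shift" sketch: drop the oldest `n_discard` positions of
    // sequence 0, then shift the remaining tokens down so that new ones fit
    // (`n_discard` is a hypothetical caller-chosen value):
    //
    //     llama_memory_t mem = llama_get_memory(ctx);
    //     if (llama_memory_can_shift(mem)) {
    //         llama_memory_seq_rm (mem, 0, 0,         n_discard);
    //         llama_memory_seq_add(mem, 0, n_discard, -1, -n_discard);
    //     }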

    //
    // State / sessions
    //

    // Returns the *actual* size in bytes of the state
    // (logits, embedding and memory)
    // Only use when saving the state, not when restoring it, otherwise the size may be too small.
    LLAMA_API size_t llama_state_get_size(struct llama_context * ctx);
    LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
        "use llama_state_get_size instead");

    // Copies the state to the specified destination address.
    // Destination needs to have allocated enough memory.
    // Returns the number of bytes copied
    LLAMA_API size_t llama_state_get_data(
            struct llama_context * ctx,
                         uint8_t * dst,
                          size_t   size);
    LLAMA_API DEPRECATED(size_t llama_copy_state_data(
            struct llama_context * ctx,
                         uint8_t * dst),
        "use llama_state_get_data instead");

    // Set the state reading from the specified address
    // Returns the number of bytes read
    LLAMA_API size_t llama_state_set_data(
            struct llama_context * ctx,
                   const uint8_t * src,
                          size_t   size);
    LLAMA_API DEPRECATED(size_t llama_set_state_data(
            struct llama_context * ctx,
                   const uint8_t * src),
        "use llama_state_set_data instead");

    // Save/load session file
    LLAMA_API bool llama_state_load_file(
            struct llama_context * ctx,
                      const char * path_session,
                     llama_token * tokens_out,
                          size_t   n_token_capacity,
                          size_t * n_token_count_out);
    LLAMA_API DEPRECATED(bool llama_load_session_file(
            struct llama_context * ctx,
                      const char * path_session,
                     llama_token * tokens_out,
                          size_t   n_token_capacity,
                          size_t * n_token_count_out),
        "use llama_state_load_file instead");

    LLAMA_API bool llama_state_save_file(
            struct llama_context * ctx,
                      const char * path_session,
               const llama_token * tokens,
                          size_t   n_token_count);
    LLAMA_API DEPRECATED(bool llama_save_session_file(
            struct llama_context * ctx,
                      const char * path_session,
               const llama_token * tokens,
                          size_t   n_token_count),
        "use llama_state_save_file instead");

    // Get the exact size needed to copy the state of a single sequence
    LLAMA_API size_t llama_state_seq_get_size(
            struct llama_context * ctx,
                    llama_seq_id   seq_id);

    // Copy the state of a single sequence into the specified buffer
    LLAMA_API size_t llama_state_seq_get_data(
            struct llama_context * ctx,
                         uint8_t * dst,
                          size_t   size,
                    llama_seq_id   seq_id);

    // Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence
    // Returns:
    //  - Positive: Ok
    //  - Zero: Failed to load
    LLAMA_API size_t llama_state_seq_set_data(
            struct llama_context * ctx,
                   const uint8_t * src,
                          size_t   size,
                    llama_seq_id   dest_seq_id);

    LLAMA_API size_t llama_state_seq_save_file(
            struct llama_context * ctx,
                      const char * filepath,
                    llama_seq_id   seq_id,
               const llama_token * tokens,
                          size_t   n_token_count);

    LLAMA_API size_t llama_state_seq_load_file(
            struct llama_context * ctx,
                      const char * filepath,
                    llama_seq_id   dest_seq_id,
                     llama_token * tokens_out,
                          size_t   n_token_capacity,
                          size_t * n_token_count_out);

// for backwards-compat
#define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1

// work only with partial states, such as SWA KV cache or recurrent cache (e.g. Mamba)
#define LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY 1

    typedef uint32_t llama_state_seq_flags;

    LLAMA_API size_t llama_state_seq_get_size_ext(
            struct llama_context * ctx,
                    llama_seq_id   seq_id,
           llama_state_seq_flags   flags);

    LLAMA_API size_t llama_state_seq_get_data_ext(
            struct llama_context * ctx,
                         uint8_t * dst,
                          size_t   size,
                    llama_seq_id   seq_id,
           llama_state_seq_flags   flags);

    LLAMA_API size_t llama_state_seq_set_data_ext(
            struct llama_context * ctx,
                   const uint8_t * src,
                          size_t   size,
                    llama_seq_id   dest_seq_id,
           llama_state_seq_flags   flags);

    //
    // Decoding
    //

    // Return batch for single sequence of tokens
    // The sequence ID will be fixed to 0
    // The position of the tokens will be tracked automatically by llama_decode
    //
    // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
    //
    LLAMA_API struct llama_batch llama_batch_get_one(
                  llama_token * tokens,
                      int32_t   n_tokens);

    // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
    // Each token can be assigned up to n_seq_max sequence ids
    // The batch has to be freed with llama_batch_free()
    // If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float)
    // Otherwise, llama_batch.token will be allocated to store n_tokens llama_token
    // The rest of the llama_batch members are allocated with size n_tokens
    // All members are left uninitialized
    LLAMA_API struct llama_batch llama_batch_init(
            int32_t n_tokens,
            int32_t embd,
            int32_t n_seq_max);

    // Frees a batch of tokens allocated with llama_batch_init()
    LLAMA_API void llama_batch_free(struct llama_batch batch);

    // Process a batch of tokens.
    // In contrast to llama_decode() - this call does not use KV cache.
    // For encoder-decoder contexts, processes the batch using the encoder.
    // Can store the encoder output internally for later use by the decoder's cross-attention layers.
    //   0 - success
    // < 0 - error. the memory state is restored to the state before this call
    LLAMA_API int32_t llama_encode(
            struct llama_context * ctx,
              struct llama_batch   batch);

    // Process a batch of tokens.
    // Requires the context to have a memory.
    // For encoder-decoder contexts, processes the batch using the decoder.
    // Positive return values do not indicate a fatal error, but rather a warning.
    // Upon fatal error or abort, the ubatches that were processed will remain in the memory state of the context
    //   To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max()
    // Upon other return values, the memory state is restored to the state before this call
    //    0 - success
    //    1 - could not find a KV slot for the batch (try reducing the size of the batch or increasing the context size)
    //    2 - aborted     (processed ubatches will remain in the context's memory)
    //   -1 - invalid input batch
    // < -1 - fatal error (processed ubatches will remain in the context's memory)
    LLAMA_API int32_t llama_decode(
            struct llama_context * ctx,
              struct llama_batch   batch);
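
    // Illustrative greedy-decoding sketch built only from the calls in this header
    // (no error recovery; `tok` is assumed to start as the last prompt token):
    //
    //     const struct llama_vocab * vocab = llama_model_get_vocab(llama_get_model(ctx));
    //     const int32_t n_vocab = llama_vocab_n_tokens(vocab);
    //     while (!llama_vocab_is_eog(vocab, tok)) {
    //         struct llama_batch batch = llama_batch_get_one(&tok, 1);
    //         if (llama_decode(ctx, batch) != 0) {
    //             break;
    //         }
    //         const float * logits = llama_get_logits_ith(ctx, -1);
    //         tok = 0;
    //         for (int32_t i = 1; i < n_vocab; i++) {
    //             if (logits[i] > logits[tok]) {
    //                 tok = i;
    //             }
    //         }
    //     }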

    // Set the number of threads used for decoding
    // n_threads is the number of threads used for generation (single token)
    // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
    LLAMA_API void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch);

    // Get the number of threads used for generation of a single token.
    LLAMA_API int32_t llama_n_threads(struct llama_context * ctx);

    // Get the number of threads used for prompt and batch processing (multiple tokens).
    LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);

    // Set whether the context outputs embeddings or not
    // TODO: rename to avoid confusion with llama_get_embeddings()
    LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);

    // Set whether to use causal attention or not
    // If set to true, the model will only attend to the past tokens
    LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);

    // Set whether the model is in warmup mode or not
    // If true, all model tensors are activated during llama_decode() to load and cache their weights.
    LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup);

    // Set abort callback
    LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);

    // Wait until all computations are finished
    // This is automatically done when using one of the functions below to obtain the computation results,
    // so in most cases it is not necessary to call it explicitly
    LLAMA_API void llama_synchronize(struct llama_context * ctx);

    // Token logits obtained from the last call to llama_decode()
    // The logits for which llama_batch.logits[i] != 0 are stored contiguously
    // in the order they have appeared in the batch.
    // Rows: number of tokens for which llama_batch.logits[i] != 0
    // Cols: n_vocab
    // TODO: deprecate in favor of llama_get_logits_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
    LLAMA_API float * llama_get_logits(struct llama_context * ctx);

    // Logits for the ith token. For positive indices, equivalent to:
    // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
    // Negative indices can be used to access logits in reverse order, -1 is the last logit.
    // Returns NULL for invalid ids.
    LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);

    // Get all output token embeddings.
    // when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
    // the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
    // in the order they have appeared in the batch.
    // shape: [n_outputs*n_embd]
    // Otherwise, returns NULL.
    // TODO: deprecate in favor of llama_get_embeddings_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

    // Get the embeddings for the ith token. For positive indices, equivalent to:
    // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
    // Negative indices can be used to access embeddings in reverse order, -1 is the last embedding.
    // shape: [n_embd] (1-dimensional)
    // Returns NULL for invalid ids.
    LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);

    // Get the embeddings for a sequence id
    // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
    // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[n_cls_out] with the rank(s) of the sequence
    // otherwise: float[n_embd] (1-dimensional)
    LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
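
    // Illustrative sketch, assuming the context was created with embeddings enabled
    // and a pooling type other than LLAMA_POOLING_TYPE_NONE:
    //
    //     const float * emb    = llama_get_embeddings_seq(ctx, 0);
    //     const int32_t n_embd = llama_model_n_embd(llama_get_model(ctx));
    //     // emb points to n_embd floats for sequence 0 (or is NULL on failure)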

    //
    // backend sampling API [EXPERIMENTAL]
    // note: use only if the llama_context was created with at least one llama_sampler_seq_config
    //

    // Get the backend sampled token for the ith token.
    // Returns LLAMA_TOKEN_NULL if no token was sampled.
    LLAMA_API llama_token llama_get_sampled_token_ith(struct llama_context * ctx, int32_t i);

    // Get the backend sampled probabilities for the ith token
    // The index matches llama_get_sampled_token_ith().
    // Returns NULL if no probabilities were generated.
    LLAMA_API float *  llama_get_sampled_probs_ith      (struct llama_context * ctx, int32_t i);
    LLAMA_API uint32_t llama_get_sampled_probs_count_ith(struct llama_context * ctx, int32_t i);

    // Get the backend sampled logits for the ith token
    // Returns NULL if no logits were sampled.
    LLAMA_API float *  llama_get_sampled_logits_ith      (struct llama_context * ctx, int32_t i);
    LLAMA_API uint32_t llama_get_sampled_logits_count_ith(struct llama_context * ctx, int32_t i);

    // Get the backend sampled candidates (token ids) for the ith token
    // These are needed to map probability/logit indices to vocab token ids.
    // Returns NULL if no candidates were sampled.
    LLAMA_API llama_token * llama_get_sampled_candidates_ith      (struct llama_context * ctx, int32_t i);
    LLAMA_API uint32_t      llama_get_sampled_candidates_count_ith(struct llama_context * ctx, int32_t i);
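
    // Illustrative sketch: reading back the sampled distribution for output i
    // (the getters return NULL / LLAMA_TOKEN_NULL when backend sampling was not used):
    //
    //     const llama_token   tok   = llama_get_sampled_token_ith(ctx, i);
    //     const float       * probs = llama_get_sampled_probs_ith(ctx, i);
    //     const llama_token * cand  = llama_get_sampled_candidates_ith(ctx, i);
    //     const uint32_t      n     = llama_get_sampled_probs_count_ith(ctx, i);
    //     // probs[j] is the probability of candidate token cand[j], for j < n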

    //
    // Vocab
    //

    LLAMA_API const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token);

    LLAMA_API float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token);

    LLAMA_API enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token);

    // Check if the token is supposed to end generation (end-of-generation, e.g. EOS, EOT, etc.)
    LLAMA_API bool llama_vocab_is_eog(const struct llama_vocab * vocab, llama_token token);

    // Check whether the token id is a control token (as opposed to a renderable token)
    LLAMA_API bool llama_vocab_is_control(const struct llama_vocab * vocab, llama_token token);

    // Special tokens
    LLAMA_API llama_token llama_vocab_bos(const struct llama_vocab * vocab); // beginning-of-sentence
    LLAMA_API llama_token llama_vocab_eos(const struct llama_vocab * vocab); // end-of-sentence
    LLAMA_API llama_token llama_vocab_eot(const struct llama_vocab * vocab); // end-of-turn
    LLAMA_API llama_token llama_vocab_sep(const struct llama_vocab * vocab); // sentence separator
    LLAMA_API llama_token llama_vocab_nl (const struct llama_vocab * vocab); // next-line
    LLAMA_API llama_token llama_vocab_pad(const struct llama_vocab * vocab); // padding
    LLAMA_API llama_token llama_vocab_mask(const struct llama_vocab * vocab); // mask

    LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
    LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
    LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab);

    LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
    LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
    LLAMA_API llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab);
    LLAMA_API llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab);
    LLAMA_API llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab);
    LLAMA_API llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab);

    DEPRECATED(LLAMA_API const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_text instead");
    DEPRECATED(LLAMA_API float llama_token_get_score(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_score instead");
    DEPRECATED(LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_attr instead");
    DEPRECATED(LLAMA_API bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_eog instead");
    DEPRECATED(LLAMA_API bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_control instead");
    DEPRECATED(LLAMA_API llama_token llama_token_bos(const struct llama_vocab * vocab), "use llama_vocab_bos instead");
    DEPRECATED(LLAMA_API llama_token llama_token_eos(const struct llama_vocab * vocab), "use llama_vocab_eos instead");
    DEPRECATED(LLAMA_API llama_token llama_token_eot(const struct llama_vocab * vocab), "use llama_vocab_eot instead");
    DEPRECATED(LLAMA_API llama_token llama_token_cls(const struct llama_vocab * vocab), "use llama_vocab_cls instead");
    DEPRECATED(LLAMA_API llama_token llama_token_sep(const struct llama_vocab * vocab), "use llama_vocab_sep instead");
    DEPRECATED(LLAMA_API llama_token llama_token_nl (const struct llama_vocab * vocab), "use llama_vocab_nl instead");
    DEPRECATED(LLAMA_API llama_token llama_token_pad(const struct llama_vocab * vocab), "use llama_vocab_pad instead");
    DEPRECATED(LLAMA_API bool llama_add_bos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_bos instead");
    DEPRECATED(LLAMA_API bool llama_add_eos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_eos instead");
    DEPRECATED(LLAMA_API llama_token llama_token_fim_pre(const struct llama_vocab * vocab), "use llama_vocab_fim_pre instead");
    DEPRECATED(LLAMA_API llama_token llama_token_fim_suf(const struct llama_vocab * vocab), "use llama_vocab_fim_suf instead");
    DEPRECATED(LLAMA_API llama_token llama_token_fim_mid(const struct llama_vocab * vocab), "use llama_vocab_fim_mid instead");
    DEPRECATED(LLAMA_API llama_token llama_token_fim_pad(const struct llama_vocab * vocab), "use llama_vocab_fim_pad instead");
    DEPRECATED(LLAMA_API llama_token llama_token_fim_rep(const struct llama_vocab * vocab), "use llama_vocab_fim_rep instead");
    DEPRECATED(LLAMA_API llama_token llama_token_fim_sep(const struct llama_vocab * vocab), "use llama_vocab_fim_sep instead");

    // CLS is equivalent to BOS
    DEPRECATED(LLAMA_API llama_token llama_vocab_cls(const struct llama_vocab * vocab), // classification
            "use llama_vocab_bos instead");

    //
    // Tokenization
    //
    // The API is thread-safe.
    //

    /// @details Convert the provided text into tokens.
    /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
    /// @param add_special Allow adding BOS and EOS tokens if the model is configured to do so.
    /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
    ///                      as plaintext. Does not insert a leading space.
    /// @return Returns the number of tokens on success, no more than n_tokens_max
    /// @return Returns a negative number on failure - the number of tokens that would have been returned
    /// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit)
    LLAMA_API int32_t llama_tokenize(
        const struct llama_vocab * vocab,
                      const char * text,
                         int32_t   text_len,
                     llama_token * tokens,
                         int32_t   n_tokens_max,
                            bool   add_special,
                            bool   parse_special);

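    // A common calling pattern (an illustrative sketch, not part of the API) is to
    // tokenize once and, if the buffer was too small, retry with the size indicated
    // by the negative return value:
    //
    //    std::vector<llama_token> tokens(64);
    //
    //    int32_t n = llama_tokenize(vocab, text, (int32_t) strlen(text), tokens.data(), (int32_t) tokens.size(), true, false);
    //    if (n == INT32_MIN) {
    //        // overflow - treat as a hard error
    //    } else if (n < 0) {
    //        tokens.resize(-n); // retry with the required size
    //        n = llama_tokenize(vocab, text, (int32_t) strlen(text), tokens.data(), (int32_t) tokens.size(), true, false);
    //    }
    //    tokens.resize(n);
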
    // Token Id -> Piece.
    // Uses the provided vocabulary.
    // Does not write null terminator to the buffer.
    // User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')
    // @param special If true, special tokens are rendered in the output.
    LLAMA_API int32_t llama_token_to_piece(
              const struct llama_vocab * vocab,
                           llama_token   token,
                                  char * buf,
                               int32_t   length,
                               int32_t   lstrip,
                                  bool   special);

    /// @details Convert the provided tokens into text (inverse of llama_tokenize()).
    /// @param text The char pointer must be large enough to hold the resulting text.
    /// @param remove_special Allow removing BOS and EOS tokens if the model is configured to do so.
    /// @param unparse_special If true, special tokens are rendered in the output.
    /// @return Returns the number of chars/bytes on success, no more than text_len_max.
    /// @return Returns a negative number on failure - the number of chars/bytes that would have been returned.
    LLAMA_API int32_t llama_detokenize(
        const struct llama_vocab * vocab,
               const llama_token * tokens,
                         int32_t   n_tokens,
                            char * text,
                         int32_t   text_len_max,
                            bool   remove_special,
                            bool   unparse_special);

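    // The same retry pattern applies here (an illustrative sketch; the initial buffer
    // size is an arbitrary guess):
    //
    //    std::string text(n_tokens * 8, '\0');
    //
    //    int32_t n = llama_detokenize(vocab, tokens, n_tokens, text.data(), (int32_t) text.size(), false, false);
    //    if (n < 0) {
    //        text.resize(-n); // retry with the required size
    //        n = llama_detokenize(vocab, tokens, n_tokens, text.data(), (int32_t) text.size(), false, false);
    //    }
    //    text.resize(n);
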
    //
    // Chat templates
    //

    /// Apply chat template. Inspired by hf apply_chat_template() in Python.
    /// NOTE: This function does not use a Jinja parser. It only supports a pre-defined list of templates. See more: https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
    /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model's default chat template will be used instead.
    /// @param chat Pointer to a list of multiple llama_chat_message
    /// @param n_msg Number of llama_chat_message in this chat
    /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message.
    /// @param buf A buffer to hold the output formatted prompt. The recommended allocation size is 2 * (total number of characters of all messages).
    /// @param length The size of the allocated buffer
    /// @return The total number of bytes of the formatted prompt. If it is larger than the size of the buffer, you may need to re-alloc the buffer and then re-apply the template.
    LLAMA_API int32_t llama_chat_apply_template(
                            const char * tmpl,
       const struct llama_chat_message * chat,
                                size_t   n_msg,
                                  bool   add_ass,
                                  char * buf,
                               int32_t   length);

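    // Typical usage (an illustrative sketch; the initial buffer size is arbitrary and
    // llama_chat_message is assumed to be the { role, content } pair declared earlier
    // in this header):
    //
    //    llama_chat_message chat[] = {
    //        { "system", "You are a helpful assistant." },
    //        { "user",   "Hello!"                       },
    //    };
    //
    //    std::vector<char> buf(1024);
    //
    //    int32_t n = llama_chat_apply_template(tmpl, chat, 2, true, buf.data(), (int32_t) buf.size());
    //    if (n > (int32_t) buf.size()) {
    //        buf.resize(n); // re-alloc and re-apply
    //        n = llama_chat_apply_template(tmpl, chat, 2, true, buf.data(), (int32_t) buf.size());
    //    }
    //    // the formatted prompt is buf.data()[0..n)
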
    // Get list of built-in chat templates
    LLAMA_API int32_t llama_chat_builtin_templates(const char ** output, size_t len);

    //
    // Sampling API
    //
    // Sample usage:
    //
    //    // prepare the sampling chain at the start
    //    auto sparams = llama_sampler_chain_default_params();
    //
    //    llama_sampler * smpl = llama_sampler_chain_init(sparams);
    //
    //    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(50));
    //    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9, 1));
    //    llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.8));
    //
    //    // typically, the chain should end with a sampler such as "greedy", "dist" or "mirostat"
    //    // this sampler will be responsible for selecting the actual token
    //    llama_sampler_chain_add(smpl, llama_sampler_init_dist(seed));
    //
    //    ...
    //
    //    // decoding loop:
    //    while (...) {
    //        ...
    //
    //        llama_decode(ctx, batch);
    //
    //        // sample from the logits of the last token in the batch
    //        const llama_token id = llama_sampler_sample(smpl, ctx, -1);
    //
    //        ...
    //    }
    //
    //    llama_sampler_free(smpl);
    //

    typedef void * llama_sampler_context_t;

    struct llama_sampler_data {
        struct ggml_tensor * logits;
        struct ggml_tensor * probs;
        struct ggml_tensor * sampled;
        struct ggml_tensor * candidates;
    };

    // user code can implement the interface below in order to create a custom llama_sampler
    struct llama_sampler_i {
        const char *           (*name)  (const struct llama_sampler * smpl);                                 // can be NULL
        void                   (*accept)(      struct llama_sampler * smpl, llama_token token);              // can be NULL
        void                   (*apply) (      struct llama_sampler * smpl, llama_token_data_array * cur_p); // required
        void                   (*reset) (      struct llama_sampler * smpl);                                 // can be NULL
        struct llama_sampler * (*clone) (const struct llama_sampler * smpl);                                 // can be NULL if ctx is NULL
        void                   (*free)  (      struct llama_sampler * smpl);                                 // can be NULL if ctx is NULL

        // [EXPERIMENTAL]
        // backend sampling interface:

        // return true if the backend supports all ops needed by the sampler
        // note: call once per sampler
        bool (*backend_init)(struct llama_sampler * smpl, ggml_backend_buffer_type_t buft);

        // call after .backend_apply()
        void (*backend_accept)(
                struct llama_sampler * smpl,
                struct ggml_context  * ctx,
                struct ggml_cgraph   * gf,
                struct ggml_tensor   * selected_token);

        // call after .backend_init()
        void (*backend_apply)(
                struct llama_sampler      * smpl,
                struct ggml_context       * ctx,
                struct ggml_cgraph        * gf,
                struct llama_sampler_data * data);

        // called before graph execution to set inputs for the current ubatch
        void (*backend_set_input)(struct llama_sampler * smpl);
    };

    struct llama_sampler {
        struct llama_sampler_i * iface;

        llama_sampler_context_t ctx;
    };
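
    // For example, a custom sampler that keeps only the highest-logit candidate could
    // look roughly like this (an illustrative sketch; the backend callbacks are left
    // zero-initialized/NULL, which assumes the sampler is only used on the CPU path):
    //
    //    static const char * my_name(const struct llama_sampler * smpl) {
    //        return "my-argmax";
    //    }
    //
    //    static void my_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
    //        size_t best = 0;
    //        for (size_t i = 1; i < cur_p->size; ++i) {
    //            if (cur_p->data[i].logit > cur_p->data[best].logit) {
    //                best = i;
    //            }
    //        }
    //        cur_p->data[0]  = cur_p->data[best];
    //        cur_p->size     = 1;
    //        cur_p->selected = 0;
    //    }
    //
    //    static struct llama_sampler_i my_iface = {
    //        /*.name   =*/ my_name,
    //        /*.accept =*/ NULL,
    //        /*.apply  =*/ my_apply, // required
    //        /*.reset  =*/ NULL,
    //        /*.clone  =*/ NULL,
    //        /*.free   =*/ NULL,
    //    };
    //
    //    struct llama_sampler * smpl = llama_sampler_init(&my_iface, /*ctx =*/ NULL);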

    // [EXPERIMENTAL]
    // attach a sampler to the context
    // note: prefer initializing the context with llama_context_params.samplers when possible
    LLAMA_API bool llama_set_sampler(struct llama_context * ctx, llama_seq_id seq_id, struct llama_sampler * smpl);

    // mirror of llama_sampler_i:
    LLAMA_API struct llama_sampler * llama_sampler_init  (      struct llama_sampler_i * iface, llama_sampler_context_t ctx);
    LLAMA_API const char *           llama_sampler_name  (const struct llama_sampler * smpl);
    LLAMA_API void                   llama_sampler_accept(      struct llama_sampler * smpl, llama_token token);
    LLAMA_API void                   llama_sampler_apply (      struct llama_sampler * smpl, llama_token_data_array * cur_p);
    LLAMA_API void                   llama_sampler_reset (      struct llama_sampler * smpl);
    LLAMA_API struct llama_sampler * llama_sampler_clone (const struct llama_sampler * smpl);
    // important: do not free if the sampler has been added to a llama_sampler_chain (via llama_sampler_chain_add)
    LLAMA_API void                   llama_sampler_free  (      struct llama_sampler * smpl);

    // llama_sampler_chain
    // a type of llama_sampler that can chain multiple samplers one after another

    LLAMA_API struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params);

    // important: takes ownership of the sampler object and will free it when llama_sampler_free is called
    LLAMA_API void                   llama_sampler_chain_add(      struct llama_sampler * chain, struct llama_sampler * smpl);

    // returns the i-th sampler in the chain, or NULL if:
    //   - the sampler is NULL
    //   - the sampler is not a llama_sampler_chain
    //   - the index is out of bounds
    // special case: if i == -1, returns the chain itself (can be used to check whether the sampler is a chain)
    LLAMA_API struct llama_sampler * llama_sampler_chain_get(      struct llama_sampler * chain, int32_t i);

    // the total number of samplers in the chain
    LLAMA_API int                    llama_sampler_chain_n  (const struct llama_sampler * chain);

    // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
    LLAMA_API struct llama_sampler * llama_sampler_chain_remove(   struct llama_sampler * chain, int32_t i);

    // available samplers:

    LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);

    /// seed == LLAMA_DEFAULT_SEED to use a random seed.
    LLAMA_API struct llama_sampler * llama_sampler_init_dist(uint32_t seed);

    /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
    /// Setting k <= 0 makes this a noop
    LLAMA_API struct llama_sampler * llama_sampler_init_top_k      (int32_t k);

    /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
    LLAMA_API struct llama_sampler * llama_sampler_init_top_p      (float   p, size_t min_keep);

    /// @details Minimum P sampling as described in https://github.com/ggml-org/llama.cpp/pull/3841
    LLAMA_API struct llama_sampler * llama_sampler_init_min_p      (float   p, size_t min_keep);

    /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
    LLAMA_API struct llama_sampler * llama_sampler_init_typical    (float   p, size_t min_keep);

    /// @details Updates the logits l_i' = l_i/t. When t <= 0.0f, the maximum logit is kept at its original value, the rest are set to -inf
    LLAMA_API struct llama_sampler * llama_sampler_init_temp       (float   t);

    /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
    LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext   (float   t, float   delta, float exponent);

    /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
    LLAMA_API struct llama_sampler * llama_sampler_init_xtc        (float   p, float   t,     size_t min_keep, uint32_t seed);

    /// @details Top n sigma sampling as described in academic paper "Top-nσ: Not All Logits Are You Need" https://arxiv.org/pdf/2411.07641
    LLAMA_API struct llama_sampler * llama_sampler_init_top_n_sigma(float   n);

    /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
    /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
    /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
    /// note: `mu` (the maximum cross-entropy) is maintained internally by the sampler. It is initialized to twice the target cross-entropy (`2 * tau`) and updated based on the error between the target and observed surprisal.
    LLAMA_API struct llama_sampler * llama_sampler_init_mirostat(
                             int32_t   n_vocab,
                            uint32_t   seed,
                               float   tau,
                               float   eta,
                             int32_t   m);

    /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
    /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
    /// note: `mu` (the maximum cross-entropy) is maintained internally by the sampler. It is initialized to twice the target cross-entropy (`2 * tau`) and updated based on the error between the target and observed surprisal.
    LLAMA_API struct llama_sampler * llama_sampler_init_mirostat_v2(
                            uint32_t   seed,
                               float   tau,
                               float   eta);

    /// @details Initializes a GBNF grammar, see grammars/README.md for details.
    /// @param vocab The vocabulary that this grammar will be used with.
    /// @param grammar_str The production rules for the grammar, encoded as a string. Returns an empty grammar if grammar_str is empty. Returns NULL if parsing of grammar_str fails.
    /// @param grammar_root The name of the start symbol for the grammar.
    LLAMA_API struct llama_sampler * llama_sampler_init_grammar(
            const struct llama_vocab * vocab,
                          const char * grammar_str,
                          const char * grammar_root);

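    // For example, a minimal grammar that constrains generation to a yes/no answer might
    // look like this (an illustrative sketch; see grammars/README.md for the full syntax):
    //
    //    const char * grammar_str = "root ::= \"yes\" | \"no\"";
    //
    //    struct llama_sampler * g = llama_sampler_init_grammar(vocab, grammar_str, "root");
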
    DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy(
            const struct llama_vocab * vocab,
                          const char * grammar_str,
                          const char * grammar_root,
                         const char ** trigger_words,
                                size_t num_trigger_words,
                   const llama_token * trigger_tokens,
                                size_t num_trigger_tokens),
        "use llama_sampler_init_grammar_lazy_patterns instead");


    /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639
    /// @param trigger_patterns A list of patterns that will trigger the grammar sampler. Each pattern is matched from the start of the generation output, and the grammar sampler will be fed content starting from its first match group.
    /// @param trigger_tokens A list of tokens that will trigger the grammar sampler. The grammar sampler will be fed content starting from the trigger token (included).
    LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy_patterns(
        const struct llama_vocab * vocab,
                      const char * grammar_str,
                      const char * grammar_root,
                     const char ** trigger_patterns,
                            size_t num_trigger_patterns,
               const llama_token * trigger_tokens,
                            size_t num_trigger_tokens);


    /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
    LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
                             int32_t   penalty_last_n,   // last n tokens to penalize (0 = disable penalty, -1 = context size)
                               float   penalty_repeat,   // 1.0 = disabled
                               float   penalty_freq,     // 0.0 = disabled
                               float   penalty_present); // 0.0 = disabled

    ///  @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
    LLAMA_API struct llama_sampler * llama_sampler_init_dry(
            const struct llama_vocab *  vocab,
                             int32_t    n_ctx_train,
                               float    dry_multiplier,
                               float    dry_base,
                             int32_t    dry_allowed_length,
                             int32_t    dry_penalty_last_n,
                          const char ** seq_breakers,
                              size_t    num_breakers);

    /// adaptive-p: select tokens near a configurable target probability over time.
    ///
    /// the adaptive-p sampler transforms the token probability distribution to favor tokens
    /// that fall near a user-configurable probability target.
    ///
    /// internally, the sampler maintains an exponential moving average of the *ORIGINAL*
    /// probabilities of selected tokens at each sampling step. it uses this EMA to compute an
    /// adapted target probability at each sampling step, thus maintaining the desired target
    /// probability over time.
    ///
    /// adaptive-p selects a token ID rather than just mutating candidates, so it must be last
    /// in the sampler chain (like mirostat, dist, greedy).
    ///
    /// only mild truncation before this sampler is recommended. we suggest applying min-p
    /// before adaptive-p as the only other active sampler in the chain.
    ///
    /// @param target select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
    /// @param decay  EMA decay for adaptation; history ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99)
    /// @param seed   RNG seed
    ///
    /// ref: https://github.com/ggml-org/llama.cpp/pull/17927
    ///
    LLAMA_API struct llama_sampler * llama_sampler_init_adaptive_p(
                               float   target,
                               float   decay,
                            uint32_t   seed);

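    // For example (an illustrative sketch; the parameter values are arbitrary but within
    // the documented ranges):
    //
    //    llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
    //
    //    llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.05f, 1)); // mild truncation first
    //    llama_sampler_chain_add(smpl, llama_sampler_init_adaptive_p(0.30f, 0.90f, LLAMA_DEFAULT_SEED)); // must be last
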
    LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
                             int32_t   n_vocab,
                             int32_t   n_logit_bias,
              const llama_logit_bias * logit_bias);

    // this sampler is meant to be used for fill-in-the-middle infilling
    // it's supposed to be used after top_k + top_p sampling
    //
    // 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG
    // 2. combine probs of tokens that have the same prefix
    //
    // example:
    //
    // - before:
    //   "hel":   0.5
    //   "hell":  0.2
    //   "hello": 0.1
    //   "dummy": 0.1
    //
    // - after:
    //   "hel":   0.8
    //   "dummy": 0.1
    //
    // 3. discard non-EOG tokens with low prob
    // 4. if no tokens are left -> pick EOT
    //
    LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab);

    // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
    LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);

    /// @details Sample and accept a token from the idx-th output of the last evaluation
    //
    // Shorthand for:
    //    const auto * logits = llama_get_logits_ith(ctx, idx);
    //    llama_token_data_array cur_p = { ... init from logits ... };
    //    llama_sampler_apply(smpl, &cur_p);
    //    auto token = cur_p.data[cur_p.selected].id;
    //    llama_sampler_accept(smpl, token);
    //    return token;
    // Returns the sampled token
    LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx);

    // TODO: extend in the future
    //LLAMA_API void llama_decode_with_sampler(struct llama_context * ctx, struct llama_sampler * smpl, struct llama_batch batch, ...);

    //
    // Model split
    //

    /// @details Build a split GGUF final path for this chunk.
    ///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
    //  Returns the split_path length.
    LLAMA_API int32_t llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int32_t split_no, int32_t split_count);

    /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
    ///          llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
    //  Returns the split_prefix length.
    LLAMA_API int32_t llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int32_t split_no, int32_t split_count);

    // Print system information
    LLAMA_API const char * llama_print_system_info(void);

    // Set callback for all future logging events.
    // If this is not called, or NULL is supplied, everything is output on stderr.
    // The logger state is global, so these functions are NOT thread-safe.
    LLAMA_API void llama_log_get(ggml_log_callback * log_callback, void ** user_data);
    LLAMA_API void llama_log_set(ggml_log_callback   log_callback, void *  user_data);

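    // A minimal custom logger might look like this (an illustrative sketch; it assumes
    // the ggml_log_callback signature from ggml.h):
    //
    //    static void my_log(enum ggml_log_level level, const char * text, void * user_data) {
    //        fputs(text, (FILE *) user_data);
    //    }
    //
    //    llama_log_set(my_log, stderr);
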
    //
    // Performance utils
    //
    // NOTE: Used by llama.cpp examples/tools, avoid using in third-party apps. Instead, do your own performance measurements.
    //

    struct llama_perf_context_data {
        // ms == milliseconds
        double t_start_ms;  // absolute start time
        double t_load_ms;   // time needed for loading the model
        double t_p_eval_ms; // time needed for processing the prompt
        double t_eval_ms;   // time needed for generating tokens

        int32_t n_p_eval;   // number of prompt tokens
        int32_t n_eval;     // number of generated tokens
        int32_t n_reused;   // number of times a ggml compute graph has been reused
    };

    struct llama_perf_sampler_data {
        double t_sample_ms; // time needed for sampling in ms

        int32_t n_sample;   // number of sampled tokens
    };

    LLAMA_API struct llama_perf_context_data llama_perf_context      (const struct llama_context * ctx);
    LLAMA_API void                           llama_perf_context_print(const struct llama_context * ctx);
    LLAMA_API void                           llama_perf_context_reset(      struct llama_context * ctx);

    // NOTE: the following work only with samplers constructed via llama_sampler_chain_init
    LLAMA_API struct llama_perf_sampler_data llama_perf_sampler      (const struct llama_sampler * chain);
    LLAMA_API void                           llama_perf_sampler_print(const struct llama_sampler * chain);
    LLAMA_API void                           llama_perf_sampler_reset(      struct llama_sampler * chain);

    // print a breakdown of per-device memory use via LLAMA_LOG:
    LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx);

    //
    // training
    //

    // function that returns whether or not a given tensor contains trainable parameters
    typedef bool (*llama_opt_param_filter)(const struct ggml_tensor * tensor, void * userdata);

    // always returns true
    LLAMA_API bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata);

    struct llama_opt_params {
        uint32_t n_ctx_train; // assumed context size after training; if 0, the context size specified in llama_context is used

        llama_opt_param_filter param_filter; // callback for determining which tensors contain trainable parameters
        void * param_filter_ud;              // userdata for determining which tensors contain trainable parameters

        ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
        void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters

        enum ggml_opt_optimizer_type optimizer_type;
    };

    LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);

    LLAMA_API void llama_opt_epoch(
            struct llama_context    * lctx,
            ggml_opt_dataset_t        dataset,
            ggml_opt_result_t         result_train,
            ggml_opt_result_t         result_eval,
            int64_t                   idata_split,
            ggml_opt_epoch_callback   callback_train,
            ggml_opt_epoch_callback   callback_eval);

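    // Setting up a training run might look roughly like this (an illustrative sketch;
    // ggml_opt_get_default_optimizer_params and GGML_OPT_OPTIMIZER_TYPE_ADAMW are assumed
    // to be the names provided by ggml-opt.h in your version):
    //
    //    struct llama_opt_params params = {
    //        /*.n_ctx_train     =*/ 0, // use the context size from the llama_context
    //        /*.param_filter    =*/ llama_opt_param_filter_all,
    //        /*.param_filter_ud =*/ NULL,
    //        /*.get_opt_pars    =*/ ggml_opt_get_default_optimizer_params,
    //        /*.get_opt_pars_ud =*/ NULL,
    //        /*.optimizer_type  =*/ GGML_OPT_OPTIMIZER_TYPE_ADAMW,
    //    };
    //
    //    llama_opt_init(lctx, model, params);
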
#ifdef __cplusplus
}
#endif

#endif // LLAMA_H