path: root/llama.cpp/tools/completion
author     Mitja Felicijan <mitja.felicijan@gmail.com>  2026-02-12 20:57:17 +0100
committer  Mitja Felicijan <mitja.felicijan@gmail.com>  2026-02-12 20:57:17 +0100
commit     b333b06772c89d96aacb5490d6a219fba7c09cc6 (patch)
tree       211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/tools/completion
download   llmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz
Engage!
Diffstat (limited to 'llama.cpp/tools/completion')
-rw-r--r--  llama.cpp/tools/completion/CMakeLists.txt     8
-rw-r--r--  llama.cpp/tools/completion/README.md        578
-rw-r--r--  llama.cpp/tools/completion/completion.cpp  1001
3 files changed, 1587 insertions, 0 deletions
diff --git a/llama.cpp/tools/completion/CMakeLists.txt b/llama.cpp/tools/completion/CMakeLists.txt
new file mode 100644
index 0000000..126ae6a
--- /dev/null
+++ b/llama.cpp/tools/completion/CMakeLists.txt
@@ -0,0 +1,8 @@
set(TARGET llama-completion)
add_executable(${TARGET} completion.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

if(LLAMA_TOOLS_INSTALL)
    install(TARGETS ${TARGET} RUNTIME)
endif()
diff --git a/llama.cpp/tools/completion/README.md b/llama.cpp/tools/completion/README.md
new file mode 100644
index 0000000..3ca3e68
--- /dev/null
+++ b/llama.cpp/tools/completion/README.md
@@ -0,0 +1,578 @@
# llama.cpp/tools/completion

This example program allows you to use various LLaMA language models easily and efficiently. It is specifically designed to work with the [llama.cpp](https://github.com/ggml-org/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts.

## Table of Contents

1. [Quick Start](#quick-start)
2. [Usage](#usage)
3. [Common Options](#common-options)
4. [Input Prompts](#input-prompts)
5. [Interaction](#interaction)
6. [Context Management](#context-management)
7. [Generation Flags](#generation-flags)
8. [Performance Tuning and Memory Options](#performance-tuning-and-memory-options)
9. [Additional Options](#additional-options)

## Quick Start

To get started right away, run the following commands, making sure to use the correct path for the model you have:

First, we will need to download a model. In these examples, we will use the Gemma model from the ggml-org repo on Hugging Face.
[https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true](https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true)

Once downloaded, place your model in the models folder in llama.cpp.

### Unix-based systems (Linux, macOS, etc.):

##### Input prompt (One-and-done)

```bash
./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf -no-cnv --prompt "Once upon a time"
```

##### Conversation mode (Allow for continuous interaction with the model)

```bash
./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf --chat-template gemma
```

##### Conversation mode using built-in jinja chat template

```bash
./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf --jinja
```

##### One-and-done query using jinja with custom system prompt and a starting prompt

```bash
./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf --jinja --single-turn -sys "You are a helpful assistant" -p "Hello"
```

##### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it):

```bash
./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
```

### Windows:

##### Input prompt (One-and-done)

```powershell
./llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf -no-cnv --prompt "Once upon a time"
```

##### Conversation mode (Allow for continuous interaction with the model)

```powershell
./llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --chat-template gemma
```

##### Conversation mode using built-in jinja chat template

```powershell
./llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --jinja
```

##### One-and-done query using jinja with custom system prompt and a starting prompt

```powershell
./llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --jinja --single-turn -sys "You are a helpful assistant" -p "Hello"
```

##### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it):

```powershell
./llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
```

## Usage

<!-- HELP_START -->

<!-- IMPORTANT: The list below is auto-generated by llama-gen-docs; do NOT modify it manually -->

### Common params

| Argument | Explanation |
| -------- | ----------- |
| `-h, --help, --usage` | print usage and exit |
| `--version` | show version and build info |
| `--license` | show source code license and dependencies |
| `-cl, --cache-list` | show list of models in cache |
| `--completion-bash` | print source-able bash completion script for llama.cpp |
| `--verbose-prompt` | print a verbose prompt before generation (default: false) |
| `-t, --threads N` | number of CPU threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") |
| `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask |
| `--cpu-strict <0\|1>` | use strict CPU placement (default: 0) |
| `--prio N` | set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: 0) |
| `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50) |
| `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) |
| `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch |
| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) |
| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0) |
| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
| `--swa-full` | use full-size SWA cache (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)<br/>(env: LLAMA_ARG_SWA_FULL) |
| `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')<br/>(env: LLAMA_ARG_FLASH_ATTN) |
| `-p, --prompt PROMPT` | prompt to start generation with; for system message, use -sys |
| `--perf, --no-perf` | whether to enable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_PERF) |
| `-f, --file FNAME` | a file containing the prompt (default: none) |
| `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) |
| `-e, --escape, --no-escape` | whether to process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model<br/>(env: LLAMA_ARG_ROPE_SCALING_TYPE) |
| `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N<br/>(env: LLAMA_ARG_ROPE_SCALE) |
| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)<br/>(env: LLAMA_ARG_ROPE_FREQ_BASE) |
| `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N<br/>(env: LLAMA_ARG_ROPE_FREQ_SCALE) |
| `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size)<br/>(env: LLAMA_ARG_YARN_ORIG_CTX) |
| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.00, 0.0 = full interpolation)<br/>(env: LLAMA_ARG_YARN_EXT_FACTOR) |
| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.00)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.00)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.00)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
| `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)<br/>(env: LLAMA_ARG_KV_OFFLOAD) |
| `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)<br/>(env: LLAMA_ARG_REPACK) |
| `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_NO_HOST) |
| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
| `--mmap, --no-mmap` | whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. Takes precedence over --mmap (default: enabled)<br/>(env: LLAMA_ARG_DIO) |
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
| `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
| `--list-devices` | print list of available devices and exit |
| `-ot, --override-tensor <tensor name pattern>=<buffer type>,...` | override tensor buffer type<br/>(env: LLAMA_ARG_OVERRIDE_TENSOR) |
| `-cmoe, --cpu-moe` | keep all Mixture of Experts (MoE) weights in the CPU<br/>(env: LLAMA_ARG_CPU_MOE) |
| `-ncmoe, --n-cpu-moe N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU<br/>(env: LLAMA_ARG_N_CPU_MOE) |
| `-ngl, --gpu-layers, --n-gpu-layers N` | max. number of layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>(env: LLAMA_ARG_SPLIT_MODE) |
| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1<br/>(env: LLAMA_ARG_TENSOR_SPLIT) |
| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
| `-fit, --fit [on\|off]` | whether to adjust unset arguments to fit in device memory ('on' or 'off', default: 'on')<br/>(env: LLAMA_ARG_FIT) |
| `-fitt, --fit-target MiB0,MiB1,MiB2,...` | target margin per device for --fit, comma-separated list of values, single value is broadcast across all devices, default: 1024<br/>(env: LLAMA_ARG_FIT_TARGET) |
| `-fitc, --fit-ctx N` | minimum ctx size that can be set by --fit option, default: 4096<br/>(env: LLAMA_ARG_FIT_CTX) |
| `--check-tensors` | check model tensor data for invalid values (default: false) |
| `--override-kv KEY=TYPE:VALUE,...` | advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated values.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false |
| `--op-offload, --no-op-offload` | whether to offload host tensor operations to device (default: true) |
| `--lora FNAME` | path to LoRA adapter (use comma-separated values to load multiple adapters) |
| `--lora-scaled FNAME:SCALE,...` | path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)<br/>note: use comma-separated values |
| `--control-vector FNAME` | add a control vector<br/>note: use comma-separated values to add multiple control vectors |
| `--control-vector-scaled FNAME:SCALE,...` | add a control vector with user defined scaling SCALE<br/>note: use comma-separated values (format: FNAME:SCALE,...) |
| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
| `-m, --model FNAME` | model path to load<br/>(env: LLAMA_ARG_MODEL) |
| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
| `-dr, --docker-repo [<repo>/]<model>[:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.<br/>example: gemma3<br/>(default: unused)<br/>(env: LLAMA_ARG_DOCKER_REPO) |
| `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.<br/>mmproj is also downloaded automatically if available. to disable, add --no-mmproj<br/>example: unsloth/phi-4-GGUF:q4_k_m<br/>(default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
| `-hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_HFD_REPO) |
| `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
| `-hfv, -hfrv, --hf-repo-v <user>/<model>[:quant]` | Hugging Face model repository for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_REPO_V) |
| `-hffv, --hf-file-v FILE` | Hugging Face model file for the vocoder model (default: unused)<br/>(env: LLAMA_ARG_HF_FILE_V) |
| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)<br/>(env: HF_TOKEN) |
| `--log-disable` | Log disable |
| `--log-file FNAME` | Log to file<br/>(env: LLAMA_LOG_FILE) |
| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')<br/>'auto' enables colors when output is to a terminal<br/>(env: LLAMA_LOG_COLORS) |
| `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) |
| `--offline` | Offline mode: forces use of cache, prevents network access<br/>(env: LLAMA_OFFLINE) |
| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:<br/> - 0: generic output<br/> - 1: error<br/> - 2: warning<br/> - 3: info<br/> - 4: debug<br/>(default: 3)<br/><br/>(env: LLAMA_LOG_VERBOSITY) |
| `--log-prefix` | Enable prefix in log messages<br/>(env: LLAMA_LOG_PREFIX) |
| `--log-timestamps` | Enable timestamps in log messages<br/>(env: LLAMA_LOG_TIMESTAMPS) |
| `-ctkd, --cache-type-k-draft TYPE` | KV cache data type for K for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K_DRAFT) |
| `-ctvd, --cache-type-v-draft TYPE` | KV cache data type for V for the draft model<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V_DRAFT) |


### Sampling params

| Argument | Explanation |
| -------- | ----------- |
| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: penalties;dry;top_n_sigma;top_k;typ_p;top_p;min_p;xtc;temperature) |
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
| `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) |
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
| `--temp N` | temperature (default: 0.80) |
| `--top-k N` | top-k sampling (default: 40, 0 = disabled)<br/>(env: LLAMA_ARG_TOP_K) |
| `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) |
| `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) |
| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) |
| `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) |
| `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) |
| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) |
| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) |
| `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) |
| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.00, 0.0 = disabled) |
| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.00, 0.0 = disabled) |
| `--dry-base N` | set DRY sampling base value (default: 1.75) |
| `--dry-allowed-length N` | set allowed length for DRY sampling (default: 2) |
| `--dry-penalty-last-n N` | set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) |
| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers ('\n', ':', '"', '*') in the process; use "none" to not use any sequence breakers |
| `--adaptive-target N` | adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) (default: -1.00)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927) |
| `--adaptive-decay N` | adaptive-p: decay rate for target adaptation over time. lower values are more reactive, higher values are more stable.<br/>(valid range 0.0 to 0.99) (default: 0.90) |
| `--dynatemp-range N` | dynamic temperature range (default: 0.00, 0.0 = disabled) |
| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.00) |
| `--mirostat N` | use Mirostat sampling.<br/>Top K, Nucleus and Locally Typical samplers are ignored if used.<br/>(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) |
| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.10) |
| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.00) |
| `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,<br/>i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',<br/>or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' |
| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') |
| `--grammar-file FNAME` | file to read grammar from |
| `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
| `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
| `-bs, --backend-sampling` | enable backend sampling (experimental) (default: disabled)<br/>(env: LLAMA_ARG_BACKEND_SAMPLING) |


### Completion-specific params

| Argument | Explanation |
| -------- | ----------- |
| `--display-prompt, --no-display-prompt` | whether to print prompt at generation (default: true) |
| `-co, --color [on\|off\|auto]` | Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')<br/>'auto' enables colors when output is to a terminal |
| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
| `-sys, --system-prompt PROMPT` | system prompt to use with model (if applicable, depending on chat template) |
| `-sysf, --system-prompt-file FNAME` | a file containing the system prompt (default: none) |
| `-ptc, --print-token-count N` | print token count every N tokens (default: -1) |
| `--prompt-cache FNAME` | file to cache prompt state for faster startup (default: none) |
| `--prompt-cache-all` | if specified, saves user input and generations to cache as well |
| `--prompt-cache-ro` | if specified, uses the prompt cache but does not update it |
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode |
| `-sp, --special` | special tokens output enabled (default: false) |
| `-cnv, --conversation, -no-cnv, --no-conversation` | whether to run in conversation mode:<br/>- does not print special tokens and suffix/prefix<br/>- interactive mode is also enabled<br/>(default: auto enabled if chat template is available) |
| `-st, --single-turn` | run conversation for a single turn only, then exit when done<br/>will not be interactive if first turn is predefined with --prompt<br/>(default: false) |
| `-i, --interactive` | run in interactive mode (default: false) |
| `-if, --interactive-first` | run in interactive mode and wait for input right away (default: false) |
| `-mli, --multiline-input` | allows you to write or paste multiple lines without ending each in '\' |
| `--in-prefix-bos` | prefix BOS to user inputs, preceding the `--in-prefix` string |
| `--in-prefix STRING` | string to prefix user inputs with (default: empty) |
| `--in-suffix STRING` | string to suffix after user inputs with (default: empty) |
| `--warmup, --no-warmup` | whether to perform warmup with an empty run (default: enabled) |
| `-gan, --grp-attn-n N` | group-attention factor (default: 1)<br/>(env: LLAMA_ARG_GRP_ATTN_N) |
| `-gaw, --grp-attn-w N` | group-attention width (default: 512)<br/>(env: LLAMA_ARG_GRP_ATTN_W) |
| `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
| `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |

<!-- HELP_END -->

## Common Options

In this section, we cover the most commonly used options for running the `llama-completion` program with the LLaMA models:

- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/gemma-1.1-7b-it.Q4_K_M.gguf`; inferred from `--model-url` if set).
- `-mu MODEL_URL, --model-url MODEL_URL`: Specify a remote HTTP URL to download the file (e.g., [https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true](https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true)).
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 0, which loads the context size from the model; if a LLaMA model was built with a longer context, increasing this value will provide better results for longer input/inference.
- `-mli, --multiline-input`: Allows you to write or paste multiple lines without ending each in '\'.
- `-t N, --threads N`: Set the number of threads to use during generation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has.
- `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.

## Input Prompts

The `llama-completion` program provides several ways to interact with the LLaMA models using input prompts:

- `--prompt PROMPT`: Provide a prompt directly as a command-line option.
- `--file FNAME`: Provide a file containing a prompt or multiple prompts.
- `--system-prompt PROMPT`: Provide a system prompt (will otherwise use the default one in the chat template, if provided).
- `--system-prompt-file FNAME`: Provide a file containing a system prompt.
- `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)

## Interaction

The `llama-completion` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive` and `--interactive-first`.

In interactive mode, users can participate in text generation by injecting their input during the process. Users can press `Ctrl+C` at any time to interject and type their input, followed by pressing `Return` to submit it to the LLaMA model. To submit additional lines without finalizing input, users can end the current line with a backslash (`\`) and continue typing.

### Interaction Options

- `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model.
- `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
- `-cnv, --conversation`: Run the program in conversation mode: special tokens and the suffix/prefix are not printed, and the default or a user-provided chat template is used (default: enabled if a chat template is found).
- `-no-cnv`: Disable conversation mode (default: false).
- `-st, --single-turn`: Only process a single conversation turn (user input) and then exit.
- `--jinja`: Enable the jinja chat template parser; uses the model's built-in template or a user-provided one (default: false).
- `--color`: Enable colorized output to visually distinguish between prompts, user input, and generated text.

By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
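
For example, a minimal interactive session that waits for your input immediately and colorizes output (using the Quick Start model path):

```bash
./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf --interactive-first --color
```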

### Reverse Prompts

Reverse prompts are a powerful way to create a chat-like experience with a LLaMA model by pausing the text generation when specific text strings are encountered:

- `-r PROMPT, --reverse-prompt PROMPT`: Specify one or multiple reverse prompts to pause text generation and switch to interactive mode. For example, `-r "User:"` can be used to jump back into the conversation whenever it's the user's turn to speak. This helps create a more interactive and conversational experience. However, the reverse prompt doesn't work when it ends with a space.

To overcome this limitation, you can use the `--in-prefix` flag to add a space or any other characters after the reverse prompt.

### In-Prefix

The `--in-prefix` flag is used to add a prefix to your input; primarily, this is used to insert a space after the reverse prompt. Here's an example of how to use the `--in-prefix` flag in conjunction with the `--reverse-prompt` flag:

```sh
./llama-completion -r "User:" --in-prefix " "
```

### In-Suffix

The `--in-suffix` flag is used to add a suffix after your input. This is useful for adding an "Assistant:" prompt after the user's input. It's added after the new-line character (`\n`) that's automatically added to the end of the user's input. Here's an example of how to use the `--in-suffix` flag in conjunction with the `--reverse-prompt` flag:

```sh
./llama-completion -r "User:" --in-prefix " " --in-suffix "Assistant:"
```

When the `--in-prefix` or `--in-suffix` options are enabled, the chat template (`--chat-template`) is disabled.

### Chat templates

`--chat-template JINJA_TEMPLATE`: This option sets a custom jinja chat template. It accepts a string, not a file name. Default: template taken from the model's metadata. Llama.cpp only supports [some pre-defined templates](https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template). These include llama2, llama3, gemma, monarch, chatml, orion, vicuna, vicuna-orca, deepseek, command-r, zephyr. When the `--in-prefix` or `--in-suffix` options are enabled, the chat template (`--chat-template`) is disabled.

Example usage: `--chat-template gemma`

`--chat-template-file FNAME`: Load a custom jinja chat template from an external file. This is useful if the model contains an outdated or incompatible template; some examples can be found in models/templates. Up-to-date chat templates can be downloaded from Hugging Face using scripts/get_chat_template.py.
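
A sketch of that workflow (the Hugging Face model id and output file name are illustrative):

```bash
# fetch an up-to-date chat template and save it to a file (illustrative model id)
python scripts/get_chat_template.py meta-llama/Llama-3.1-8B-Instruct > my-template.jinja

# run with the custom template file
./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf --jinja --chat-template-file my-template.jinja
```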

## Context Management

During text generation, LLaMA models have a limited context size, which means they can only consider a certain number of tokens from the input and generated text. When the context fills up, the model resets internally, potentially losing some information from the beginning of the conversation or instructions. Context management options help maintain continuity and coherence in these situations.

### Context Size

- `-c N, --ctx-size N`: Set the size of the prompt context (default: 0, 0 = loaded from model). If a LLaMA model was built with a longer context, increasing this value will yield the best results on longer input/inference.

### Extended Context Size

Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model has a context length (max sequence length) of 4096 (4k) and the fine-tuned model has 32k, that is a scaling factor of 8, and it should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.

- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
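
For the hypothetical 4k-to-32k fine-tune described above, the invocation would look like this (the model path is illustrative):

```bash
./llama-completion -m models/your-32k-finetune.gguf -c 32768 --rope-scale 8
```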

### Keep Prompt

The `--keep` option allows users to retain the original prompt when the model runs out of context, ensuring a connection to the initial instruction or conversation topic is maintained.

- `--keep N`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.

By utilizing context management options like `--ctx-size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation.
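
A sketch combining these options: generate indefinitely with context shifting enabled while keeping the entire initial prompt anchored in context:

```bash
./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf -no-cnv --context-shift --keep -1 -n -1 -p "Once upon a time"
```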

## Generation Flags

The following options allow you to control the text generation process and fine-tune the diversity, creativity, and quality of the generated text according to your needs. By adjusting these options and experimenting with different combinations of values, you can find the best settings for your specific use case.

### Number of Tokens to Predict

- `-n N, --predict N`: Set the number of tokens to predict when generating text (default: -1, -1 = infinity, -2 = until context filled).

The `--predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text.

A value of -1 will enable infinite text generation, even though we have a finite context window. When the context window is full, some of the earlier tokens (half of the tokens after `--keep`) will be discarded. The context must then be re-evaluated before generation can resume. On large models and/or large context windows, this will result in a significant pause in output.

If the pause is undesirable, a value of -2 will stop generation immediately when the context is filled.

The `--no-context-shift` option allows you to stop the infinite text generation once the finite context window is full (note that context shifting is disabled by default; see `--context-shift` in the options table).

It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode, text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `--predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.
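
Example usage: `-n 256` (cap the completion at 256 tokens) or `-n -2` (generate until the context is filled).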

### Temperature

- `--temp N`: Adjust the randomness of the generated text (default: 0.8).

Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.

Example usage: `--temp 0`

### Repeat Penalty

- `--repeat-penalty N`: Control the repetition of token sequences in the generated text (default: 1.0, 1.0 = disabled).
- `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).

The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.

The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx-size`).
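
Example usage: `--repeat-penalty 1.1 --repeat-last-n 256` (an illustrative, mild starting point rather than a tuned recommendation).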

### DRY Repetition Penalty

DRY (Don't Repeat Yourself) sampling is an effective technique for reducing repetition in generated text, even across long contexts, by penalizing tokens based on their recent usage patterns (original [PR link](https://github.com/oobabooga/text-generation-webui/pull/5677)).

- `--dry-multiplier N`: Set the DRY sampling multiplier (default: 0.0, 0.0 = disabled).
- `--dry-base N`: Set the DRY sampling base value (default: 1.75).
- `--dry-allowed-length N`: Set the allowed length for DRY sampling (default: 2).
- `--dry-penalty-last-n N`: Set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size).
- `--dry-sequence-breaker STRING`: Add a sequence breaker for DRY sampling. Can be used more than once to add multiple sequence breakers. Using this clears out the default breakers, which consist of: `['\n', ':', '"', '*']`. If the string `"none"` is supplied, no sequence breakers are used.

The `dry-multiplier` option controls the strength of the DRY sampling effect. A value of 0.0 disables DRY sampling, while higher values increase its influence. A typical recommended value is 0.8.

The `dry-base` option sets the base value for the exponential penalty calculation in DRY sampling. Higher values lead to more aggressive penalization of repetitions.

The `dry-allowed-length` option sets the maximum length of repeated sequences that will not be penalized. Repetitions shorter than or equal to this length are not penalized, allowing for natural repetitions of short phrases or common words.

The `dry-penalty-last-n` option controls how many recent tokens to consider when applying the DRY penalty. A value of -1 considers the entire context. Use a positive value to limit the consideration to a specific number of recent tokens.

The `dry-sequence-breaker` option adds a single sequence breaker and can be used more than once to specify multiple sequence breakers. Sequence breakers interrupt sequence matching and break the input into parts where matching can be applied.

DRY sampling provides more nuanced control over text generation, particularly for reducing long-range repetitions and maintaining global coherence.

Example usage: `--dry-multiplier 0.8 --dry-base 1.75 --dry-allowed-length 2 --dry-penalty-last-n -1 --dry-sequence-breaker "—" --dry-sequence-breaker "##"`

### Top-K Sampling

- `--top-k N`: Limit the next token selection to the K most probable tokens (default: 40).

Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top-k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text. The default value is 40.

Example usage: `--top-k 30`

### Top-P Sampling

- `--top-p N`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.95).

Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top-p (e.g., 0.98) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. The default value is 0.95.

Example usage: `--top-p 0.95`

### Min-P Sampling

- `--min-p N`: Sets a minimum base probability threshold for token selection (default: 0.05).

The Min-P sampling method was designed as an alternative to Top-P, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, tokens with a probability less than 0.045 are filtered out.

Example usage: `--min-p 0.05`

### Adaptive-P Sampling

- `--adaptive-target N`: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
- `--adaptive-decay N`: EMA decay for adaptation; history ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99)

Adaptive-P: Select tokens near a configurable target probability over time.

The adaptive-p sampler transforms the token probability distribution to favor tokens that fall near a user-configurable probability target. Internally, the sampler maintains an exponential moving average (EMA) of the *original* probabilities of selected tokens at each sampling step. It uses this EMA to compute an adapted target probability at each sampling step, thus maintaining the desired target probability over time. Only mild truncation before this sampler is recommended; it is suggested to apply min-p as the only other active sampler, placed before adaptive-p.

Recommended starting values: `--adaptive-target 0.55 --adaptive-decay 0.9`

For more info, refer to: [llama.cpp#17927](https://github.com/ggml-org/llama.cpp/pull/17927)

### Locally Typical Sampling

- `--typical N`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled).

Locally typical sampling promotes the generation of contextually coherent and diverse text by sampling tokens that are typical or expected based on the surrounding context. By setting the parameter p between 0 and 1, you can control the balance between producing text that is locally coherent and diverse. A value closer to 1 will promote more contextually coherent tokens, while a value closer to 0 will promote more diverse tokens. A value equal to 1 disables locally typical sampling.

Example usage: `--typical 0.9`

### Mirostat Sampling

- `--mirostat N`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0).
- `--mirostat-lr N`: Set the Mirostat learning rate, parameter eta (default: 0.1).
- `--mirostat-ent N`: Set the Mirostat target entropy, parameter tau (default: 5.0).

Mirostat is an algorithm that actively maintains the quality of generated text within a desired range during text generation. It aims to strike a balance between coherence and diversity, avoiding low-quality output caused by excessive repetition (boredom traps) or incoherence (confusion traps).

The `--mirostat-lr` option sets the Mirostat learning rate (eta). The learning rate influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. The default value is `0.1`.

The `--mirostat-ent` option sets the Mirostat target entropy (tau), which represents the desired perplexity value for the generated text. Adjusting the target entropy allows you to control the balance between coherence and diversity in the generated text. A lower value will result in more focused and coherent text, while a higher value will lead to more diverse and potentially less coherent text. The default value is `5.0`.

Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0`

### XTC Sampling

- `--xtc-probability N`: Sets the chance for token removal (checked once on sampler start) (default: 0.0).
- `--xtc-threshold N`: Sets a minimum probability threshold for tokens to be removed (default: 0.1).

Exclude Top Choices (XTC) is a unique sampler that is designed to remove top tokens from consideration and avoid more obvious and repetitive outputs. With a chance of `xtc-probability` it searches for tokens with probabilities of `xtc-threshold` and above, then removes all such tokens except the least probable one.

By removing top tokens, XTC can improve the variety of answers, break writing clichés and inhibit repetition, since clichés and repeated phrases are usually more likely to appear. By keeping the last token above the threshold, XTC ensures that the answer is still coherent. XTC is meant to be used for creative tasks, but feel free to experiment with different settings for different models.

Being experimental and unique, XTC is disabled by default. The recommended combination of samplers is Min-P followed by XTC on its default settings: `--sampling-seq mx --min-p 0.02 --xtc-probability 0.5`.

Example usage: `--xtc-probability 0.5 --xtc-threshold 0.1`

### Top-nσ Sampling

- `--top-nsigma N`: Limit the next token selection to a subset of tokens with pre-softmax logits that are within n * σ less than the max logit (default: -1, -1 = disabled).

Top-nσ sampling is a text generation method that selects tokens based on a statistical threshold in pre-softmax logits. It works by only sampling from tokens with logits that are within n * σ of the maximum logit. This method helps maintain a stable sampling space regardless of temperature scaling, allowing it to perform well on reasoning tasks even at high temperatures. Without complex probability manipulation, it efficiently filters tokens directly on the pre-softmax logits. A higher value for top-nsigma (e.g., 5) will take more noisy tokens into consideration, while a lower value (e.g., 1) will focus on the more informative region of the sampling space.

Example usage: `--top-nsigma 1`

### Logit Bias

- `-l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS`: Modify the likelihood of a token appearing in the generated text completion.

The logit bias option allows you to manually adjust the likelihood of specific tokens appearing in the generated text. By providing a token ID and a positive or negative bias value, you can increase or decrease the probability of that token being generated.

For example, use `--logit-bias 15043+1` to increase the likelihood of the token ' Hello', or `--logit-bias 15043-1` to decrease its likelihood. Using a value of negative infinity, `--logit-bias 15043-inf`, ensures that the token ' Hello' is never produced.

A more practical use case might be to prevent the generation of `\code{begin}` and `\code{end}` by setting the `\` token (29905) to negative infinity with `-l 29905-inf`. (This is due to the prevalence of LaTeX codes that show up in LLaMA model inference.)

Example usage: `--logit-bias 29905-inf`

### RNG Seed

- `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1, -1 = random seed).

The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than 0, a random seed will be used, which will result in different outputs on each run.
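
Example usage: `--seed 42` (two runs with the same seed, prompt, and sampling settings should produce the same output).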

## Performance Tuning and Memory Options

These options help improve the performance and memory usage of the LLaMA models. By adjusting these settings, you can fine-tune the model's behavior to better suit your system's capabilities and achieve optimal performance for your specific use case.

### Number of Threads

- `-t N, --threads N`: Set the number of threads to use during generation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has (as opposed to the logical number of cores). Using the correct number of threads can greatly improve performance.
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. In some systems, it is beneficial to use a higher number of threads during batch processing than during generation. If not specified, the number of threads used for batch processing will be the same as the number of threads used for generation.
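
Example usage: `-t 8 -tb 16` (an illustrative setting for a hypothetical machine with 8 physical cores and 16 hardware threads).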

### Mlock

- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. This can improve performance but trades away some of the advantages of memory-mapping by requiring more RAM to run and potentially slowing down load times as the model loads into RAM.

### No Memory Mapping

- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance. Disabling mmap results in slower load times but may reduce pageouts if you're not using `--mlock`. Note that if the model is larger than the total amount of RAM, turning off mmap would prevent the model from loading at all.

### NUMA support

- `--numa distribute`: Pin an equal proportion of the threads to the cores on each NUMA node. This will spread the load amongst all cores on the system, utilizing all memory channels at the expense of potentially requiring memory to travel over the slow links between nodes.
- `--numa isolate`: Pin all threads to the NUMA node that the program starts on. This limits the number of cores and amount of memory that can be used, but guarantees all memory access remains local to the NUMA node.
- `--numa numactl`: Pin threads to the CPUMAP that is passed to the program by starting it with the numactl utility. This is the most flexible mode, and allows arbitrary core usage patterns, for example a map that uses all the cores on one NUMA node, and just enough cores on a second node to saturate the inter-node memory bus.

These flags attempt optimizations that help on some systems with non-uniform memory access. This currently consists of one of the above strategies, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.
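
A sketch of the numactl variant (node numbers are illustrative; adjust to your topology):

```bash
# bind execution and memory allocation to NUMA node 0, then let llama-completion
# pin its threads to the CPU map that numactl provides
numactl --cpunodebind=0 --membind=0 ./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf --numa numactl -no-cnv -p "Once upon a time"
```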

### Batch Size

- `-ub N`, `--ubatch-size N`: Physical batch size. This is the maximum number of tokens that may be processed at a time. Increasing this value may improve performance during prompt processing, at the expense of higher memory usage. Default: `512`.

- `-b N`, `--batch-size N`: Logical batch size. Increasing this value above the value of the physical batch size may improve prompt processing performance when using multiple GPUs with pipeline parallelism. Default: `2048`.
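
Example usage: `-b 4096 -ub 1024` (illustrative values; whether they help depends on your hardware and available memory).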

### Prompt Caching

- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.
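
A sketch of the workflow (file names are illustrative):

```bash
# first run: evaluates the prompt and writes the state to prompt.bin
./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf --prompt-cache prompt.bin -f long-prompt.txt -n 64 -no-cnv

# subsequent runs: reuse the cached state instead of re-evaluating the prompt
./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf --prompt-cache prompt.bin -f long-prompt.txt -n 64 -no-cnv
```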

### Grammars & JSON schemas

- `--grammar GRAMMAR`, `--grammar-file FILE`: Specify a grammar (defined inline or in a file) to constrain model output to a specific format. For example, you could force the model to output JSON or to speak only in emojis. See the [GBNF guide](../../grammars/README.md) for details on the syntax.

- `--json-schema SCHEMA`: Specify a [JSON schema](https://json-schema.org/) to constrain model output to (e.g. `{}` for any JSON object, or `{"items": {"type": "string", "minLength": 10, "maxLength": 100}, "minItems": 10}` for a JSON array of strings with size constraints). If a schema uses external `$ref`s, you should use `--grammar "$( python examples/json_schema_to_grammar.py myschema.json )"` instead.
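
For example, assuming the stock `grammars/json.gbnf` grammar that ships with llama.cpp is available relative to the working directory:

```bash
# constrain the completion to valid JSON
./llama-completion -m models/gemma-1.1-7b-it.Q4_K_M.gguf -no-cnv -p "List three colors as a JSON array: " --grammar-file grammars/json.gbnf
```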

### Quantization

For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-and-quantize).

## LoRA (Low-Rank Adaptation) adapters

- `--lora FNAME`: Optional path to a LoRA adapter to use with scaling of 1.0. Can be mixed with `--lora-scaled` and can be repeated to use multiple adapters.
- `--lora-scaled FNAME:SCALE`: Optional path to a LoRA adapter with user-defined scaling. Can be mixed with `--lora` and can be repeated to use multiple adapters.

You can add LoRA adapters using `--lora` or `--lora-scaled`. For example: `--lora my_adapter_1.gguf --lora my_adapter_2.gguf ...` or `--lora-scaled lora_task_A.gguf:0.5 --lora-scaled lora_task_B.gguf:0.5` (using the FNAME:SCALE format shown in the options table).

LoRA adapters should be in GGUF format. To convert from Hugging Face format use the `convert-lora-to-gguf.py` script. LoRA adapters are loaded separately and applied during inference - they are not merged with the main model. This means that mmap model loading is fully supported when using LoRA adapters. The old `--lora-base` flag has been removed now that merging is no longer performed.
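
A sketch with illustrative model and adapter paths:

```bash
./llama-completion -m models/base-model.gguf --lora my_adapter_1.gguf --lora-scaled lora_task_A.gguf:0.5 -no-cnv -p "Hello"
```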
568
569## Additional Options
570
571These options provide extra functionality and customization when running the LLaMA models:
572
573- `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
574- `--verbose-prompt`: Print the prompt before generating text.
575- `--no-display-prompt`: Don't print the prompt during generation.
576- `-mg i, --main-gpu i`: When using multiple GPUs, this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default, GPU 0 is used.
577- `-ts SPLIT, --tensor-split SPLIT`: When using multiple devices, this option controls how tensors should be split across devices. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each device should get in order. For example, "3,2" will assign 60% of the data to device 0 and 40% to device 1. By default, the data is split in proportion to VRAM, but this may not be optimal for performance. The list of devices in use is printed on startup and can differ from the device list given by `--list-devices` or e.g. `nvidia-smi`.
578- `-hfr URL, --hf-repo URL`: The URL of the Hugging Face model repository. Used in conjunction with `--hf-file` or `-hff`. The model is downloaded and stored in the file provided by `-m` or `--model`. If `-m` is not provided, the model is auto-stored in the path specified by the `LLAMA_CACHE` environment variable or in an OS-specific local cache.
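For example, to download the Gemma model used earlier in this README directly from Hugging Face (repo and file names as in the Quick Start section):

```bash
./llama-completion -hfr ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF -hff gemma-1.1-7b-it.Q4_K_M.gguf \
    -no-cnv --prompt "Once upon a time"
```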
diff --git a/llama.cpp/tools/completion/completion.cpp b/llama.cpp/tools/completion/completion.cpp
new file mode 100644
index 0000000..9771327
--- /dev/null
+++ b/llama.cpp/tools/completion/completion.cpp
@@ -0,0 +1,1001 @@
1#include "arg.h"
2#include "common.h"
3#include "console.h"
4#include "log.h"
5#include "sampling.h"
6#include "llama.h"
7#include "chat.h"
8
9#include <cstdio>
10#include <cstring>
11#include <ctime>
12#include <fstream>
13#include <iostream>
14#include <sstream>
15#include <string>
16#include <vector>
17
18#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
19#include <signal.h>
20#include <unistd.h>
21#elif defined (_WIN32)
22#define WIN32_LEAN_AND_MEAN
23#ifndef NOMINMAX
24#define NOMINMAX
25#endif
26#include <windows.h>
27#include <signal.h>
28#endif
29
30#if defined(_MSC_VER)
31#pragma warning(disable: 4244 4267) // possible loss of data
32#endif
33
34static llama_context ** g_ctx;
35static llama_model ** g_model;
36static common_sampler ** g_smpl;
37static common_params * g_params;
38static std::vector<llama_token> * g_input_tokens;
39static std::ostringstream * g_output_ss;
40static std::vector<llama_token> * g_output_tokens;
41static bool is_interacting = false;
42static bool need_insert_eot = false;
43
44static void print_usage(int argc, char ** argv) {
45 (void) argc;
46
47 LOG("\nexample usage:\n");
48 LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128 -no-cnv\n", argv[0]);
49 LOG("\n chat (conversation): %s -m your_model.gguf -sys \"You are a helpful assistant\"\n", argv[0]);
50 LOG("\n");
51}
52
53static bool file_exists(const std::string & path) {
54 std::ifstream f(path.c_str());
55 return f.good();
56}
57
58static bool file_is_empty(const std::string & path) {
59 std::ifstream f;
60 f.exceptions(std::ifstream::failbit | std::ifstream::badbit);
61 f.open(path.c_str(), std::ios::in | std::ios::binary | std::ios::ate);
62 return f.tellg() == 0;
63}
64
65#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
66static void sigint_handler(int signo) {
67 if (signo == SIGINT) {
68 if (!is_interacting && g_params->interactive) {
69 is_interacting = true;
70 need_insert_eot = true;
71 } else {
72 console::cleanup();
73 LOG("\n");
74 common_perf_print(*g_ctx, *g_smpl);
75
76 // make sure all logs are flushed
77 LOG("Interrupted by user\n");
78 common_log_pause(common_log_main());
79
80 _exit(130);
81 }
82 }
83}
84#endif
85
86int main(int argc, char ** argv) {
87 common_params params;
88 g_params = &params;
89
90 if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMPLETION, print_usage)) {
91 return 1;
92 }
93
94 common_init();
95
96 auto & sparams = params.sampling;
97
98 // save choice to use color for later
99 // (note for later: this is a slightly awkward choice)
100 console::init(params.simple_io, params.use_color);
101 atexit([]() { console::cleanup(); });
102
103 if (params.embedding) {
104 LOG_ERR("************\n");
105 LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
106 LOG_ERR("************\n\n");
107
108 return 0;
109 }
110
111 if (params.n_ctx != 0 && params.n_ctx < 8) {
112 LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
113 params.n_ctx = 8;
114 }
115
116 if (params.rope_freq_base != 0.0) {
117 LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
118 }
119
120 if (params.rope_freq_scale != 0.0) {
121 LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
122 }
123
124 LOG_INF("%s: llama backend init\n", __func__);
125
126 llama_backend_init();
127 llama_numa_init(params.numa);
128
129 llama_model * model = nullptr;
130 llama_context * ctx = nullptr;
131 common_sampler * smpl = nullptr;
132
133 g_model = &model;
134 g_ctx = &ctx;
135 g_smpl = &smpl;
136
137 std::vector<common_chat_msg> chat_msgs;
138
139 // load the model and apply lora adapter, if any
140 LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
141
142 auto llama_init = common_init_from_params(params);
143
144 ctx = llama_init->context();
145 model = llama_init->model();
146 smpl = llama_init->sampler(0);
147
148 if (ctx == NULL) {
149 LOG_ERR("%s: error: unable to create context\n", __func__);
150 return 1;
151 }
152
153 llama_memory_t mem = llama_get_memory(ctx);
154 const llama_vocab * vocab = llama_model_get_vocab(model);
155
156 // note: the time for chat template initialization is not negligible:
157 auto chat_templates = common_chat_templates_init(model, params.chat_template);
158
159 // start measuring performance timings from here
160 llama_perf_context_reset(ctx);
161
162 LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
163
164 auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
165 if (!cpu_dev) {
166 LOG_ERR("%s: no CPU backend found\n", __func__);
167 return 1;
168 }
169 auto * reg = ggml_backend_dev_backend_reg(cpu_dev);
170 auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new");
171 auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free");
172
173 struct ggml_threadpool_params tpp_batch =
174 ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
175 struct ggml_threadpool_params tpp =
176 ggml_threadpool_params_from_cpu_params(params.cpuparams);
177
178 if (!set_process_priority(params.cpuparams.priority)) {
179 LOG_ERR("%s: error: failed to set process priority\n", __func__);
180 return 1;
181 }
182
183 struct ggml_threadpool * threadpool_batch = NULL;
184 if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
185 threadpool_batch = ggml_threadpool_new_fn(&tpp_batch);
186 if (!threadpool_batch) {
187 LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
188 return 1;
189 }
190
191 // start the non-batch threadpool in the paused state
192 tpp.paused = true;
193 }
194
195 struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
196 if (!threadpool) {
197 LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
198 return 1;
199 }
200
201 llama_attach_threadpool(ctx, threadpool, threadpool_batch);
202
203 const int n_ctx_train = llama_model_n_ctx_train(model);
204 const int n_ctx = llama_n_ctx(ctx);
205
206 if (n_ctx > n_ctx_train) {
207 LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
208 }
209
210 // auto enable conversation mode if chat template is available
211 const bool has_chat_template = common_chat_templates_was_explicit(chat_templates.get());
212 if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) {
213 if (has_chat_template) {
214 LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__);
215 params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
216 } else {
217 params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
218 }
219 }
220
221    // in case the user force-activates conversation mode (via -cnv) without a proper chat template, show a warning
222 if (params.conversation_mode && !has_chat_template) {
223 LOG_WRN("%s: chat template is not available or is not supported. This may cause the model to output suboptimal responses\n", __func__);
224 }
225
226 // print chat template example in conversation mode
227 if (params.conversation_mode) {
228 if (params.enable_chat_template) {
229 if (!params.prompt.empty() && params.system_prompt.empty()) {
230 LOG_WRN("*** User-specified prompt will pre-start conversation, did you mean to set --system-prompt (-sys) instead?\n");
231 }
232
233 LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(chat_templates.get(), params.use_jinja, params.default_template_kwargs).c_str());
234 } else {
235 LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
236 }
237 }
238
239 // print system information
240 {
241 LOG_INF("\n");
242 LOG_INF("%s\n", common_params_get_system_info(params).c_str());
243 LOG_INF("\n");
244 }
245
246 std::string path_session = params.path_prompt_cache;
247 std::vector<llama_token> session_tokens;
248
249 if (!path_session.empty()) {
250 LOG_INF("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
251 if (!file_exists(path_session)) {
252 LOG_INF("%s: session file does not exist, will create.\n", __func__);
253 } else if (file_is_empty(path_session)) {
254 LOG_INF("%s: The session file is empty. A new session will be initialized.\n", __func__);
255 } else {
256 // The file exists and is not empty
257 session_tokens.resize(n_ctx);
258 size_t n_token_count_out = 0;
259 if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
260 LOG_ERR("%s: failed to load session file '%s'\n", __func__, path_session.c_str());
261 return 1;
262 }
263 session_tokens.resize(n_token_count_out);
264 LOG_INF("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
265 }
266 }
267
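    // note: jinja chat templates typically emit BOS themselves, so skip the
    // tokenizer's automatic BOS in that case to avoid doubling it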
268 const bool add_bos = llama_vocab_get_add_bos(vocab) && !params.use_jinja;
269 if (!llama_model_has_encoder(model)) {
270 GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
271 }
272
273 LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos);
274
275 std::vector<llama_token> embd_inp;
276
277 bool waiting_for_first_input = false;
278 auto chat_add_and_format = [&chat_msgs, &chat_templates](const std::string & role, const std::string & content) {
279 common_chat_msg new_msg;
280 new_msg.role = role;
281 new_msg.content = content;
282 auto formatted = common_chat_format_single(chat_templates.get(), chat_msgs, new_msg, role == "user", g_params->use_jinja);
283 chat_msgs.push_back(new_msg);
284 LOG_DBG("formatted: '%s'\n", formatted.c_str());
285 return formatted;
286 };
287
288 std::string prompt;
289 {
290 if (params.conversation_mode && params.enable_chat_template) {
291 if (!params.system_prompt.empty()) {
292 // format the system prompt (will use template default if empty)
293 chat_add_and_format("system", params.system_prompt);
294 }
295
296 if (!params.prompt.empty()) {
297 // format and append the user prompt
298 chat_add_and_format("user", params.prompt);
299 } else {
300 waiting_for_first_input = true;
301 }
302
303 if (!params.system_prompt.empty() || !params.prompt.empty()) {
304 common_chat_templates_inputs inputs;
305 inputs.use_jinja = g_params->use_jinja;
306 inputs.messages = chat_msgs;
307 inputs.add_generation_prompt = !params.prompt.empty();
308
309 prompt = common_chat_templates_apply(chat_templates.get(), inputs).prompt;
310 }
311 } else {
312 // otherwise use the prompt as is
313 prompt = params.prompt;
314 }
315
316 if (params.interactive_first || !prompt.empty() || session_tokens.empty()) {
317 LOG_DBG("tokenize the prompt\n");
318 embd_inp = common_tokenize(ctx, prompt, true, true);
319 } else {
320 LOG_DBG("use session tokens\n");
321 embd_inp = session_tokens;
322 }
323
324 LOG_DBG("prompt: \"%s\"\n", prompt.c_str());
325 LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
326 }
327
328 // Should not run without any tokens
329 if (!waiting_for_first_input && embd_inp.empty()) {
330 if (add_bos) {
331 embd_inp.push_back(llama_vocab_bos(vocab));
332 LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
333 } else {
334 LOG_ERR("input is empty\n");
335 return -1;
336 }
337 }
338
339    // make sure the prompt fits in the context window
340 if ((int) embd_inp.size() > n_ctx - 4) {
341 LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
342 return 1;
343 }
344
345 bool session_do_save = false;
346
347 {
348 size_t n_match = 0;
349
350 if (!session_tokens.empty()) {
351 for (llama_token id : session_tokens) {
352 if (n_match >= embd_inp.size() || id != embd_inp[n_match]) {
353 break;
354 }
355 n_match++;
356 }
357 if (params.prompt.empty() && n_match == embd_inp.size()) {
358 LOG_INF("%s: using full prompt from session file\n", __func__);
359 } else if (n_match >= embd_inp.size()) {
360 LOG_INF("%s: session file has exact match for prompt!\n", __func__);
361 } else if (n_match < (embd_inp.size() / 2)) {
362 LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
363 __func__, n_match, embd_inp.size());
364 } else {
365 LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n",
366 __func__, n_match, embd_inp.size());
367 }
368
369 if (session_tokens.size() == n_match) {
370 // [TAG_CONTEXT_STATE_LOGITS]
371 // in this case, we are going to reuse the logits from the session
372 // if we ever decide to remove the logits from the session, we need to handle this somehow
373 // ref: https://github.com/ggml-org/llama.cpp/pull/18862#issuecomment-3756330941
374 }
375
376 // remove any "future" tokens that we might have inherited from the previous session
377 if (session_tokens.size() > n_match) {
378 if (!llama_memory_seq_rm(mem, -1, n_match, -1)) {
379            LOG_WRN("%s: unable to reuse common prefix (for example, when the memory is recurrent)\n", __func__);
380 llama_memory_clear(mem, true);
381 session_tokens.clear();
382 n_match = 0;
383 } else {
384 session_tokens.resize(n_match);
385 }
386 }
387 }
388
389 session_do_save = !path_session.empty() && n_match < embd_inp.size() && !params.prompt_cache_ro;
390 }
391
392 // number of tokens to keep when resetting context
393 if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) {
394 params.n_keep = (int)embd_inp.size();
395 } else {
396 params.n_keep += add_bos; // always keep the BOS token
397 }
398
399 if (params.conversation_mode) {
400 if (params.single_turn && !params.prompt.empty()) {
401 params.interactive = false;
402 params.interactive_first = false;
403 } else {
404 params.interactive_first = true;
405 }
406 }
407
408 // enable interactive mode if interactive start is specified
409 if (params.interactive_first) {
410 params.interactive = true;
411 }
412
413 if (params.verbose_prompt) {
414 LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
415 LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
416 for (int i = 0; i < (int) embd_inp.size(); i++) {
417 LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
418 }
419
420 if (params.n_keep > add_bos) {
421 LOG_INF("%s: static prompt based on n_keep: '", __func__);
422 for (int i = 0; i < params.n_keep; i++) {
423 LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
424 }
425 LOG_CNT("'\n");
426 }
427 LOG_INF("\n");
428 }
429
430 // ctrl+C handling
431 {
432#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
433 struct sigaction sigint_action;
434 sigint_action.sa_handler = sigint_handler;
435 sigemptyset (&sigint_action.sa_mask);
436 sigint_action.sa_flags = 0;
437 sigaction(SIGINT, &sigint_action, NULL);
438#elif defined (_WIN32)
439 auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
440 return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
441 };
442 SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
443#endif
444 }
445
446 if (params.interactive) {
447 LOG_INF("%s: interactive mode on.\n", __func__);
448
449 if (!params.antiprompt.empty()) {
450 for (const auto & antiprompt : params.antiprompt) {
451 LOG_INF("Reverse prompt: '%s'\n", antiprompt.c_str());
452 if (params.verbose_prompt) {
453 auto tmp = common_tokenize(ctx, antiprompt, false, true);
454 for (int i = 0; i < (int) tmp.size(); i++) {
455 LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
456 }
457 }
458 }
459 }
460
461 if (params.input_prefix_bos) {
462 LOG_INF("Input prefix with BOS\n");
463 }
464
465 if (!params.input_prefix.empty()) {
466 LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
467 if (params.verbose_prompt) {
468 auto tmp = common_tokenize(ctx, params.input_prefix, true, true);
469 for (int i = 0; i < (int) tmp.size(); i++) {
470 LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
471 }
472 }
473 }
474
475 if (!params.input_suffix.empty()) {
476 LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
477 if (params.verbose_prompt) {
478 auto tmp = common_tokenize(ctx, params.input_suffix, false, true);
479 for (int i = 0; i < (int) tmp.size(); i++) {
480 LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
481 }
482 }
483 }
484 }
485
486 LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl));
487 LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
488 LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str());
489
490 LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
491
492 // group-attention state
493 // number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
494 int ga_i = 0;
495
496 const int ga_n = params.grp_attn_n;
497 const int ga_w = params.grp_attn_w;
498
499 if (ga_n != 1) {
500 GGML_ASSERT(ga_n > 0 && "grp_attn_n must be positive"); // NOLINT
501 GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT
502 //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT
503 //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
504 LOG_INF("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
505 }
506 LOG_INF("\n");
507
508 if (params.interactive) {
509 const char * control_message;
510 if (params.multiline_input) {
511 control_message = " - To return control to the AI, end your input with '\\'.\n"
512 " - To return control without starting a new line, end your input with '/'.\n";
513 } else {
514 control_message = " - Press Return to return control to the AI.\n"
515 " - To return control without starting a new line, end your input with '/'.\n"
516 " - If you want to submit another line, end your input with '\\'.\n";
517 }
518 LOG_INF("== Running in interactive mode. ==\n");
519#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
520 LOG_INF( " - Press Ctrl+C to interject at any time.\n");
521#endif
522 LOG_INF( "%s", control_message);
523 if (params.conversation_mode && params.enable_chat_template && params.system_prompt.empty()) {
524 LOG_INF( " - Not using system message. To change it, set a different value via -sys PROMPT\n");
525 }
526 LOG_INF("\n");
527
528 is_interacting = params.interactive_first;
529 }
530
531 bool is_antiprompt = false;
532 bool input_echo = true;
533 bool display = true;
534
535 int n_past = 0;
536 int n_remain = params.n_predict;
537 int n_consumed = 0;
538 int n_session_consumed = 0;
539
540 std::vector<int> input_tokens; g_input_tokens = &input_tokens;
541 std::vector<int> output_tokens; g_output_tokens = &output_tokens;
542 std::ostringstream output_ss; g_output_ss = &output_ss;
543 std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode
544
545 // the first thing we will do is to output the prompt, so set color accordingly
546 console::set_display(DISPLAY_TYPE_PROMPT);
547 display = params.display_prompt;
548
549 std::vector<llama_token> embd;
550
551 // single-token antiprompts
552 std::vector<llama_token> antiprompt_token;
553
554 for (const std::string & antiprompt : params.antiprompt) {
555 auto ids = ::common_tokenize(ctx, antiprompt, false, true);
556 if (ids.size() == 1) {
557 antiprompt_token.push_back(ids[0]);
558 }
559 }
560
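    // encoder-decoder models (e.g. T5) encode the whole prompt first; decoding
    // then starts from the model's decoder start token, falling back to BOS below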
561 if (llama_model_has_encoder(model)) {
562 int enc_input_size = embd_inp.size();
563 llama_token * enc_input_buf = embd_inp.data();
564
565 if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size))) {
566 LOG_ERR("%s : failed to eval\n", __func__);
567 return 1;
568 }
569
570 llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
571 if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
572 decoder_start_token_id = llama_vocab_bos(vocab);
573 }
574
575 embd_inp.clear();
576 embd_inp.push_back(decoder_start_token_id);
577 }
578
579 while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
580 // predict
581 if (!embd.empty()) {
582 // Note: (n_ctx - 4) here is to match the logic for commandline prompt handling via
583 // --prompt or --file which uses the same value.
584 int max_embd_size = n_ctx - 4;
585
586 // Ensure the input doesn't exceed the context size by truncating embd if necessary.
587 if ((int) embd.size() > max_embd_size) {
588 const int skipped_tokens = (int) embd.size() - max_embd_size;
589 embd.resize(max_embd_size);
590
591 console::set_display(DISPLAY_TYPE_ERROR);
592 LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
593 console::set_display(DISPLAY_TYPE_RESET);
594 }
595
596 if (ga_n == 1) {
597 // infinite text generation via context shifting
598 // if we run out of context:
599 // - take the n_keep first tokens from the original prompt (via n_past)
600 // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
601
602 if (n_past + (int) embd.size() >= n_ctx) {
603 if (!params.ctx_shift){
604 LOG_WRN("\n\n%s: context full and context shift is disabled => stopping\n", __func__);
605 break;
606 }
607
608 if (params.n_predict == -2) {
609 LOG_WRN("\n\n%s: context full and n_predict == %d => stopping\n", __func__, params.n_predict);
610 break;
611 }
612
613 const int n_left = n_past - params.n_keep;
614 const int n_discard = n_left/2;
615
616 LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
617 n_past, n_left, n_ctx, params.n_keep, n_discard);
618
619 llama_memory_seq_rm (mem, 0, params.n_keep , params.n_keep + n_discard);
620 llama_memory_seq_add(mem, 0, params.n_keep + n_discard, n_past, -n_discard);
621
622 n_past -= n_discard;
623
624 LOG_DBG("after swap: n_past = %d\n", n_past);
625
626 LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
627
628 LOG_DBG("clear session path\n");
629 path_session.clear();
630 }
631 } else {
632 // context extension via Self-Extend
633 while (n_past >= ga_i + ga_w) {
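                    // roughly: ib counts the windows that have already been grouped,
                    // bd is the number of positions saved by compressing one window of
                    // ga_w tokens by a factor of ga_n, and dd re-aligns the tokens that
                    // follow the grouped window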
634 const int ib = (ga_n*ga_i)/ga_w;
635 const int bd = (ga_w/ga_n)*(ga_n - 1);
636 const int dd = (ga_w/ga_n) - ib*bd - ga_w;
637
638 LOG_DBG("\n");
639 LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
640 LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
641 LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
642
643 llama_memory_seq_add(mem, 0, ga_i, n_past, ib*bd);
644 llama_memory_seq_div(mem, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
645 llama_memory_seq_add(mem, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);
646
647 n_past -= bd;
648
649 ga_i += ga_w/ga_n;
650
651 LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
652 }
653 }
654
655 // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
656 if (n_session_consumed < (int) session_tokens.size()) {
657 size_t i = 0;
658 for ( ; i < embd.size(); i++) {
659 if (embd[i] != session_tokens[n_session_consumed]) {
660 session_tokens.resize(n_session_consumed);
661 break;
662 }
663
664 n_past++;
665 n_session_consumed++;
666
667 if (n_session_consumed >= (int) session_tokens.size()) {
668 ++i;
669 break;
670 }
671 }
672 if (i > 0) {
673 embd.erase(embd.begin(), embd.begin() + i);
674 }
675 }
676
677 if (!embd.empty()) {
678 int n_eval = (int) embd.size();
679 LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
680
681 GGML_ASSERT(n_eval <= params.n_batch);
682 if (llama_decode(ctx, llama_batch_get_one(embd.data(), n_eval))) {
683 LOG_ERR("%s : failed to eval\n", __func__);
684 return 1;
685 }
686
687 n_past += n_eval;
688
689 LOG_DBG("n_past = %d\n", n_past);
690            // periodically report context usage (every n_print tokens)
691 if (params.n_print > 0 && n_past % params.n_print == 0) {
692 LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
693 }
694 }
695
696 if (!embd.empty() && !path_session.empty()) {
697 session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
698 n_session_consumed = session_tokens.size();
699 }
700 }
701
702 embd.clear();
703
704 if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
705 // optionally save the session on first sample (for faster prompt loading next time)
706 if (session_do_save) {
707 session_do_save = false;
708 llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
709
710 LOG_DBG("saved session to %s\n", path_session.c_str());
711 }
712
713 const llama_token id = common_sampler_sample(smpl, ctx, -1);
714
715 common_sampler_accept(smpl, id, /* accept_grammar= */ true);
716
717 // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
718
719 embd.push_back(id);
720
721 if (params.conversation_mode && !waiting_for_first_input && !llama_vocab_is_eog(vocab, id)) {
722 assistant_ss << common_token_to_piece(ctx, id, false);
723 }
724
725 // echo this to console
726 input_echo = true;
727
728 // decrement remaining sampling budget
729 --n_remain;
730
731 LOG_DBG("n_remain: %d\n", n_remain);
732 } else {
733 // some user input remains from prompt or interaction, forward it to processing
734 LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
735 while ((int) embd_inp.size() > n_consumed) {
736 embd.push_back(embd_inp[n_consumed]);
737
738 // push the prompt in the sampling context in order to apply repetition penalties later
739 // for the prompt, we don't apply grammar rules
740 common_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
741
742 ++n_consumed;
743 if ((int) embd.size() == params.n_batch) {
744 break;
745 }
746 }
747 }
748
749 // display text
750 if (input_echo && display) {
751 for (auto id : embd) {
752 const std::string token_str = common_token_to_piece(ctx, id, params.special);
753
754 // Console/Stream Output
755 LOG("%s", token_str.c_str());
756
757 // Record Displayed Tokens To Log
758                    // Note: Generated tokens are created one by one, hence this check
759 if (embd.size() > 1) {
760 // Incoming Requested Tokens
761 input_tokens.push_back(id);
762 } else {
763 // Outgoing Generated Tokens
764 output_tokens.push_back(id);
765 output_ss << token_str;
766 }
767 }
768 }
769
770 // reset color to default if there is no pending user input
771 if (input_echo && (int) embd_inp.size() == n_consumed) {
772 console::set_display(DISPLAY_TYPE_RESET);
773 display = true;
774 }
775
776        // if not currently processing queued inputs
777 if ((int) embd_inp.size() <= n_consumed) {
778 // check for reverse prompt in the last n_prev tokens
779 if (!params.antiprompt.empty()) {
780 const int n_prev = 32;
781 const std::string last_output = common_sampler_prev_str(smpl, ctx, n_prev);
782
783 is_antiprompt = false;
784 // Check if each of the reverse prompts appears at the end of the output.
785 // If we're not running interactively, the reverse prompt might be tokenized with some following characters
786 // so we'll compensate for that by widening the search window a bit.
787 for (std::string & antiprompt : params.antiprompt) {
788 size_t extra_padding = params.interactive ? 0 : 2;
789 size_t search_start_pos = last_output.length() > static_cast<size_t>(antiprompt.length() + extra_padding)
790 ? last_output.length() - static_cast<size_t>(antiprompt.length() + extra_padding)
791 : 0;
792
793 if (last_output.find(antiprompt, search_start_pos) != std::string::npos) {
794 if (params.interactive) {
795 is_interacting = true;
796 }
797 is_antiprompt = true;
798 break;
799 }
800 }
801
802 // check for reverse prompt using special tokens
803 // avoid calling common_sampler_last() if last_output is empty
804 if (!last_output.empty()) {
805 llama_token last_token = common_sampler_last(smpl);
806 for (auto token : antiprompt_token) {
807 if (token == last_token) {
808 if (params.interactive) {
809 is_interacting = true;
810 }
811 is_antiprompt = true;
812 break;
813 }
814 }
815 }
816
817 if (is_antiprompt) {
818 LOG_DBG("found antiprompt: %s\n", last_output.c_str());
819 }
820 }
821
822 // deal with end of generation tokens in interactive mode
823 if (!waiting_for_first_input && llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
824 LOG_DBG("found an EOG token\n");
825
826 if (params.interactive) {
827 if (!params.antiprompt.empty()) {
828 // tokenize and inject first reverse prompt
829 const auto first_antiprompt = common_tokenize(ctx, params.antiprompt.front(), false, true);
830 embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
831 is_antiprompt = true;
832 }
833
834 if (params.enable_chat_template) {
835 chat_add_and_format("assistant", assistant_ss.str());
836 }
837 is_interacting = true;
838 LOG("\n");
839 }
840 }
841
842 if (params.conversation_mode && !waiting_for_first_input) {
843 if (!prompt.empty()) {
844 prompt.clear();
845 is_interacting = false;
846 }
847 }
848
849 if ((n_past > 0 || waiting_for_first_input) && is_interacting) {
850 LOG_DBG("waiting for user input\n");
851
852 if (params.conversation_mode) {
853 LOG("\n> ");
854 }
855
856 if (params.input_prefix_bos) {
857 LOG_DBG("adding input prefix BOS token\n");
858 embd_inp.push_back(llama_vocab_bos(vocab));
859 }
860
861 std::string buffer;
862 if (!params.input_prefix.empty() && !params.conversation_mode) {
863 LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
864 LOG("%s", params.input_prefix.c_str());
865 }
866
867 // color user input only
868 console::set_display(DISPLAY_TYPE_USER_INPUT);
869 display = params.display_prompt;
870
871 std::string line;
872 bool another_line = true;
873 do {
874 another_line = console::readline(line, params.multiline_input);
875 buffer += line;
876 } while (another_line);
877
878 // done taking input, reset color
879 console::set_display(DISPLAY_TYPE_RESET);
880 display = true;
881
882 if (buffer.empty()) { // Ctrl+D on empty line exits
883 LOG("EOF by user\n");
884 break;
885 }
886
887 if (buffer.back() == '\n') {
888 // Implement #587:
889 // If the user wants the text to end in a newline,
890 // this should be accomplished by explicitly adding a newline by using \ followed by return,
891 // then returning control by pressing return again.
892 buffer.pop_back();
893 }
894
895 if (buffer.empty()) { // Enter key on empty line lets the user pass control back
896 LOG_DBG("empty line, passing control back\n");
897 } else { // Add tokens to embd only if the input buffer is non-empty
898 // append input suffix if any
899 if (!params.input_suffix.empty() && !params.conversation_mode) {
900 LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
901 LOG("%s", params.input_suffix.c_str());
902 }
903
904 LOG_DBG("buffer: '%s'\n", buffer.c_str());
905
906 const size_t original_size = embd_inp.size();
907
908 if (params.escape) {
909 string_process_escapes(buffer);
910 }
911
912 bool format_chat = params.conversation_mode && params.enable_chat_template;
913 std::string user_inp = format_chat
914 ? chat_add_and_format("user", std::move(buffer))
915 : std::move(buffer);
916                    // TODO: one inconvenience of the current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
917 const auto line_pfx = common_tokenize(ctx, params.input_prefix, false, true);
918 const auto line_inp = common_tokenize(ctx, user_inp, false, format_chat);
919 const auto line_sfx = common_tokenize(ctx, params.input_suffix, false, true);
920
921 LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
922
923 // if user stop generation mid-way, we must add EOT to finish model's last response
924 if (need_insert_eot && format_chat) {
925 llama_token eot = llama_vocab_eot(vocab);
926 embd_inp.push_back(eot == LLAMA_TOKEN_NULL ? llama_vocab_eos(vocab) : eot);
927 need_insert_eot = false;
928 }
929
930 embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end());
931 embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
932 embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());
933
934 if (params.verbose_prompt) {
935 LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size() - original_size);
936 }
937
938 for (size_t i = original_size; i < embd_inp.size(); ++i) {
939 const llama_token token = embd_inp[i];
940 const std::string token_str = common_token_to_piece(ctx, token);
941 output_tokens.push_back(token);
942 output_ss << token_str;
943
944 if (params.verbose_prompt) {
945 LOG_INF("%6d -> '%s'\n", token, token_str.c_str());
946 }
947 }
948
949 // reset assistant message
950 assistant_ss.str("");
951
952 n_remain -= line_inp.size();
953 LOG_DBG("n_remain: %d\n", n_remain);
954 }
955
956 input_echo = false; // do not echo this again
957 }
958
959 if (n_past > 0 || waiting_for_first_input) {
960 if (is_interacting) {
961 common_sampler_reset(smpl);
962 }
963 is_interacting = false;
964
965 if (waiting_for_first_input && params.single_turn) {
966 params.interactive = false;
967 params.interactive_first = false;
968 }
969 waiting_for_first_input = false;
970 }
971 }
972
973 // end of generation
974 if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !(params.interactive)) {
975 LOG(" [end of text]\n");
976 break;
977 }
978
979 // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
980 // We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size).
981 if (params.interactive && n_remain <= 0 && params.n_predict >= 0) {
982 n_remain = params.n_predict;
983 is_interacting = true;
984 }
985 }
986
987 if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
988 LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
989 llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
990 }
991
992 LOG("\n\n");
993 common_perf_print(ctx, smpl);
994
995 llama_backend_free();
996
997 ggml_threadpool_free_fn(threadpool);
998 ggml_threadpool_free_fn(threadpool_batch);
999
1000 return 0;
1001}