1#include "arg.h"
2
3#include "chat.h"
4#include "common.h"
5#include "download.h"
6#include "json-schema-to-grammar.h"
7#include "log.h"
8#include "sampling.h"
9#include "speculative.h"
10#include "preset.h"
11
12// fix problem with std::min and std::max
13#if defined(_WIN32)
14#define WIN32_LEAN_AND_MEAN
15#ifndef NOMINMAX
16# define NOMINMAX
17#endif
18#include <windows.h>
19#endif
20
21#define JSON_ASSERT GGML_ASSERT
22#include <nlohmann/json.hpp>
23
24#include <algorithm>
25#include <cinttypes>
26#include <climits>
27#include <cstdarg>
28#include <fstream>
29#include <list>
30#include <regex>
31#include <set>
32#include <string>
33#include <thread> // for hardware_concurrency
34#include <vector>
35
36#ifndef __EMSCRIPTEN__
37#ifdef __linux__
38#include <linux/limits.h>
39#elif defined(_WIN32)
40# if !defined(PATH_MAX)
41# define PATH_MAX MAX_PATH
42# endif
43#elif defined(_AIX)
44#include <sys/limits.h>
45#else
46#include <sys/syslimits.h>
47#endif
48#endif
49
50#define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
51
52extern const char * LICENSES[];
53
54using json = nlohmann::ordered_json;
55using namespace common_arg_utils;
56
57static std::initializer_list<enum llama_example> mmproj_examples = {
58 LLAMA_EXAMPLE_MTMD,
59 LLAMA_EXAMPLE_SERVER,
60 LLAMA_EXAMPLE_CLI,
61};
62
63static std::string read_file(const std::string & fname) {
64 std::ifstream file(fname);
65 if (!file) {
66 throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
67 }
68 std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
69 file.close();
70 return content;
71}
72
73static const std::vector<common_arg> & get_common_arg_defs() {
74 static const std::vector<common_arg> options = [] {
75 common_params params;
76 auto ctx = common_params_parser_init(params, LLAMA_EXAMPLE_SERVER, nullptr);
77 return ctx.options;
78 }();
79 return options;
80}
81
82common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
83 this->examples = examples;
84 return *this;
85}
86
87common_arg & common_arg::set_excludes(std::initializer_list<enum llama_example> excludes) {
88 this->excludes = excludes;
89 return *this;
90}
91
92common_arg & common_arg::set_env(const char * env) {
93 help = help + "\n(env: " + env + ")";
94 this->env = env;
95 return *this;
96}
97
98common_arg & common_arg::set_sparam() {
99 is_sparam = true;
100 return *this;
101}
102
103common_arg & common_arg::set_preset_only() {
104 is_preset_only = true;
105 return *this;
106}
107
108bool common_arg::in_example(enum llama_example ex) {
109 return examples.find(ex) != examples.end();
110}
111
112bool common_arg::is_exclude(enum llama_example ex) {
113 return excludes.find(ex) != excludes.end();
114}
115
116bool common_arg::get_value_from_env(std::string & output) const {
117 if (env == nullptr) return false;
118 if (!args_neg.empty()) {
119 // for compatibility, we need to check LLAMA_ARG_NO_ env as well
120 std::string neg_env = env;
121 string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
122 char * neg_value = std::getenv(neg_env.c_str());
123 if (neg_value) {
124 output = "0"; // falsey
125 return true;
126 }
127 }
128 char * value = std::getenv(env);
129 if (value) {
130 output = value;
131 return true;
132 }
133 return false;
134}
135
136bool common_arg::has_value_from_env() const {
137 if (env != nullptr && !args_neg.empty()) {
138 // for compatibility, we need to check LLAMA_ARG_NO_ env as well
139 std::string neg_env = env;
140 string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
141 if (std::getenv(neg_env.c_str())) {
142 return true;
143 }
144 }
145 return env != nullptr && std::getenv(env);
146}
147
148static std::vector<std::string> break_str_into_lines(std::string input, size_t max_char_per_line) {
149 std::vector<std::string> result;
150 std::istringstream iss(input);
151 std::string line;
152 auto add_line = [&](const std::string& l) {
153 if (l.length() <= max_char_per_line) {
154 result.push_back(l);
155 } else {
156 std::istringstream line_stream(l);
157 std::string word, current_line;
158 while (line_stream >> word) {
159 if (current_line.length() + !current_line.empty() + word.length() > max_char_per_line) {
160 if (!current_line.empty()) result.push_back(current_line);
161 current_line = word;
162 } else {
163 current_line += (!current_line.empty() ? " " : "") + word;
164 }
165 }
166 if (!current_line.empty()) result.push_back(current_line);
167 }
168 };
169 while (std::getline(iss, line)) {
170 add_line(line);
171 }
172 return result;
173}
174
175std::string common_arg::to_string() const {
176 // params for printing to console
177 const static int n_leading_spaces = 40;
178 const static int n_char_per_line_help = 70; // TODO: detect this based on current console
179 std::string leading_spaces(n_leading_spaces, ' ');
180
181 std::ostringstream ss;
182 auto all_args = get_args(); // also contains args_neg
183 for (const auto & arg : all_args) {
184 if (arg == all_args.front()) {
185 if (all_args.size() == 1) {
186 ss << arg;
187 } else {
188 // first arg is usually abbreviation, we need padding to make it more beautiful
189 auto tmp = std::string(arg) + ", ";
190 auto spaces = std::string(std::max(0, 7 - (int)tmp.size()), ' ');
191 ss << tmp << spaces;
192 }
193 } else {
194 ss << arg << (arg != all_args.back() ? ", " : "");
195 }
196 }
197 if (value_hint) ss << " " << value_hint;
198 if (value_hint_2) ss << " " << value_hint_2;
199 if (ss.tellp() > n_leading_spaces - 3) {
200 // current line is too long, add new line
201 ss << "\n" << leading_spaces;
202 } else {
203 // padding between arg and help, same line
204 ss << std::string(leading_spaces.size() - ss.tellp(), ' ');
205 }
206 const auto help_lines = break_str_into_lines(help, n_char_per_line_help);
207 for (const auto & line : help_lines) {
208 ss << (&line == &help_lines.front() ? "" : leading_spaces) << line << "\n";
209 }
210 return ss.str();
211}
212
213std::vector<std::string> common_arg::get_args() const {
214 std::vector<std::string> result;
215 for (const auto & arg : args) {
216 result.push_back(std::string(arg));
217 }
218 for (const auto & arg : args_neg) {
219 result.push_back(std::string(arg));
220 }
221 return result;
222}
223
224std::vector<std::string> common_arg::get_env() const {
225 std::vector<std::string> result;
226 if (env) {
227 result.push_back(std::string(env));
228 }
229 if (!args_neg.empty() && env) {
230 // for compatibility, we need to add LLAMA_ARG_NO_ variant
231 std::string neg_env = env;
232 string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
233 result.push_back(neg_env);
234 }
235 return result;
236}
237
238//
239// utils
240//
241
242// Helper function to parse tensor buffer override strings
243static void parse_tensor_buffer_overrides(const std::string & value, std::vector<llama_model_tensor_buft_override> & overrides) {
244 std::map<std::string, ggml_backend_buffer_type_t> buft_list;
245 for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
246 auto * dev = ggml_backend_dev_get(i);
247 auto * buft = ggml_backend_dev_buffer_type(dev);
248 if (buft) {
249 buft_list[ggml_backend_buft_name(buft)] = buft;
250 }
251 }
252
253 for (const auto & override : string_split<std::string>(value, ',')) {
254 std::string::size_type pos = override.find('=');
255 if (pos == std::string::npos) {
256 throw std::invalid_argument("invalid value");
257 }
258 std::string tensor_name = override.substr(0, pos);
259 std::string buffer_type = override.substr(pos + 1);
260
261 if (buft_list.find(buffer_type) == buft_list.end()) {
262 printf("Available buffer types:\n");
263 for (const auto & it : buft_list) {
264 printf(" %s\n", ggml_backend_buft_name(it.second));
265 }
266 throw std::invalid_argument("unknown buffer type");
267 }
268 // keep strings alive and avoid leaking memory by storing them in a static vector
269 static std::list<std::string> buft_overrides;
270 buft_overrides.push_back(tensor_name);
271 overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
272 }
273}
274
275static std::string clean_file_name(const std::string & fname) {
276 std::string clean_fname = fname;
277 string_replace_all(clean_fname, "\\", "_");
278 string_replace_all(clean_fname, "/", "_");
279 return clean_fname;
280}
281
282static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
283 GGML_ASSERT(!params.model.hf_repo.empty());
284
285 // the returned hf_repo is without tag
286 auto [hf_repo, hf_tag] = common_download_split_repo_tag(params.model.hf_repo);
287
288 // "latest" tag (default if not specified) is translated to "default" preset
289 if (hf_tag == "latest") {
290 hf_tag = "default";
291 }
292
293 const bool offline = params.offline;
294 std::string model_endpoint = get_model_endpoint();
295 auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";
296
297 // prepare local path for caching
298 auto preset_fname = clean_file_name(hf_repo + "_preset.ini");
299 auto preset_path = fs_get_cache_file(preset_fname);
300 const int status = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
301 const bool has_preset = status >= 200 && status < 400;
302
303 // remote preset is optional, so we don't error out if not found
304 if (has_preset) {
305 LOG_INF("applying remote preset from %s\n", preset_url.c_str());
306 common_preset_context ctx(ex, /* only_remote_allowed */ true);
307 common_preset global;
308 auto remote_presets = ctx.load_from_ini(preset_path, global);
309 remote_presets = ctx.cascade(global, remote_presets);
310 if (remote_presets.find(hf_tag) != remote_presets.end()) {
311 common_preset preset = remote_presets.at(hf_tag);
312 LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline
313 preset.apply_to_params(params);
314 } else {
315 throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section");
316 }
317 } else {
318 LOG_INF("%s", "no remote preset found, skipping\n");
319 }
320
321 return has_preset;
322}
323
324struct handle_model_result {
325 bool found_mmproj = false;
326 common_params_model mmproj;
327};
328
329static handle_model_result common_params_handle_model(
330 struct common_params_model & model,
331 const std::string & bearer_token,
332 bool offline) {
333 handle_model_result result;
334 // handle pre-fill default model path and url based on hf_repo and hf_file
335 {
336 if (!model.docker_repo.empty()) { // Handle Docker URLs by resolving them to local paths
337 model.path = common_docker_resolve_model(model.docker_repo);
338 model.name = model.docker_repo; // set name for consistency
339 } else if (!model.hf_repo.empty()) {
340 // short-hand to avoid specifying --hf-file -> default it to --model
341 if (model.hf_file.empty()) {
342 if (model.path.empty()) {
343 auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
344 if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
345 exit(1); // error message already printed
346 }
347 model.name = model.hf_repo; // repo name with tag
348 model.hf_repo = auto_detected.repo; // repo name without tag
349 model.hf_file = auto_detected.ggufFile;
350 if (!auto_detected.mmprojFile.empty()) {
351 result.found_mmproj = true;
352 result.mmproj.hf_repo = model.hf_repo;
353 result.mmproj.hf_file = auto_detected.mmprojFile;
354 }
355 } else {
356 model.hf_file = model.path;
357 }
358 }
359
360 std::string model_endpoint = get_model_endpoint();
361 model.url = model_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file;
362 // make sure model path is present (for caching purposes)
363 if (model.path.empty()) {
364 // this is to avoid different repo having same file name, or same file name in different subdirs
365 std::string filename = clean_file_name(model.hf_repo + "_" + model.hf_file);
366 model.path = fs_get_cache_file(filename);
367 }
368
369 } else if (!model.url.empty()) {
370 if (model.path.empty()) {
371 auto f = string_split<std::string>(model.url, '#').front();
372 f = string_split<std::string>(f, '?').front();
373 model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
374 }
375
376 }
377 }
378
379 // then, download it if needed
380 if (!model.url.empty()) {
381 bool ok = common_download_model(model, bearer_token, offline);
382 if (!ok) {
383 LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
384 exit(1);
385 }
386 }
387
388 return result;
389}
390
391const std::vector<ggml_type> kv_cache_types = {
392 GGML_TYPE_F32,
393 GGML_TYPE_F16,
394 GGML_TYPE_BF16,
395 GGML_TYPE_Q8_0,
396 GGML_TYPE_Q4_0,
397 GGML_TYPE_Q4_1,
398 GGML_TYPE_IQ4_NL,
399 GGML_TYPE_Q5_0,
400 GGML_TYPE_Q5_1,
401};
402
403static ggml_type kv_cache_type_from_str(const std::string & s) {
404 for (const auto & type : kv_cache_types) {
405 if (ggml_type_name(type) == s) {
406 return type;
407 }
408 }
409 throw std::runtime_error("Unsupported cache type: " + s);
410}
411
412static std::string get_all_kv_cache_types() {
413 std::ostringstream msg;
414 for (const auto & type : kv_cache_types) {
415 msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
416 }
417 return msg.str();
418}
419
420static bool parse_bool_value(const std::string & value) {
421 if (is_truthy(value)) {
422 return true;
423 } else if (is_falsey(value)) {
424 return false;
425 } else {
426 throw std::invalid_argument("invalid boolean value");
427 }
428}
429
430//
431// CLI argument parsing functions
432//
433
434static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
435 common_params & params = ctx_arg.params;
436
437 std::unordered_map<std::string, std::pair<common_arg *, bool>> arg_to_options;
438 for (auto & opt : ctx_arg.options) {
439 for (const auto & arg : opt.args) {
440 arg_to_options[arg] = {&opt, /* is_positive */ true};
441 }
442 for (const auto & arg : opt.args_neg) {
443 arg_to_options[arg] = {&opt, /* is_positive */ false};
444 }
445 }
446
447 // handle environment variables
448 for (auto & opt : ctx_arg.options) {
449 std::string value;
450 if (opt.get_value_from_env(value)) {
451 try {
452 if (opt.handler_void && is_truthy(value)) {
453 opt.handler_void(params);
454 }
455 if (opt.handler_int) {
456 opt.handler_int(params, std::stoi(value));
457 }
458 if (opt.handler_bool) {
459 opt.handler_bool(params, parse_bool_value(value));
460 }
461 if (opt.handler_string) {
462 opt.handler_string(params, value);
463 continue;
464 }
465 } catch (std::exception & e) {
466 throw std::invalid_argument(string_format(
467 "error while handling environment variable \"%s\": %s\n\n", opt.env, e.what()));
468 }
469 }
470 }
471
472 // handle command line arguments
473 auto check_arg = [&](int i) {
474 if (i+1 >= argc) {
475 throw std::invalid_argument("expected value for argument");
476 }
477 };
478
479 auto parse_cli_args = [&]() {
480 std::set<std::string> seen_args;
481
482 for (int i = 1; i < argc; i++) {
483 const std::string arg_prefix = "--";
484
485 std::string arg = argv[i];
486 if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
487 std::replace(arg.begin(), arg.end(), '_', '-');
488 }
489 if (arg_to_options.find(arg) == arg_to_options.end()) {
490 throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
491 }
492 if (!seen_args.insert(arg).second) {
493 LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
494 }
495 auto & tmp = arg_to_options[arg];
496 auto opt = *tmp.first;
497 bool is_positive = tmp.second;
498 if (opt.has_value_from_env()) {
499 fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
500 }
501 try {
502 if (opt.handler_void) {
503 opt.handler_void(params);
504 continue;
505 }
506 if (opt.handler_bool) {
507 opt.handler_bool(params, is_positive);
508 continue;
509 }
510
511 // arg with single value
512 check_arg(i);
513 std::string val = argv[++i];
514 if (opt.handler_int) {
515 opt.handler_int(params, std::stoi(val));
516 continue;
517 }
518 if (opt.handler_string) {
519 opt.handler_string(params, val);
520 continue;
521 }
522
523 // arg with 2 values
524 check_arg(i);
525 std::string val2 = argv[++i];
526 if (opt.handler_str_str) {
527 opt.handler_str_str(params, val, val2);
528 continue;
529 }
530 } catch (std::exception & e) {
531 throw std::invalid_argument(string_format(
532 "error while handling argument \"%s\": %s\n\n"
533 "usage:\n%s\n\nto show complete usage, run with -h",
534 arg.c_str(), e.what(), opt.to_string().c_str()));
535 }
536 }
537 };
538
539 // parse the first time to get -hf option (used for remote preset)
540 parse_cli_args();
541
542 // maybe handle remote preset
543 if (!params.model.hf_repo.empty()) {
544 std::string cli_hf_repo = params.model.hf_repo;
545 bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex);
546
547 // special case: if hf_repo explicitly set by preset, we need to preserve it (ignore CLI value)
548 // this is useful when we have one HF repo pointing to other HF repos (one model - multiple GGUFs)
549 std::string preset_hf_repo = params.model.hf_repo;
550 bool preset_has_hf_repo = preset_hf_repo != cli_hf_repo;
551
552 if (has_preset) {
553 // re-parse CLI args to override preset values
554 parse_cli_args();
555 }
556
557 // preserve hf_repo from preset if needed
558 if (preset_has_hf_repo) {
559 params.model.hf_repo = preset_hf_repo;
560 }
561 }
562
563 postprocess_cpu_params(params.cpuparams, nullptr);
564 postprocess_cpu_params(params.cpuparams_batch, ¶ms.cpuparams);
565
566 postprocess_cpu_params(params.speculative.cpuparams, ¶ms.cpuparams);
567 postprocess_cpu_params(params.speculative.cpuparams_batch, ¶ms.cpuparams_batch);
568
569 if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
570 throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
571 }
572
573 // handle model and download
574 {
575 auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
576 if (params.no_mmproj) {
577 params.mmproj = {};
578 } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
579 // optionally, handle mmproj model when -hf is specified
580 params.mmproj = res.mmproj;
581 }
582 // only download mmproj if the current example is using it
583 for (const auto & ex : mmproj_examples) {
584 if (ctx_arg.ex == ex) {
585 common_params_handle_model(params.mmproj, params.hf_token, params.offline);
586 break;
587 }
588 }
589 common_params_handle_model(params.speculative.mparams_dft, params.hf_token, params.offline);
590 common_params_handle_model(params.vocoder.model, params.hf_token, params.offline);
591 }
592
593 // model is required (except for server)
594 // TODO @ngxson : maybe show a list of available models in CLI in this case
595 if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage && !params.completion) {
596 throw std::invalid_argument("error: --model is required\n");
597 }
598
599 if (params.escape) {
600 string_process_escapes(params.prompt);
601 string_process_escapes(params.input_prefix);
602 string_process_escapes(params.input_suffix);
603 for (auto & antiprompt : params.antiprompt) {
604 string_process_escapes(antiprompt);
605 }
606 for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
607 string_process_escapes(seq_breaker);
608 }
609 for (auto & pair : params.speculative.replacements) {
610 string_process_escapes(pair.first);
611 string_process_escapes(pair.second);
612 }
613 }
614
615 if (!params.kv_overrides.empty()) {
616 params.kv_overrides.emplace_back();
617 params.kv_overrides.back().key[0] = 0;
618 }
619
620 // pad tensor_buft_overrides for llama_params_fit:
621 const size_t ntbo = llama_max_tensor_buft_overrides();
622 while (params.tensor_buft_overrides.size() < ntbo) {
623 params.tensor_buft_overrides.push_back({nullptr, nullptr});
624 }
625
626 if (!params.speculative.tensor_buft_overrides.empty()) {
627 params.speculative.tensor_buft_overrides.push_back({nullptr, nullptr});
628 }
629
630 if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
631 throw std::runtime_error(string_format(
632 "error: the supplied chat template is not supported: %s%s\n",
633 params.chat_template.c_str(),
634 params.use_jinja ? "" : "\nnote: llama.cpp was started without --jinja, we only support commonly used templates"
635 ));
636 }
637
638 common_log_set_verbosity_thold(params.verbosity);
639
640 return true;
641}
642
643static void common_params_print_usage(common_params_context & ctx_arg) {
644 auto print_options = [](std::vector<common_arg *> & options) {
645 for (common_arg * opt : options) {
646 printf("%s", opt->to_string().c_str());
647 }
648 };
649
650 std::vector<common_arg *> common_options;
651 std::vector<common_arg *> sparam_options;
652 std::vector<common_arg *> specific_options;
653 for (auto & opt : ctx_arg.options) {
654 // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
655 if (opt.is_sparam) {
656 sparam_options.push_back(&opt);
657 } else if (opt.in_example(ctx_arg.ex)) {
658 specific_options.push_back(&opt);
659 } else {
660 common_options.push_back(&opt);
661 }
662 }
663 printf("----- common params -----\n\n");
664 print_options(common_options);
665 printf("\n\n----- sampling params -----\n\n");
666 print_options(sparam_options);
667 // TODO: maybe convert enum llama_example to string
668 printf("\n\n----- example-specific params -----\n\n");
669 print_options(specific_options);
670}
671
672static void common_params_print_completion(common_params_context & ctx_arg) {
673 std::vector<common_arg *> common_options;
674 std::vector<common_arg *> sparam_options;
675 std::vector<common_arg *> specific_options;
676
677 for (auto & opt : ctx_arg.options) {
678 if (opt.is_sparam) {
679 sparam_options.push_back(&opt);
680 } else if (opt.in_example(ctx_arg.ex)) {
681 specific_options.push_back(&opt);
682 } else {
683 common_options.push_back(&opt);
684 }
685 }
686
687 printf("_llama_completions() {\n");
688 printf(" local cur prev opts\n");
689 printf(" COMPREPLY=()\n");
690 printf(" cur=\"${COMP_WORDS[COMP_CWORD]}\"\n");
691 printf(" prev=\"${COMP_WORDS[COMP_CWORD-1]}\"\n\n");
692
693 printf(" opts=\"");
694 auto print_options = [](const std::vector<common_arg *> & options) {
695 for (const common_arg * opt : options) {
696 for (const char * arg : opt->args) {
697 printf("%s ", arg);
698 }
699 }
700 };
701
702 print_options(common_options);
703 print_options(sparam_options);
704 print_options(specific_options);
705 printf("\"\n\n");
706
707 printf(" case \"$prev\" in\n");
708 printf(" --model|-m)\n");
709 printf(" COMPREPLY=( $(compgen -f -X '!*.gguf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
710 printf(" return 0\n");
711 printf(" ;;\n");
712 printf(" --grammar-file)\n");
713 printf(" COMPREPLY=( $(compgen -f -X '!*.gbnf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
714 printf(" return 0\n");
715 printf(" ;;\n");
716 printf(" --chat-template-file)\n");
717 printf(" COMPREPLY=( $(compgen -f -X '!*.jinja' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
718 printf(" return 0\n");
719 printf(" ;;\n");
720 printf(" *)\n");
721 printf(" COMPREPLY=( $(compgen -W \"${opts}\" -- \"$cur\") )\n");
722 printf(" return 0\n");
723 printf(" ;;\n");
724 printf(" esac\n");
725 printf("}\n\n");
726
727 std::set<std::string> executables = {
728 "llama-batched",
729 "llama-batched-bench",
730 "llama-bench",
731 "llama-cli",
732 "llama-completion",
733 "llama-convert-llama2c-to-ggml",
734 "llama-cvector-generator",
735 "llama-embedding",
736 "llama-eval-callback",
737 "llama-export-lora",
738 "llama-gen-docs",
739 "llama-gguf",
740 "llama-gguf-hash",
741 "llama-gguf-split",
742 "llama-gritlm",
743 "llama-imatrix",
744 "llama-infill",
745 "llama-mtmd-cli",
746 "llama-llava-clip-quantize-cli",
747 "llama-lookahead",
748 "llama-lookup",
749 "llama-lookup-create",
750 "llama-lookup-merge",
751 "llama-lookup-stats",
752 "llama-parallel",
753 "llama-passkey",
754 "llama-perplexity",
755 "llama-q8dot",
756 "llama-quantize",
757 "llama-qwen2vl-cli",
758 "llama-retrieval",
759 "llama-save-load-state",
760 "llama-server",
761 "llama-simple",
762 "llama-simple-chat",
763 "llama-speculative",
764 "llama-speculative-simple",
765 "llama-tokenize",
766 "llama-tts",
767 "llama-vdot"
768 };
769
770 for (const auto& exe : executables) {
771 printf("complete -F _llama_completions %s\n", exe.c_str());
772 }
773}
774
775static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & value) {
776 std::vector<ggml_backend_dev_t> devices;
777 auto dev_names = string_split<std::string>(value, ',');
778 if (dev_names.empty()) {
779 throw std::invalid_argument("no devices specified");
780 }
781 if (dev_names.size() == 1 && dev_names[0] == "none") {
782 devices.push_back(nullptr);
783 } else {
784 for (const auto & device : dev_names) {
785 auto * dev = ggml_backend_dev_by_name(device.c_str());
786 if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
787 throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
788 }
789 devices.push_back(dev);
790 }
791 devices.push_back(nullptr);
792 }
793 return devices;
794}
795
796static void add_rpc_devices(const std::string & servers) {
797 auto rpc_servers = string_split<std::string>(servers, ',');
798 if (rpc_servers.empty()) {
799 throw std::invalid_argument("no RPC servers specified");
800 }
801 ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
802 if (!rpc_reg) {
803 throw std::invalid_argument("failed to find RPC backend");
804 }
805 typedef ggml_backend_reg_t (*ggml_backend_rpc_add_server_t)(const char * endpoint);
806 ggml_backend_rpc_add_server_t ggml_backend_rpc_add_server_fn = (ggml_backend_rpc_add_server_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_server");
807 if (!ggml_backend_rpc_add_server_fn) {
808 throw std::invalid_argument("failed to find RPC add server function");
809 }
810 for (const auto & server : rpc_servers) {
811 auto reg = ggml_backend_rpc_add_server_fn(server.c_str());
812 ggml_backend_register(reg);
813 }
814}
815
816bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map) {
817 common_params dummy_params;
818 common_params_context ctx_arg = common_params_parser_init(dummy_params, ex, nullptr);
819
820 std::unordered_map<std::string, common_arg *> arg_to_options;
821 for (auto & opt : ctx_arg.options) {
822 for (const auto & arg : opt.args) {
823 arg_to_options[arg] = &opt;
824 }
825 for (const auto & arg : opt.args_neg) {
826 arg_to_options[arg] = &opt;
827 }
828 }
829
830 // TODO @ngxson : find a way to deduplicate this code
831
832 // handle command line arguments
833 auto check_arg = [&](int i) {
834 if (i+1 >= argc) {
835 throw std::invalid_argument("expected value for argument");
836 }
837 };
838
839 std::set<std::string> seen_args;
840
841 for (int i = 1; i < argc; i++) {
842 const std::string arg_prefix = "--";
843
844 std::string arg = argv[i];
845 if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
846 std::replace(arg.begin(), arg.end(), '_', '-');
847 }
848 if (arg_to_options.find(arg) == arg_to_options.end()) {
849 throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
850 }
851 if (!seen_args.insert(arg).second) {
852 LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
853 }
854 auto opt = *arg_to_options[arg];
855 std::string val;
856 if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
857 // bool arg (need to reverse the meaning for negative args)
858 bool is_neg = std::find(opt.args_neg.begin(), opt.args_neg.end(), arg) != opt.args_neg.end();
859 val = is_neg ? "0" : "1";
860 }
861 if (opt.value_hint != nullptr) {
862 // arg with single value
863 check_arg(i);
864 val = argv[++i];
865 }
866 if (opt.value_hint_2 != nullptr) {
867 // TODO: support arg with 2 values
868 throw std::invalid_argument("error: argument with 2 values is not yet supported\n");
869 }
870 out_map[opt] = val;
871 }
872
873 return true;
874}
875
876bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
877 auto ctx_arg = common_params_parser_init(params, ex, print_usage);
878 const common_params params_org = ctx_arg.params; // the example can modify the default params
879
880 try {
881 if (!common_params_parse_ex(argc, argv, ctx_arg)) {
882 ctx_arg.params = params_org;
883 return false;
884 }
885 if (ctx_arg.params.usage) {
886 common_params_print_usage(ctx_arg);
887 if (ctx_arg.print_usage) {
888 ctx_arg.print_usage(argc, argv);
889 }
890 exit(0);
891 }
892 if (ctx_arg.params.completion) {
893 common_params_print_completion(ctx_arg);
894 exit(0);
895 }
896 params.lr.init();
897 } catch (const std::invalid_argument & ex) {
898 fprintf(stderr, "%s\n", ex.what());
899 ctx_arg.params = params_org;
900 return false;
901 } catch (std::exception & ex) {
902 fprintf(stderr, "%s\n", ex.what());
903 exit(1); // for other exceptions, we exit with status code 1
904 }
905
906 return true;
907}
908
909static std::string list_builtin_chat_templates() {
910 std::vector<const char *> supported_tmpl;
911 int32_t res = llama_chat_builtin_templates(nullptr, 0);
912 supported_tmpl.resize(res);
913 res = llama_chat_builtin_templates(supported_tmpl.data(), supported_tmpl.size());
914 std::ostringstream msg;
915 for (auto & tmpl : supported_tmpl) {
916 msg << tmpl << (&tmpl == &supported_tmpl.back() ? "" : ", ");
917 }
918 return msg.str();
919}
920
921bool common_arg_utils::is_truthy(const std::string & value) {
922 return value == "on" || value == "enabled" || value == "true" || value == "1";
923}
924
925bool common_arg_utils::is_falsey(const std::string & value) {
926 return value == "off" || value == "disabled" || value == "false" || value == "0";
927}
928
929bool common_arg_utils::is_autoy(const std::string & value) {
930 return value == "auto" || value == "-1";
931}
932
933// Simple CSV parser that handles quoted fields and escaped quotes
934// example:
935// input: value1,"value, with, commas","value with ""escaped"" quotes",value4
936// output: [value1] [value, with, commas] [value with "escaped" quotes] [value4]
937static std::vector<std::string> parse_csv_row(const std::string& input) {
938 std::vector<std::string> fields;
939 std::string field;
940 bool in_quotes = false;
941
942 for (size_t i = 0; i < input.length(); ++i) {
943 char ch = input[i];
944
945 if (ch == '"') {
946 if (!in_quotes) {
947 // start of quoted field (only valid if at beginning of field)
948 if (!field.empty()) {
949 // quote appeared in middle of unquoted field, treat as literal
950 field += '"';
951 } else {
952 in_quotes = true; // start
953 }
954 } else {
955 if (i + 1 < input.length() && input[i + 1] == '"') {
956 // escaped quote: ""
957 field += '"';
958 ++i; // skip the next quote
959 } else {
960 in_quotes = false; // end
961 }
962 }
963 } else if (ch == ',') {
964 if (in_quotes) {
965 field += ',';
966 } else {
967 fields.push_back(std::move(field));
968 field.clear();
969 }
970 } else {
971 field += ch;
972 }
973 }
974
975 // Add the last field
976 fields.push_back(std::move(field));
977
978 return fields;
979}
980
981common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
982 // per-example default params
983 // we define here to make sure it's included in llama-gen-docs
984 if (ex == LLAMA_EXAMPLE_COMPLETION) {
985 params.use_jinja = false; // disable jinja by default
986
987 } else if (ex == LLAMA_EXAMPLE_MTMD) {
988 params.use_jinja = false; // disable jinja by default
989 params.sampling.temp = 0.2; // lower temp by default for better quality
990
991 } else if (ex == LLAMA_EXAMPLE_SERVER) {
992 params.n_parallel = -1; // auto by default
993 }
994
995 params.use_color = tty_can_use_colors();
996
997 // load dynamic backends
998 ggml_backend_load_all();
999
1000 common_params_context ctx_arg(params);
1001 ctx_arg.print_usage = print_usage;
1002 ctx_arg.ex = ex;
1003
1004 std::string sampler_type_chars;
1005 std::string sampler_type_names;
1006 for (const auto & sampler : params.sampling.samplers) {
1007 sampler_type_chars += common_sampler_type_to_chr(sampler);
1008 sampler_type_names += common_sampler_type_to_str(sampler) + ";";
1009 }
1010 if (!sampler_type_names.empty()) {
1011 sampler_type_names.pop_back(); // remove last semicolon
1012 }
1013
1014
1015 /**
1016 * filter options by example
1017 * rules:
1018 * - all examples inherit options from LLAMA_EXAMPLE_COMMON
1019 * - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example
1020 * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example
1021 */
1022 auto add_opt = [&](common_arg arg) {
1023 if ((arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex)) {
1024 ctx_arg.options.push_back(std::move(arg));
1025 }
1026 };
1027
1028
1029 add_opt(common_arg(
1030 {"-h", "--help", "--usage"},
1031 "print usage and exit",
1032 [](common_params & params) {
1033 params.usage = true;
1034 }
1035 ));
1036 add_opt(common_arg(
1037 {"--version"},
1038 "show version and build info",
1039 [](common_params &) {
1040 fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
1041 fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
1042 exit(0);
1043 }
1044 ));
1045 add_opt(common_arg(
1046 {"--license"},
1047 "show source code license and dependencies",
1048 [](common_params &) {
1049 for (int i = 0; LICENSES[i]; ++i) {
1050 printf("%s\n", LICENSES[i]);
1051 }
1052 exit(0);
1053 }
1054 ));
1055 add_opt(common_arg(
1056 {"-cl", "--cache-list"},
1057 "show list of models in cache",
1058 [](common_params &) {
1059 printf("model cache directory: %s\n", fs_get_cache_directory().c_str());
1060 auto models = common_list_cached_models();
1061 printf("number of models in cache: %zu\n", models.size());
1062 for (size_t i = 0; i < models.size(); i++) {
1063 auto & model = models[i];
1064 printf("%4d. %s\n", (int) i + 1, model.to_string().c_str());
1065 }
1066 exit(0);
1067 }
1068 ));
1069 add_opt(common_arg(
1070 {"--completion-bash"},
1071 "print source-able bash completion script for llama.cpp",
1072 [](common_params & params) {
1073 params.completion = true;
1074 }
1075 ));
1076 add_opt(common_arg(
1077 {"--verbose-prompt"},
1078 string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
1079 [](common_params & params) {
1080 params.verbose_prompt = true;
1081 }
1082 ));
1083 add_opt(common_arg(
1084 {"--display-prompt"},
1085 {"--no-display-prompt"},
1086 string_format("whether to print prompt at generation (default: %s)", params.display_prompt ? "true" : "false"),
1087 [](common_params & params, bool value) {
1088 params.display_prompt = value;
1089 }
1090 ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
1091 add_opt(common_arg(
1092 {"-co", "--color"}, "[on|off|auto]",
1093 "Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')\n"
1094 "'auto' enables colors when output is to a terminal",
1095 [](common_params & params, const std::string & value) {
1096 if (is_truthy(value)) {
1097 params.use_color = true;
1098 } else if (is_falsey(value)) {
1099 params.use_color = false;
1100 } else if (is_autoy(value)) {
1101 params.use_color = tty_can_use_colors();
1102 } else {
1103 throw std::invalid_argument(
1104 string_format("error: unknown value for --color: '%s'\n", value.c_str()));
1105 }
1106 }
1107 ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
1108 add_opt(common_arg(
1109 {"-t", "--threads"}, "N",
1110 string_format("number of CPU threads to use during generation (default: %d)", params.cpuparams.n_threads),
1111 [](common_params & params, int value) {
1112 params.cpuparams.n_threads = value;
1113 if (params.cpuparams.n_threads <= 0) {
1114 params.cpuparams.n_threads = std::thread::hardware_concurrency();
1115 }
1116 }
1117 ).set_env("LLAMA_ARG_THREADS"));
1118 add_opt(common_arg(
1119 {"-tb", "--threads-batch"}, "N",
1120 "number of threads to use during batch and prompt processing (default: same as --threads)",
1121 [](common_params & params, int value) {
1122 params.cpuparams_batch.n_threads = value;
1123 if (params.cpuparams_batch.n_threads <= 0) {
1124 params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
1125 }
1126 }
1127 ));
1128 add_opt(common_arg(
1129 {"-C", "--cpu-mask"}, "M",
1130 "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")",
1131 [](common_params & params, const std::string & mask) {
1132 params.cpuparams.mask_valid = true;
1133 if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) {
1134 throw std::invalid_argument("invalid cpumask");
1135 }
1136 }
1137 ));
1138 add_opt(common_arg(
1139 {"-Cr", "--cpu-range"}, "lo-hi",
1140 "range of CPUs for affinity. Complements --cpu-mask",
1141 [](common_params & params, const std::string & range) {
1142 params.cpuparams.mask_valid = true;
1143 if (!parse_cpu_range(range, params.cpuparams.cpumask)) {
1144 throw std::invalid_argument("invalid range");
1145 }
1146 }
1147 ));
1148 add_opt(common_arg(
1149 {"--cpu-strict"}, "<0|1>",
1150 string_format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu),
1151 [](common_params & params, const std::string & value) {
1152 params.cpuparams.strict_cpu = std::stoul(value);
1153 }
1154 ));
1155 add_opt(common_arg(
1156 {"--prio"}, "N",
1157 string_format("set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: %d)\n", params.cpuparams.priority),
1158 [](common_params & params, int prio) {
1159 if (prio < GGML_SCHED_PRIO_LOW || prio > GGML_SCHED_PRIO_REALTIME) {
1160 throw std::invalid_argument("invalid value");
1161 }
1162 params.cpuparams.priority = (enum ggml_sched_priority) prio;
1163 }
1164 ));
1165 add_opt(common_arg(
1166 {"--poll"}, "<0...100>",
1167 string_format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll),
1168 [](common_params & params, const std::string & value) {
1169 params.cpuparams.poll = std::stoul(value);
1170 }
1171 ));
1172 add_opt(common_arg(
1173 {"-Cb", "--cpu-mask-batch"}, "M",
1174 "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)",
1175 [](common_params & params, const std::string & mask) {
1176 params.cpuparams_batch.mask_valid = true;
1177 if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) {
1178 throw std::invalid_argument("invalid cpumask");
1179 }
1180 }
1181 ));
1182 add_opt(common_arg(
1183 {"-Crb", "--cpu-range-batch"}, "lo-hi",
1184 "ranges of CPUs for affinity. Complements --cpu-mask-batch",
1185 [](common_params & params, const std::string & range) {
1186 params.cpuparams_batch.mask_valid = true;
1187 if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) {
1188 throw std::invalid_argument("invalid range");
1189 }
1190 }
1191 ));
1192 add_opt(common_arg(
1193 {"--cpu-strict-batch"}, "<0|1>",
1194 "use strict CPU placement (default: same as --cpu-strict)",
1195 [](common_params & params, int value) {
1196 params.cpuparams_batch.strict_cpu = value;
1197 }
1198 ));
1199 add_opt(common_arg(
1200 {"--prio-batch"}, "N",
1201 string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority),
1202 [](common_params & params, int prio) {
1203 if (prio < 0 || prio > 3) {
1204 throw std::invalid_argument("invalid value");
1205 }
1206 params.cpuparams_batch.priority = (enum ggml_sched_priority) prio;
1207 }
1208 ));
1209 add_opt(common_arg(
1210 {"--poll-batch"}, "<0|1>",
1211 "use polling to wait for work (default: same as --poll)",
1212 [](common_params & params, int value) {
1213 params.cpuparams_batch.poll = value;
1214 }
1215 ));
1216 add_opt(common_arg(
1217 {"-lcs", "--lookup-cache-static"}, "FNAME",
1218 "path to static lookup cache to use for lookup decoding (not updated by generation)",
1219 [](common_params & params, const std::string & value) {
1220 params.speculative.lookup_cache_static = value;
1221 }
1222 ).set_examples({LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
1223 add_opt(common_arg(
1224 {"-lcd", "--lookup-cache-dynamic"}, "FNAME",
1225 "path to dynamic lookup cache to use for lookup decoding (updated by generation)",
1226 [](common_params & params, const std::string & value) {
1227 params.speculative.lookup_cache_dynamic = value;
1228 }
1229 ).set_examples({LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
1230 add_opt(common_arg(
1231 {"-c", "--ctx-size"}, "N",
1232 string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx),
1233 [](common_params & params, int value) {
1234 params.n_ctx = value;
1235 if (value == 0) {
1236 // disable context reduction in llama_params_fit if the user explicitly requests the full context size:
1237 params.fit_params_min_ctx = UINT32_MAX;
1238 }
1239 }
1240 ).set_env("LLAMA_ARG_CTX_SIZE"));
1241 add_opt(common_arg(
1242 {"-n", "--predict", "--n-predict"}, "N",
1243 string_format(
1244 ex == LLAMA_EXAMPLE_COMPLETION
1245 ? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
1246 : "number of tokens to predict (default: %d, -1 = infinity)",
1247 params.n_predict),
1248 [](common_params & params, int value) {
1249 params.n_predict = value;
1250 }
1251 ).set_env("LLAMA_ARG_N_PREDICT"));
1252 add_opt(common_arg(
1253 {"-b", "--batch-size"}, "N",
1254 string_format("logical maximum batch size (default: %d)", params.n_batch),
1255 [](common_params & params, int value) {
1256 params.n_batch = value;
1257 }
1258 ).set_env("LLAMA_ARG_BATCH"));
1259 add_opt(common_arg(
1260 {"-ub", "--ubatch-size"}, "N",
1261 string_format("physical maximum batch size (default: %d)", params.n_ubatch),
1262 [](common_params & params, int value) {
1263 params.n_ubatch = value;
1264 }
1265 ).set_env("LLAMA_ARG_UBATCH"));
1266 add_opt(common_arg(
1267 {"--keep"}, "N",
1268 string_format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep),
1269 [](common_params & params, int value) {
1270 params.n_keep = value;
1271 }
1272 ));
1273 add_opt(common_arg(
1274 {"--swa-full"},
1275 string_format("use full-size SWA cache (default: %s)\n"
1276 "[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)", params.swa_full ? "true" : "false"),
1277 [](common_params & params) {
1278 params.swa_full = true;
1279 }
1280 ).set_env("LLAMA_ARG_SWA_FULL"));
1281 add_opt(common_arg(
1282 {"--ctx-checkpoints", "--swa-checkpoints"}, "N",
1283 string_format("max number of context checkpoints to create per slot (default: %d)"
1284 "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
1285 [](common_params & params, int value) {
1286 params.n_ctx_checkpoints = value;
1287 }
1288 ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
1289 add_opt(common_arg(
1290 {"-cram", "--cache-ram"}, "N",
1291 string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
1292 "[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
1293 [](common_params & params, int value) {
1294 params.cache_ram_mib = value;
1295 }
1296 ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
1297 add_opt(common_arg(
1298 {"-kvu", "--kv-unified"},
1299 {"-no-kvu", "--no-kv-unified"},
1300 "use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)",
1301 [](common_params & params, bool value) {
1302 params.kv_unified = value;
1303 }
1304 ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH}));
1305 add_opt(common_arg(
1306 {"--context-shift"},
1307 {"--no-context-shift"},
1308 string_format("whether to use context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
1309 [](common_params & params, bool value) {
1310 params.ctx_shift = value;
1311 }
1312 ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
1313 add_opt(common_arg(
1314 {"--chunks"}, "N",
1315 string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
1316 [](common_params & params, int value) {
1317 params.n_chunks = value;
1318 }
1319 ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
1320 add_opt(common_arg({ "-fa", "--flash-attn" }, "[on|off|auto]",
1321 string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')",
1322 llama_flash_attn_type_name(params.flash_attn_type)),
1323 [](common_params & params, const std::string & value) {
1324 if (is_truthy(value)) {
1325 params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
1326 } else if (is_falsey(value)) {
1327 params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
1328 } else if (is_autoy(value)) {
1329 params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
1330 } else {
1331 throw std::runtime_error(
1332 string_format("error: unknown value for --flash-attn: '%s'\n", value.c_str()));
1333 }
1334 }).set_env("LLAMA_ARG_FLASH_ATTN"));
1335 add_opt(common_arg(
1336 {"-p", "--prompt"}, "PROMPT",
1337 "prompt to start generation with; for system message, use -sys",
1338 [](common_params & params, const std::string & value) {
1339 params.prompt = value;
1340 }
1341 ).set_excludes({LLAMA_EXAMPLE_SERVER}));
1342 add_opt(common_arg(
1343 {"-sys", "--system-prompt"}, "PROMPT",
1344 "system prompt to use with model (if applicable, depending on chat template)",
1345 [](common_params & params, const std::string & value) {
1346 params.system_prompt = value;
1347 }
1348 ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_MTMD}));
1349 add_opt(common_arg(
1350 {"--perf"},
1351 {"--no-perf"},
1352 string_format("whether to enable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
1353 [](common_params & params, bool value) {
1354 params.no_perf = !value;
1355 params.sampling.no_perf = !value;
1356 }
1357 ).set_env("LLAMA_ARG_PERF"));
1358 add_opt(common_arg(
1359 {"--show-timings"},
1360 {"--no-show-timings"},
1361 string_format("whether to show timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
1362 [](common_params & params, bool value) {
1363 params.show_timings = value;
1364 }
1365 ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SHOW_TIMINGS"));
1366 add_opt(common_arg(
1367 {"-f", "--file"}, "FNAME",
1368 "a file containing the prompt (default: none)",
1369 [](common_params & params, const std::string & value) {
1370 params.prompt = read_file(value);
1371 // store the external file name in params
1372 params.prompt_file = value;
1373 if (!params.prompt.empty() && params.prompt.back() == '\n') {
1374 params.prompt.pop_back();
1375 }
1376 }
1377 ).set_excludes({LLAMA_EXAMPLE_SERVER}));
1378 add_opt(common_arg(
1379 {"-sysf", "--system-prompt-file"}, "FNAME",
1380 "a file containing the system prompt (default: none)",
1381 [](common_params & params, const std::string & value) {
1382 params.system_prompt = read_file(value);
1383 if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') {
1384 params.system_prompt.pop_back();
1385 }
1386 }
1387 ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
1388 add_opt(common_arg(
1389 {"--in-file"}, "FNAME",
1390 "an input file (use comma-separated values to specify multiple files)",
1391 [](common_params & params, const std::string & value) {
1392 for (const auto & item : parse_csv_row(value)) {
1393 std::ifstream file(item);
1394 if (!file) {
1395 throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
1396 }
1397 params.in_files.push_back(item);
1398 }
1399 }
1400 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
1401 add_opt(common_arg(
1402 {"-bf", "--binary-file"}, "FNAME",
1403 "binary file containing the prompt (default: none)",
1404 [](common_params & params, const std::string & value) {
1405 std::ifstream file(value, std::ios::binary);
1406 if (!file) {
1407 throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
1408 }
1409 // store the external file name in params
1410 params.prompt_file = value;
1411 std::ostringstream ss;
1412 ss << file.rdbuf();
1413 params.prompt = ss.str();
1414 fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str());
1415 }
1416 ).set_excludes({LLAMA_EXAMPLE_SERVER}));
1417 add_opt(common_arg(
1418 {"-e", "--escape"},
1419 {"--no-escape"},
1420 string_format("whether to process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
1421 [](common_params & params, bool value) {
1422 params.escape = value;
1423 }
1424 ));
1425 add_opt(common_arg(
1426 {"-ptc", "--print-token-count"}, "N",
1427 string_format("print token count every N tokens (default: %d)", params.n_print),
1428 [](common_params & params, int value) {
1429 params.n_print = value;
1430 }
1431 ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
1432 add_opt(common_arg(
1433 {"--prompt-cache"}, "FNAME",
1434 "file to cache prompt state for faster startup (default: none)",
1435 [](common_params & params, const std::string & value) {
1436 params.path_prompt_cache = value;
1437 }
1438 ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
1439 add_opt(common_arg(
1440 {"--prompt-cache-all"},
1441 "if specified, saves user input and generations to cache as well\n",
1442 [](common_params & params) {
1443 params.prompt_cache_all = true;
1444 }
1445 ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
1446 add_opt(common_arg(
1447 {"--prompt-cache-ro"},
1448 "if specified, uses the prompt cache but does not update it",
1449 [](common_params & params) {
1450 params.prompt_cache_ro = true;
1451 }
1452 ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
1453 add_opt(common_arg(
1454 {"-r", "--reverse-prompt"}, "PROMPT",
1455 "halt generation at PROMPT, return control in interactive mode\n",
1456 [](common_params & params, const std::string & value) {
1457 params.antiprompt.emplace_back(value);
1458 }
1459 ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
1460 add_opt(common_arg(
1461 {"-sp", "--special"},
1462 string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
1463 [](common_params & params) {
1464 params.special = true;
1465 }
1466 ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
1467 add_opt(common_arg(
1468 {"-cnv", "--conversation"},
1469 {"-no-cnv", "--no-conversation"},
1470 "whether to run in conversation mode:\n"
1471 "- does not print special tokens and suffix/prefix\n"
1472 "- interactive mode is also enabled\n"
1473 "(default: auto enabled if chat template is available)",
1474 [](common_params & params, bool value) {
1475 params.conversation_mode = value ? COMMON_CONVERSATION_MODE_ENABLED : COMMON_CONVERSATION_MODE_DISABLED;
1476 }
1477 ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
1478 add_opt(common_arg(
1479 {"-st", "--single-turn"},
1480 "run conversation for a single turn only, then exit when done\n"
1481 "will not be interactive if first turn is predefined with --prompt\n"
1482 "(default: false)",
1483 [](common_params & params) {
1484 params.single_turn = true;
1485 }
1486 ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
1487 add_opt(common_arg(
1488 {"-i", "--interactive"},
1489 string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
1490 [](common_params & params) {
1491 params.interactive = true;
1492 }
1493 ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
1494 add_opt(common_arg(
1495 {"-if", "--interactive-first"},
1496 string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
1497 [](common_params & params) {
1498 params.interactive_first = true;
1499 }
1500 ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
1501 add_opt(common_arg(
1502 {"-mli", "--multiline-input"},
1503 "allows you to write or paste multiple lines without ending each in '\\'",
1504 [](common_params & params) {
1505 params.multiline_input = true;
1506 }
1507 ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
1508 add_opt(common_arg(
1509 {"--in-prefix-bos"},
1510 "prefix BOS to user inputs, preceding the `--in-prefix` string",
1511 [](common_params & params) {
1512 params.input_prefix_bos = true;
1513 params.enable_chat_template = false;
1514 }
1515 ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
1516 add_opt(common_arg(
1517 {"--in-prefix"}, "STRING",
1518 "string to prefix user inputs with (default: empty)",
1519 [](common_params & params, const std::string & value) {
1520 params.input_prefix = value;
1521 params.enable_chat_template = false;
1522 }
1523 ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
1524 add_opt(common_arg(
1525 {"--in-suffix"}, "STRING",
1526 "string to suffix after user inputs with (default: empty)",
1527 [](common_params & params, const std::string & value) {
1528 params.input_suffix = value;
1529 params.enable_chat_template = false;
1530 }
1531 ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
1532 add_opt(common_arg(
1533 {"--warmup"},
1534 {"--no-warmup"},
1535 string_format("whether to perform warmup with an empty run (default: %s)", params.warmup ? "enabled" : "disabled"),
1536 [](common_params & params, bool value) {
1537 params.warmup = value;
1538 }
1539 ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_DEBUG}));
1540 add_opt(common_arg(
1541 {"--spm-infill"},
1542 string_format(
1543 "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)",
1544 params.spm_infill ? "enabled" : "disabled"
1545 ),
1546 [](common_params & params) {
1547 params.spm_infill = true;
1548 }
1549 ).set_examples({LLAMA_EXAMPLE_SERVER}));
1550 add_opt(common_arg(
1551 {"--samplers"}, "SAMPLERS",
1552 string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
1553 [](common_params & params, const std::string & value) {
1554 const auto sampler_names = string_split<std::string>(value, ';');
1555 params.sampling.samplers = common_sampler_types_from_names(sampler_names, true);
1556 params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS;
1557 }
1558 ).set_sparam());
1559 add_opt(common_arg(
1560 {"-s", "--seed"}, "SEED",
1561 string_format("RNG seed (default: %d, use random seed for %d)", params.sampling.seed, LLAMA_DEFAULT_SEED),
1562 [](common_params & params, const std::string & value) {
1563 params.sampling.seed = std::stoul(value);
1564 }
1565 ).set_sparam());
1566 add_opt(common_arg(
1567 {"--sampler-seq", "--sampling-seq"}, "SEQUENCE",
1568 string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
1569 [](common_params & params, const std::string & value) {
1570 params.sampling.samplers = common_sampler_types_from_chars(value);
1571 }
1572 ).set_sparam());
1573 add_opt(common_arg(
1574 {"--ignore-eos"},
1575 "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)",
1576 [](common_params & params) {
1577 params.sampling.ignore_eos = true;
1578 }
1579 ).set_sparam());
1580 add_opt(common_arg(
1581 {"--temp"}, "N",
1582 string_format("temperature (default: %.2f)", (double)params.sampling.temp),
1583 [](common_params & params, const std::string & value) {
1584 params.sampling.temp = std::stof(value);
1585 params.sampling.temp = std::max(params.sampling.temp, 0.0f);
1586 params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP;
1587 }
1588 ).set_sparam());
1589 add_opt(common_arg(
1590 {"--top-k"}, "N",
1591 string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k),
1592 [](common_params & params, int value) {
1593 params.sampling.top_k = value;
1594 params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K;
1595 }
1596 ).set_sparam().set_env("LLAMA_ARG_TOP_K"));
1597 add_opt(common_arg(
1598 {"--top-p"}, "N",
1599 string_format("top-p sampling (default: %.2f, 1.0 = disabled)", (double)params.sampling.top_p),
1600 [](common_params & params, const std::string & value) {
1601 params.sampling.top_p = std::stof(value);
1602 params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P;
1603 }
1604 ).set_sparam());
1605 add_opt(common_arg(
1606 {"--min-p"}, "N",
1607 string_format("min-p sampling (default: %.2f, 0.0 = disabled)", (double)params.sampling.min_p),
1608 [](common_params & params, const std::string & value) {
1609 params.sampling.min_p = std::stof(value);
1610 params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P;
1611 }
1612 ).set_sparam());
1613 add_opt(common_arg(
1614 {"--top-nsigma"}, "N",
1615 string_format("top-n-sigma sampling (default: %.2f, -1.0 = disabled)", params.sampling.top_n_sigma),
1616 [](common_params & params, const std::string & value) {
1617 params.sampling.top_n_sigma = std::stof(value);
1618 }
1619 ).set_sparam());
1620 add_opt(common_arg(
1621 {"--xtc-probability"}, "N",
1622 string_format("xtc probability (default: %.2f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
1623 [](common_params & params, const std::string & value) {
1624 params.sampling.xtc_probability = std::stof(value);
1625 params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY;
1626 }
1627 ).set_sparam());
1628 add_opt(common_arg(
1629 {"--xtc-threshold"}, "N",
1630 string_format("xtc threshold (default: %.2f, 1.0 = disabled)", (double)params.sampling.xtc_threshold),
1631 [](common_params & params, const std::string & value) {
1632 params.sampling.xtc_threshold = std::stof(value);
1633 params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD;
1634 }
1635 ).set_sparam());
1636 add_opt(common_arg(
1637 {"--typical"}, "N",
1638 string_format("locally typical sampling, parameter p (default: %.2f, 1.0 = disabled)", (double)params.sampling.typ_p),
1639 [](common_params & params, const std::string & value) {
1640 params.sampling.typ_p = std::stof(value);
1641 }
1642 ).set_sparam());
1643 add_opt(common_arg(
1644 {"--repeat-last-n"}, "N",
1645 string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n),
1646 [](common_params & params, int value) {
1647 if (value < -1) {
1648 throw std::runtime_error(string_format("error: invalid repeat-last-n = %d\n", value));
1649 }
1650 params.sampling.penalty_last_n = value;
1651 params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
1652 params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N;
1653 }
1654 ).set_sparam());
1655 add_opt(common_arg(
1656 {"--repeat-penalty"}, "N",
1657 string_format("penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)", (double)params.sampling.penalty_repeat),
1658 [](common_params & params, const std::string & value) {
1659 params.sampling.penalty_repeat = std::stof(value);
1660 params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT;
1661 }
1662 ).set_sparam());
1663 add_opt(common_arg(
1664 {"--presence-penalty"}, "N",
1665 string_format("repeat alpha presence penalty (default: %.2f, 0.0 = disabled)", (double)params.sampling.penalty_present),
1666 [](common_params & params, const std::string & value) {
1667 params.sampling.penalty_present = std::stof(value);
1668 }
1669 ).set_sparam());
1670 add_opt(common_arg(
1671 {"--frequency-penalty"}, "N",
1672 string_format("repeat alpha frequency penalty (default: %.2f, 0.0 = disabled)", (double)params.sampling.penalty_freq),
1673 [](common_params & params, const std::string & value) {
1674 params.sampling.penalty_freq = std::stof(value);
1675 }
1676 ).set_sparam());
1677 add_opt(common_arg(
1678 {"--dry-multiplier"}, "N",
1679 string_format("set DRY sampling multiplier (default: %.2f, 0.0 = disabled)", (double)params.sampling.dry_multiplier),
1680 [](common_params & params, const std::string & value) {
1681 params.sampling.dry_multiplier = std::stof(value);
1682 }
1683 ).set_sparam());
1684 add_opt(common_arg(
1685 {"--dry-base"}, "N",
1686 string_format("set DRY sampling base value (default: %.2f)", (double)params.sampling.dry_base),
1687 [](common_params & params, const std::string & value) {
1688 float potential_base = std::stof(value);
1689 if (potential_base >= 1.0f)
1690 {
1691 params.sampling.dry_base = potential_base;
1692 }
1693 }
1694 ).set_sparam());
1695 add_opt(common_arg(
1696 {"--dry-allowed-length"}, "N",
1697 string_format("set allowed length for DRY sampling (default: %d)", params.sampling.dry_allowed_length),
1698 [](common_params & params, int value) {
1699 params.sampling.dry_allowed_length = value;
1700 }
1701 ).set_sparam());
1702 add_opt(common_arg(
1703 {"--dry-penalty-last-n"}, "N",
1704 string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n),
1705 [](common_params & params, int value) {
1706 if (value < -1) {
1707 throw std::runtime_error(string_format("error: invalid dry-penalty-last-n = %d\n", value));
1708 }
1709 params.sampling.dry_penalty_last_n = value;
1710 }
1711 ).set_sparam());
1712 add_opt(common_arg(
1713 {"--dry-sequence-breaker"}, "STRING",
1714 string_format("add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers\n",
1715 params.sampling.dry_sequence_breakers.empty() ? "none" :
1716 std::accumulate(std::next(params.sampling.dry_sequence_breakers.begin()),
1717 params.sampling.dry_sequence_breakers.end(),
1718 std::string("'") + (params.sampling.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sampling.dry_sequence_breakers[0]) + "'",
1719 [](const std::string& a, const std::string& b) {
1720 std::string formatted_b = (b == "\n") ? "\\n" : b;
1721 return a + ", '" + formatted_b + "'";
1722 }).c_str()),
1723 [](common_params & params, const std::string & value) {
1724 static bool defaults_cleared = false;
1725
1726 if (!defaults_cleared) {
1727 params.sampling.dry_sequence_breakers.clear();
1728 defaults_cleared = true;
1729 }
1730
1731 if (value == "none") {
1732 params.sampling.dry_sequence_breakers.clear();
1733 } else {
1734 params.sampling.dry_sequence_breakers.emplace_back(value);
1735 }
1736 }
1737 ).set_sparam());
1738 add_opt(common_arg(
1739 {"--adaptive-target"}, "N",
1740 string_format("adaptive-p: select tokens near this probability (valid range 0.0 "
1741 "to 1.0; negative = disabled) (default: %.2f)\n"
1742 "[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927)",
1743 (double)params.sampling.adaptive_target),
1744 [](common_params & params, const std::string & value) {
1745 params.sampling.adaptive_target = std::stof(value);
1746 }
1747 ).set_sparam());
1748 add_opt(common_arg(
1749 {"--adaptive-decay"}, "N",
1750 string_format("adaptive-p: decay rate for target adaptation over time. lower values "
1751 "are more reactive, higher values are more stable.\n"
1752 "(valid range 0.0 to 0.99) (default: %.2f)",
1753 (double)params.sampling.adaptive_decay),
1754 [](common_params & params, const std::string & value) {
1755 params.sampling.adaptive_decay = std::stof(value);
1756 }
1757 ).set_sparam());
1758 add_opt(common_arg(
1759 {"--dynatemp-range"}, "N",
1760 string_format("dynamic temperature range (default: %.2f, 0.0 = disabled)", (double)params.sampling.dynatemp_range),
1761 [](common_params & params, const std::string & value) {
1762 params.sampling.dynatemp_range = std::stof(value);
1763 }
1764 ).set_sparam());
1765 add_opt(common_arg(
1766 {"--dynatemp-exp"}, "N",
1767 string_format("dynamic temperature exponent (default: %.2f)", (double)params.sampling.dynatemp_exponent),
1768 [](common_params & params, const std::string & value) {
1769 params.sampling.dynatemp_exponent = std::stof(value);
1770 }
1771 ).set_sparam());
1772 add_opt(common_arg(
1773 {"--mirostat"}, "N",
1774 string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n"
1775 "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat),
1776 [](common_params & params, int value) {
1777 params.sampling.mirostat = value;
1778 params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT;
1779 }
1780 ).set_sparam());
1781 add_opt(common_arg(
1782 {"--mirostat-lr"}, "N",
1783 string_format("Mirostat learning rate, parameter eta (default: %.2f)", (double)params.sampling.mirostat_eta),
1784 [](common_params & params, const std::string & value) {
1785 params.sampling.mirostat_eta = std::stof(value);
1786 params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA;
1787 }
1788 ).set_sparam());
1789 add_opt(common_arg(
1790 {"--mirostat-ent"}, "N",
1791 string_format("Mirostat target entropy, parameter tau (default: %.2f)", (double)params.sampling.mirostat_tau),
1792 [](common_params & params, const std::string & value) {
1793 params.sampling.mirostat_tau = std::stof(value);
1794 params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU;
1795 }
1796 ).set_sparam());
1797 add_opt(common_arg(
1798 {"-l", "--logit-bias"}, "TOKEN_ID(+/-)BIAS",
1799 "modifies the likelihood of token appearing in the completion,\n"
1800 "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
1801 "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'",
1802 [](common_params & params, const std::string & value) {
1803 std::stringstream ss(value);
1804 llama_token key;
1805 char sign;
1806 std::string value_str;
1807 try {
1808 if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
1809 const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
1810 params.sampling.logit_bias.push_back({key, bias});
1811 } else {
1812 throw std::invalid_argument("invalid input format");
1813 }
1814 } catch (const std::exception&) {
1815 throw std::invalid_argument("invalid input format");
1816 }
1817 }
1818 ).set_sparam());
1819 add_opt(common_arg(
1820 {"--grammar"}, "GRAMMAR",
1821 string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sampling.grammar.c_str()),
1822 [](common_params & params, const std::string & value) {
1823 params.sampling.grammar = value;
1824 }
1825 ).set_sparam());
1826 add_opt(common_arg(
1827 {"--grammar-file"}, "FNAME",
1828 "file to read grammar from",
1829 [](common_params & params, const std::string & value) {
1830 params.sampling.grammar = read_file(value);
1831 }
1832 ).set_sparam());
1833 add_opt(common_arg(
1834 {"-j", "--json-schema"}, "SCHEMA",
1835 "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
1836 [](common_params & params, const std::string & value) {
1837 params.sampling.grammar = json_schema_to_grammar(json::parse(value));
1838 }
1839 ).set_sparam());
1840 add_opt(common_arg(
1841 {"-jf", "--json-schema-file"}, "FILE",
1842 "File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
1843 [](common_params & params, const std::string & value) {
1844 std::ifstream file(value);
1845 if (!file) {
1846 throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
1847 }
1848 std::string schema;
1849 std::copy(
1850 std::istreambuf_iterator<char>(file),
1851 std::istreambuf_iterator<char>(),
1852 std::back_inserter(schema)
1853 );
1854 params.sampling.grammar = json_schema_to_grammar(json::parse(schema));
1855 }
1856 ).set_sparam());
1857 add_opt(common_arg(
1858 {"-bs", "--backend-sampling"},
1859 "enable backend sampling (experimental) (default: disabled)",
1860 [](common_params & params) {
1861 params.sampling.backend_sampling = true;
1862 }
1863 ).set_sparam().set_env("LLAMA_ARG_BACKEND_SAMPLING"));
1864 add_opt(common_arg(
1865 {"--pooling"}, "{none,mean,cls,last,rank}",
1866 "pooling type for embeddings, use model default if unspecified",
1867 [](common_params & params, const std::string & value) {
1868 /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
1869 else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
1870 else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
1871 else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
1872 else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; }
1873 else { throw std::invalid_argument("invalid value"); }
1874 }
1875 ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_POOLING"));
1876 add_opt(common_arg(
1877 {"--attention"}, "{causal,non-causal}",
1878 "attention type for embeddings, use model default if unspecified",
1879 [](common_params & params, const std::string & value) {
1880 /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
1881 else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
1882 else { throw std::invalid_argument("invalid value"); }
1883 }
1884 ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
1885 add_opt(common_arg(
1886 {"--rope-scaling"}, "{none,linear,yarn}",
1887 "RoPE frequency scaling method, defaults to linear unless specified by the model",
1888 [](common_params & params, const std::string & value) {
1889 /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
1890 else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
1891 else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
1892 else { throw std::invalid_argument("invalid value"); }
1893 }
1894 ).set_env("LLAMA_ARG_ROPE_SCALING_TYPE"));
1895 add_opt(common_arg(
1896 {"--rope-scale"}, "N",
1897 "RoPE context scaling factor, expands context by a factor of N",
1898 [](common_params & params, const std::string & value) {
1899 params.rope_freq_scale = 1.0f / std::stof(value);
1900 }
1901 ).set_env("LLAMA_ARG_ROPE_SCALE"));
1902 add_opt(common_arg(
1903 {"--rope-freq-base"}, "N",
1904 "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)",
1905 [](common_params & params, const std::string & value) {
1906 params.rope_freq_base = std::stof(value);
1907 }
1908 ).set_env("LLAMA_ARG_ROPE_FREQ_BASE"));
1909 add_opt(common_arg(
1910 {"--rope-freq-scale"}, "N",
1911 "RoPE frequency scaling factor, expands context by a factor of 1/N",
1912 [](common_params & params, const std::string & value) {
1913 params.rope_freq_scale = std::stof(value);
1914 }
1915 ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE"));
1916 add_opt(common_arg(
1917 {"--yarn-orig-ctx"}, "N",
1918 string_format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
1919 [](common_params & params, int value) {
1920 params.yarn_orig_ctx = value;
1921 }
1922 ).set_env("LLAMA_ARG_YARN_ORIG_CTX"));
1923 add_opt(common_arg(
1924 {"--yarn-ext-factor"}, "N",
1925 string_format("YaRN: extrapolation mix factor (default: %.2f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
1926 [](common_params & params, const std::string & value) {
1927 params.yarn_ext_factor = std::stof(value);
1928 }
1929 ).set_env("LLAMA_ARG_YARN_EXT_FACTOR"));
1930 add_opt(common_arg(
1931 {"--yarn-attn-factor"}, "N",
1932 string_format("YaRN: scale sqrt(t) or attention magnitude (default: %.2f)", (double)params.yarn_attn_factor),
1933 [](common_params & params, const std::string & value) {
1934 params.yarn_attn_factor = std::stof(value);
1935 }
1936 ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR"));
1937 add_opt(common_arg(
1938 {"--yarn-beta-slow"}, "N",
1939 string_format("YaRN: high correction dim or alpha (default: %.2f)", (double)params.yarn_beta_slow),
1940 [](common_params & params, const std::string & value) {
1941 params.yarn_beta_slow = std::stof(value);
1942 }
1943 ).set_env("LLAMA_ARG_YARN_BETA_SLOW"));
1944 add_opt(common_arg(
1945 {"--yarn-beta-fast"}, "N",
1946 string_format("YaRN: low correction dim or beta (default: %.2f)", (double)params.yarn_beta_fast),
1947 [](common_params & params, const std::string & value) {
1948 params.yarn_beta_fast = std::stof(value);
1949 }
1950 ).set_env("LLAMA_ARG_YARN_BETA_FAST"));
1951 add_opt(common_arg(
1952 {"-gan", "--grp-attn-n"}, "N",
1953 string_format("group-attention factor (default: %d)", params.grp_attn_n),
1954 [](common_params & params, int value) {
1955 params.grp_attn_n = value;
1956 }
1957 ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_PASSKEY}));
1958 add_opt(common_arg(
1959 {"-gaw", "--grp-attn-w"}, "N",
1960 string_format("group-attention width (default: %d)", params.grp_attn_w),
1961 [](common_params & params, int value) {
1962 params.grp_attn_w = value;
1963 }
1964 ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_COMPLETION}));
1965 add_opt(common_arg(
1966 {"-kvo", "--kv-offload"},
1967 {"-nkvo", "--no-kv-offload"},
1968 string_format("whether to enable KV cache offloading (default: %s)", params.no_kv_offload ? "disabled" : "enabled"),
1969 [](common_params & params, bool value) {
1970 params.no_kv_offload = !value;
1971 }
1972 ).set_env("LLAMA_ARG_KV_OFFLOAD"));
1973 add_opt(common_arg(
1974 {"--repack"},
1975 {"-nr", "--no-repack"},
1976 string_format("whether to enable weight repacking (default: %s)", params.no_extra_bufts ? "disabled" : "enabled"),
1977 [](common_params & params, bool value) {
1978 params.no_extra_bufts = !value;
1979 }
1980 ).set_env("LLAMA_ARG_REPACK"));
1981 add_opt(common_arg(
1982 {"--no-host"},
1983 "bypass host buffer allowing extra buffers to be used",
1984 [](common_params & params) {
1985 params.no_host = true;
1986 }
1987 ).set_env("LLAMA_ARG_NO_HOST"));
1988 add_opt(common_arg(
1989 {"-ctk", "--cache-type-k"}, "TYPE",
1990 string_format(
1991 "KV cache data type for K\n"
1992 "allowed values: %s\n"
1993 "(default: %s)",
1994 get_all_kv_cache_types().c_str(),
1995 ggml_type_name(params.cache_type_k)
1996 ),
1997 [](common_params & params, const std::string & value) {
1998 params.cache_type_k = kv_cache_type_from_str(value);
1999 }
2000 ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
2001 add_opt(common_arg(
2002 {"-ctv", "--cache-type-v"}, "TYPE",
2003 string_format(
2004 "KV cache data type for V\n"
2005 "allowed values: %s\n"
2006 "(default: %s)",
2007 get_all_kv_cache_types().c_str(),
2008 ggml_type_name(params.cache_type_v)
2009 ),
2010 [](common_params & params, const std::string & value) {
2011 params.cache_type_v = kv_cache_type_from_str(value);
2012 }
2013 ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
2014 add_opt(common_arg(
2015 {"--hellaswag"},
2016 "compute HellaSwag score over random tasks from datafile supplied with -f",
2017 [](common_params & params) {
2018 params.hellaswag = true;
2019 }
2020 ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
2021 add_opt(common_arg(
2022 {"--hellaswag-tasks"}, "N",
2023 string_format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks),
2024 [](common_params & params, int value) {
2025 params.hellaswag_tasks = value;
2026 }
2027 ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
2028 add_opt(common_arg(
2029 {"--winogrande"},
2030 "compute Winogrande score over random tasks from datafile supplied with -f",
2031 [](common_params & params) {
2032 params.winogrande = true;
2033 }
2034 ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
2035 add_opt(common_arg(
2036 {"--winogrande-tasks"}, "N",
2037 string_format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks),
2038 [](common_params & params, int value) {
2039 params.winogrande_tasks = value;
2040 }
2041 ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
2042 add_opt(common_arg(
2043 {"--multiple-choice"},
2044 "compute multiple choice score over random tasks from datafile supplied with -f",
2045 [](common_params & params) {
2046 params.multiple_choice = true;
2047 }
2048 ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
2049 add_opt(common_arg(
2050 {"--multiple-choice-tasks"}, "N",
2051 string_format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks),
2052 [](common_params & params, int value) {
2053 params.multiple_choice_tasks = value;
2054 }
2055 ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
2056 add_opt(common_arg(
2057 {"--kl-divergence"},
2058 "computes KL-divergence to logits provided via --kl-divergence-base",
2059 [](common_params & params) {
2060 params.kl_divergence = true;
2061 }
2062 ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
2063 add_opt(common_arg(
2064 {"--save-all-logits", "--kl-divergence-base"}, "FNAME",
2065 "set logits file",
2066 [](common_params & params, const std::string & value) {
2067 params.logits_file = value;
2068 }
2069 ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
2070 add_opt(common_arg(
2071 {"--ppl-stride"}, "N",
2072 string_format("stride for perplexity calculation (default: %d)", params.ppl_stride),
2073 [](common_params & params, int value) {
2074 params.ppl_stride = value;
2075 }
2076 ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
2077 add_opt(common_arg(
2078 {"--ppl-output-type"}, "<0|1>",
2079 string_format("output type for perplexity calculation (default: %d)", params.ppl_output_type),
2080 [](common_params & params, int value) {
2081 params.ppl_output_type = value;
2082 }
2083 ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
2084 add_opt(common_arg(
2085 {"-dt", "--defrag-thold"}, "N",
2086 string_format("KV cache defragmentation threshold (DEPRECATED)"),
2087 [](common_params & params, const std::string & value) {
2088 GGML_UNUSED(params);
2089 GGML_UNUSED(value);
2090 LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
2091 }
2092 ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
2093 if (ex == LLAMA_EXAMPLE_SERVER) {
2094 // this is to make sure this option appears in the server-specific section of the help message
2095 add_opt(common_arg(
2096 {"-np", "--parallel"}, "N",
2097 string_format("number of server slots (default: %d, -1 = auto)", params.n_parallel),
2098 [](common_params & params, int value) {
2099 if (value == 0) {
2100 throw std::invalid_argument("error: invalid value for n_parallel\n");
2101 }
2102 params.n_parallel = value;
2103 }
2104 ).set_env("LLAMA_ARG_N_PARALLEL").set_examples({LLAMA_EXAMPLE_SERVER}));
2105 } else {
2106 add_opt(common_arg(
2107 {"-np", "--parallel"}, "N",
2108 string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
2109 [](common_params & params, int value) {
2110 params.n_parallel = value;
2111 }
2112 ).set_env("LLAMA_ARG_N_PARALLEL"));
2113 }
2114 add_opt(common_arg(
2115 {"-ns", "--sequences"}, "N",
2116 string_format("number of sequences to decode (default: %d)", params.n_sequences),
2117 [](common_params & params, int value) {
2118 params.n_sequences = value;
2119 }
2120 ).set_examples({LLAMA_EXAMPLE_PARALLEL}));
2121 add_opt(common_arg(
2122 {"-cb", "--cont-batching"},
2123 {"-nocb", "--no-cont-batching"},
2124 string_format("whether to enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
2125 [](common_params & params, bool value) {
2126 params.cont_batching = value;
2127 }
2128 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
2129 add_opt(common_arg(
2130 {"-mm", "--mmproj"}, "FILE",
2131 "path to a multimodal projector file. see tools/mtmd/README.md\n"
2132 "note: if -hf is used, this argument can be omitted",
2133 [](common_params & params, const std::string & value) {
2134 params.mmproj.path = value;
2135 }
2136 ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
2137 add_opt(common_arg(
2138 {"-mmu", "--mmproj-url"}, "URL",
2139 "URL to a multimodal projector file. see tools/mtmd/README.md",
2140 [](common_params & params, const std::string & value) {
2141 params.mmproj.url = value;
2142 }
2143 ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
2144 add_opt(common_arg(
2145 {"--mmproj-auto"},
2146 {"--no-mmproj", "--no-mmproj-auto"},
2147 string_format("whether to use multimodal projector file (if available), useful when using -hf (default: %s)", params.no_mmproj ? "disabled" : "enabled"),
2148 [](common_params & params, bool value) {
2149 params.no_mmproj = !value;
2150 }
2151 ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO"));
2152 add_opt(common_arg(
2153 {"--mmproj-offload"},
2154 {"--no-mmproj-offload"},
2155 string_format("whether to enable GPU offloading for multimodal projector (default: %s)", params.mmproj_use_gpu ? "enabled" : "disabled"),
2156 [](common_params & params, bool value) {
2157 params.mmproj_use_gpu = value;
2158 }
2159 ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
2160 add_opt(common_arg(
2161 {"--image", "--audio"}, "FILE",
2162 "path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n",
2163 [](common_params & params, const std::string & value) {
2164 for (const auto & item : parse_csv_row(value)) {
2165 params.image.emplace_back(item);
2166 }
2167 }
2168 ).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI}));
2169 add_opt(common_arg(
2170 {"--image-min-tokens"}, "N",
2171 "minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
2172 [](common_params & params, int value) {
2173 params.image_min_tokens = value;
2174 }
2175 ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MIN_TOKENS"));
2176 add_opt(common_arg(
2177 {"--image-max-tokens"}, "N",
2178 "maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
2179 [](common_params & params, int value) {
2180 params.image_max_tokens = value;
2181 }
2182 ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS"));
2183 if (llama_supports_rpc()) {
2184 add_opt(common_arg(
2185 {"--rpc"}, "SERVERS",
2186 "comma separated list of RPC servers (host:port)",
2187 [](common_params & params, const std::string & value) {
2188 add_rpc_devices(value);
2189 GGML_UNUSED(params);
2190 }
2191 ).set_env("LLAMA_ARG_RPC"));
2192 }
2193 add_opt(common_arg(
2194 {"--mlock"},
2195 "force system to keep model in RAM rather than swapping or compressing",
2196 [](common_params & params) {
2197 params.use_mlock = true;
2198 }
2199 ).set_env("LLAMA_ARG_MLOCK"));
2200 add_opt(common_arg(
2201 {"--mmap"},
2202 {"--no-mmap"},
2203 string_format("whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
2204 [](common_params & params, bool value) {
2205 params.use_mmap = value;
2206 }
2207 ).set_env("LLAMA_ARG_MMAP"));
2208 add_opt(common_arg(
2209 {"-dio", "--direct-io"},
2210 {"-ndio", "--no-direct-io"},
2211 string_format("use DirectIO if available. (default: %s)", params.use_direct_io ? "enabled" : "disabled"),
2212 [](common_params & params, bool value) {
2213 params.use_direct_io = value;
2214 }
2215 ).set_env("LLAMA_ARG_DIO"));
2216 add_opt(common_arg(
2217 {"--numa"}, "TYPE",
2218 "attempt optimizations that help on some NUMA systems\n"
2219 "- distribute: spread execution evenly over all nodes\n"
2220 "- isolate: only spawn threads on CPUs on the node that execution started on\n"
2221 "- numactl: use the CPU map provided by numactl\n"
2222 "if run without this previously, it is recommended to drop the system page cache before using this\n"
2223 "see https://github.com/ggml-org/llama.cpp/issues/1437",
2224 [](common_params & params, const std::string & value) {
2225 /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
2226 else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
2227 else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
2228 else { throw std::invalid_argument("invalid value"); }
2229 }
2230 ).set_env("LLAMA_ARG_NUMA"));
2231 add_opt(common_arg(
2232 {"-dev", "--device"}, "<dev1,dev2,..>",
2233 "comma-separated list of devices to use for offloading (none = don't offload)\n"
2234 "use --list-devices to see a list of available devices",
2235 [](common_params & params, const std::string & value) {
2236 params.devices = parse_device_list(value);
2237 }
2238 ).set_env("LLAMA_ARG_DEVICE"));
2239 add_opt(common_arg(
2240 {"--list-devices"},
2241 "print list of available devices and exit",
2242 [](common_params &) {
2243 std::vector<ggml_backend_dev_t> devices;
2244 for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
2245 auto * dev = ggml_backend_dev_get(i);
2246 if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
2247 devices.push_back(dev);
2248 }
2249 }
2250 printf("Available devices:\n");
2251 for (auto * dev : devices) {
2252 size_t free, total;
2253 ggml_backend_dev_memory(dev, &free, &total);
2254 printf(" %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
2255 }
2256 exit(0);
2257 }
2258 ));
2259 add_opt(common_arg(
2260 {"-ot", "--override-tensor"}, "<tensor name pattern>=<buffer type>,...",
2261 "override tensor buffer type", [](common_params & params, const std::string & value) {
2262 parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
2263 }
2264 ).set_env("LLAMA_ARG_OVERRIDE_TENSOR"));
2265 add_opt(common_arg(
2266 {"-otd", "--override-tensor-draft"}, "<tensor name pattern>=<buffer type>,...",
2267 "override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
2268 parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
2269 }
2270 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
2271 add_opt(common_arg(
2272 {"-cmoe", "--cpu-moe"},
2273 "keep all Mixture of Experts (MoE) weights in the CPU",
2274 [](common_params & params) {
2275 params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
2276 }
2277 ).set_env("LLAMA_ARG_CPU_MOE"));
2278 add_opt(common_arg(
2279 {"-ncmoe", "--n-cpu-moe"}, "N",
2280 "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
2281 [](common_params & params, int value) {
2282 if (value < 0) {
2283 throw std::invalid_argument("invalid value");
2284 }
2285 for (int i = 0; i < value; ++i) {
2286 // keep strings alive and avoid leaking memory by storing them in a static vector
2287 static std::list<std::string> buft_overrides;
2288 buft_overrides.push_back(llm_ffn_exps_block_regex(i));
2289 params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
2290 }
2291 }
2292 ).set_env("LLAMA_ARG_N_CPU_MOE"));
2293 add_opt(common_arg(
2294 {"-cmoed", "--cpu-moe-draft"},
2295 "keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
2296 [](common_params & params) {
2297 params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
2298 }
2299 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
2300 add_opt(common_arg(
2301 {"-ncmoed", "--n-cpu-moe-draft"}, "N",
2302 "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
2303 [](common_params & params, int value) {
2304 if (value < 0) {
2305 throw std::invalid_argument("invalid value");
2306 }
2307 for (int i = 0; i < value; ++i) {
2308 static std::list<std::string> buft_overrides_draft;
2309 buft_overrides_draft.push_back(llm_ffn_exps_block_regex(i));
2310 params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
2311 }
2312 }
2313 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
2314 GGML_ASSERT(params.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0
2315 add_opt(common_arg(
2316 {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
2317 string_format("max. number of layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)", params.n_gpu_layers == -1 ? "auto" : "all"),
2318 [](common_params & params, const std::string & value) {
2319 if (value == "auto") {
2320 params.n_gpu_layers = -1;
2321 } else if (value == "all") {
2322 params.n_gpu_layers = -2;
2323 } else {
2324 params.n_gpu_layers = std::stoi(value);
2325 }
2326 if (!llama_supports_gpu_offload()) {
2327 fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
2328 fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
2329 fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
2330 }
2331 }
2332 ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
2333 add_opt(common_arg(
2334 {"-sm", "--split-mode"}, "{none,layer,row}",
2335 "how to split the model across multiple GPUs, one of:\n"
2336 "- none: use one GPU only\n"
2337 "- layer (default): split layers and KV across GPUs\n"
2338 "- row: split rows across GPUs",
2339 [](common_params & params, const std::string & value) {
2340 std::string arg_next = value;
2341 if (arg_next == "none") {
2342 params.split_mode = LLAMA_SPLIT_MODE_NONE;
2343 } else if (arg_next == "layer") {
2344 params.split_mode = LLAMA_SPLIT_MODE_LAYER;
2345 } else if (arg_next == "row") {
2346 params.split_mode = LLAMA_SPLIT_MODE_ROW;
2347 } else {
2348 throw std::invalid_argument("invalid value");
2349 }
2350 if (!llama_supports_gpu_offload()) {
2351 fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the split mode has no effect.\n");
2352 }
2353 }
2354 ).set_env("LLAMA_ARG_SPLIT_MODE"));
2355 add_opt(common_arg(
2356 {"-ts", "--tensor-split"}, "N0,N1,N2,...",
2357 "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1",
2358 [](common_params & params, const std::string & value) {
2359 std::string arg_next = value;
2360
2361 // split string by , and /
2362 const std::regex regex{ R"([,/]+)" };
2363 std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
2364 std::vector<std::string> split_arg{ it, {} };
2365 if (split_arg.size() >= llama_max_devices()) {
2366 throw std::invalid_argument(
2367 string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
2368 );
2369 }
2370 for (size_t i = 0; i < llama_max_devices(); ++i) {
2371 if (i < split_arg.size()) {
2372 params.tensor_split[i] = std::stof(split_arg[i]);
2373 } else {
2374 params.tensor_split[i] = 0.0f;
2375 }
2376 }
2377 if (!llama_supports_gpu_offload()) {
2378 fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting a tensor split has no effect.\n");
2379 }
2380 }
2381 ).set_env("LLAMA_ARG_TENSOR_SPLIT"));
2382 add_opt(common_arg(
2383 {"-mg", "--main-gpu"}, "INDEX",
2384 string_format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
2385 [](common_params & params, int value) {
2386 params.main_gpu = value;
2387 if (!llama_supports_gpu_offload()) {
2388 fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n");
2389 }
2390 }
2391 ).set_env("LLAMA_ARG_MAIN_GPU"));
2392 add_opt(common_arg(
2393 { "-fit", "--fit" }, "[on|off]",
2394 string_format("whether to adjust unset arguments to fit in device memory ('on' or 'off', default: '%s')", params.fit_params ? "on" : "off"),
2395 [](common_params & params, const std::string & value) {
2396 if (is_truthy(value)) {
2397 params.fit_params = true;
2398 } else if (is_falsey(value)) {
2399 params.fit_params = false;
2400 } else {
2401 throw std::runtime_error(
2402 string_format("error: unkown value for --fit: '%s'\n", value.c_str()));
2403 }
2404 }
2405 ).set_env("LLAMA_ARG_FIT"));
2406 add_opt(common_arg(
2407 { "-fitt", "--fit-target" }, "MiB0,MiB1,MiB2,...",
2408 string_format("target margin per device for --fit, comma-separated list of values, "
2409 "single value is broadcast across all devices, default: %zu", params.fit_params_target[0]/(1024*1024)),
2410 [](common_params & params, const std::string & value) {
2411 std::string arg_next = value;
2412
2413 // split string by , and /
2414 const std::regex regex{ R"([,/]+)" };
2415 std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
2416 std::vector<std::string> split_arg{ it, {} };
2417 if (split_arg.size() >= llama_max_devices()) {
2418 throw std::invalid_argument(
2419 string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
2420 );
2421 }
2422 if (split_arg.size() == 1) {
2423 std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoul(split_arg[0]) * 1024*1024);
2424 return;
2425 }
2426 for (size_t i = 0; i < split_arg.size(); i++) {
2427 params.fit_params_target[i] = std::stoul(split_arg[i]) * 1024*1024;
2428 }
2429 }
2430 ).set_env("LLAMA_ARG_FIT_TARGET"));
2431 add_opt(common_arg(
2432 { "-fitc", "--fit-ctx" }, "N",
2433 string_format("minimum ctx size that can be set by --fit option, default: %" PRIu32, params.fit_params_min_ctx),
2434 [](common_params & params, int value) {
2435 params.fit_params_min_ctx = value;
2436 }
2437 ).set_env("LLAMA_ARG_FIT_CTX"));
2438 add_opt(common_arg(
2439 {"--check-tensors"},
2440 string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
2441 [](common_params & params) {
2442 params.check_tensors = true;
2443 }
2444 ));
2445 add_opt(common_arg(
2446 {"--override-kv"}, "KEY=TYPE:VALUE,...",
2447 "advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated values.\n"
2448 "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false",
2449 [](common_params & params, const std::string & value) {
2450 for (const auto & item : parse_csv_row(value)) {
2451 if (!string_parse_kv_override(item.c_str(), params.kv_overrides)) {
2452 throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", item.c_str()));
2453 }
2454 }
2455 }
2456 ));
2457 add_opt(common_arg(
2458 {"--op-offload"},
2459 {"--no-op-offload"},
2460 string_format("whether to offload host tensor operations to device (default: %s)", params.no_op_offload ? "false" : "true"),
2461 [](common_params & params, bool value) {
2462 params.no_op_offload = !value;
2463 }
2464 ));
2465 add_opt(common_arg(
2466 {"--lora"}, "FNAME",
2467 "path to LoRA adapter (use comma-separated values to load multiple adapters)",
2468 [](common_params & params, const std::string & value) {
2469 for (const auto & item : parse_csv_row(value)) {
2470 params.lora_adapters.push_back({ item, 1.0, "", "", nullptr });
2471 }
2472 }
2473 // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
2474 ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
2475 add_opt(common_arg(
2476 {"--lora-scaled"}, "FNAME:SCALE,...",
2477 "path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)\n"
2478 "note: use comma-separated values",
2479 [](common_params & params, const std::string & value) {
2480 for (const auto & item : parse_csv_row(value)) {
2481 auto parts = string_split<std::string>(item, ':');
2482 if (parts.size() != 2) {
2483 throw std::invalid_argument("lora-scaled format: FNAME:SCALE");
2484 }
2485 params.lora_adapters.push_back({ parts[0], std::stof(parts[1]), "", "", nullptr });
2486 }
2487 }
2488 // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
2489 ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
2490 add_opt(common_arg(
2491 {"--control-vector"}, "FNAME",
2492 "add a control vector\nnote: use comma-separated values to add multiple control vectors",
2493 [](common_params & params, const std::string & value) {
2494 for (const auto & item : parse_csv_row(value)) {
2495 params.control_vectors.push_back({ 1.0f, item, });
2496 }
2497 }
2498 ));
2499 add_opt(common_arg(
2500 {"--control-vector-scaled"}, "FNAME:SCALE,...",
2501 "add a control vector with user defined scaling SCALE\n"
2502 "note: use comma-separated values (format: FNAME:SCALE,...)",
2503 [](common_params & params, const std::string & value) {
2504 for (const auto & item : parse_csv_row(value)) {
2505 auto parts = string_split<std::string>(item, ':');
2506 if (parts.size() != 2) {
2507 throw std::invalid_argument("control-vector-scaled format: FNAME:SCALE");
2508 }
2509 params.control_vectors.push_back({ std::stof(parts[1]), parts[0] });
2510 }
2511 }
2512 ));
2513 add_opt(common_arg(
2514 {"--control-vector-layer-range"}, "START", "END",
2515 "layer range to apply the control vector(s) to, start and end inclusive",
2516 [](common_params & params, const std::string & start, const std::string & end) {
2517 params.control_vector_layer_start = std::stoi(start);
2518 params.control_vector_layer_end = std::stoi(end);
2519 }
2520 ));
2521 add_opt(common_arg(
2522 {"-a", "--alias"}, "STRING",
2523 "set alias for model name (to be used by REST API)",
2524 [](common_params & params, const std::string & value) {
2525 params.model_alias = value;
2526 }
2527 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS"));
2528 add_opt(common_arg(
2529 {"-m", "--model"}, "FNAME",
2530 ex == LLAMA_EXAMPLE_EXPORT_LORA
2531 ? "model path from which to load base model"
2532 : "model path to load",
2533 [](common_params & params, const std::string & value) {
2534 params.model.path = value;
2535 }
2536 ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
2537 add_opt(common_arg(
2538 {"-mu", "--model-url"}, "MODEL_URL",
2539 "model download url (default: unused)",
2540 [](common_params & params, const std::string & value) {
2541 params.model.url = value;
2542 }
2543 ).set_env("LLAMA_ARG_MODEL_URL"));
2544 add_opt(common_arg(
2545 { "-dr", "--docker-repo" }, "[<repo>/]<model>[:quant]",
2546 "Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.\n"
2547 "example: gemma3\n"
2548 "(default: unused)",
2549 [](common_params & params, const std::string & value) {
2550 params.model.docker_repo = value;
2551 }
2552 ).set_env("LLAMA_ARG_DOCKER_REPO"));
2553 add_opt(common_arg(
2554 {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
2555 "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
2556 "mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n"
2557 "example: unsloth/phi-4-GGUF:q4_k_m\n"
2558 "(default: unused)",
2559 [](common_params & params, const std::string & value) {
2560 params.model.hf_repo = value;
2561 }
2562 ).set_env("LLAMA_ARG_HF_REPO"));
2563 add_opt(common_arg(
2564 {"-hfd", "-hfrd", "--hf-repo-draft"}, "<user>/<model>[:quant]",
2565 "Same as --hf-repo, but for the draft model (default: unused)",
2566 [](common_params & params, const std::string & value) {
2567 params.speculative.mparams_dft.hf_repo = value;
2568 }
2569 ).set_env("LLAMA_ARG_HFD_REPO"));
2570 add_opt(common_arg(
2571 {"-hff", "--hf-file"}, "FILE",
2572 "Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
2573 [](common_params & params, const std::string & value) {
2574 params.model.hf_file = value;
2575 }
2576 ).set_env("LLAMA_ARG_HF_FILE"));
2577 add_opt(common_arg(
2578 {"-hfv", "-hfrv", "--hf-repo-v"}, "<user>/<model>[:quant]",
2579 "Hugging Face model repository for the vocoder model (default: unused)",
2580 [](common_params & params, const std::string & value) {
2581 params.vocoder.model.hf_repo = value;
2582 }
2583 ).set_env("LLAMA_ARG_HF_REPO_V"));
2584 add_opt(common_arg(
2585 {"-hffv", "--hf-file-v"}, "FILE",
2586 "Hugging Face model file for the vocoder model (default: unused)",
2587 [](common_params & params, const std::string & value) {
2588 params.vocoder.model.hf_file = value;
2589 }
2590 ).set_env("LLAMA_ARG_HF_FILE_V"));
2591 add_opt(common_arg(
2592 {"-hft", "--hf-token"}, "TOKEN",
2593 "Hugging Face access token (default: value from HF_TOKEN environment variable)",
2594 [](common_params & params, const std::string & value) {
2595 params.hf_token = value;
2596 }
2597 ).set_env("HF_TOKEN"));
2598 add_opt(common_arg(
2599 {"--context-file"}, "FNAME",
2600 "file to load context from (use comma-separated values to specify multiple files)",
2601 [](common_params & params, const std::string & value) {
2602 for (const auto & item : parse_csv_row(value)) {
2603 std::ifstream file(item, std::ios::binary);
2604 if (!file) {
2605 throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
2606 }
2607 params.context_files.push_back(item);
2608 }
2609 }
2610 ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
2611 add_opt(common_arg(
2612 {"--chunk-size"}, "N",
2613 string_format("minimum length of embedded text chunks (default: %d)", params.chunk_size),
2614 [](common_params & params, int value) {
2615 params.chunk_size = value;
2616 }
2617 ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
2618 add_opt(common_arg(
2619 {"--chunk-separator"}, "STRING",
2620 string_format("separator between chunks (default: '%s')", params.chunk_separator.c_str()),
2621 [](common_params & params, const std::string & value) {
2622 params.chunk_separator = value;
2623 }
2624 ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
2625 add_opt(common_arg(
2626 {"--junk"}, "N",
2627 string_format("number of times to repeat the junk text (default: %d)", params.n_junk),
2628 [](common_params & params, int value) {
2629 params.n_junk = value;
2630 }
2631 ).set_examples({LLAMA_EXAMPLE_PASSKEY, LLAMA_EXAMPLE_PARALLEL}));
2632 add_opt(common_arg(
2633 {"--pos"}, "N",
2634 string_format("position of the passkey in the junk text (default: %d)", params.i_pos),
2635 [](common_params & params, int value) {
2636 params.i_pos = value;
2637 }
2638 ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
2639 add_opt(common_arg(
2640 {"-o", "--output", "--output-file"}, "FNAME",
2641 string_format("output file (default: '%s')", params.out_file.c_str()),
2642 [](common_params & params, const std::string & value) {
2643 params.out_file = value;
2644 }
2645 ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
2646 add_opt(common_arg(
2647 {"-ofreq", "--output-frequency"}, "N",
2648 string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
2649 [](common_params & params, int value) {
2650 params.n_out_freq = value;
2651 }
2652 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2653 add_opt(common_arg(
2654 {"--output-format"}, "{gguf,dat}",
2655 string_format("output format for imatrix file (default: %s)", params.imat_dat > 0 ? "dat" : "gguf"),
2656 [](common_params & params, const std::string & value) {
2657 /**/ if (value == "gguf") { params.imat_dat = -1; }
2658 else if (value == "dat") { params.imat_dat = 1; }
2659 else { throw std::invalid_argument("invalid output format"); }
2660 }
2661 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2662 add_opt(common_arg(
2663 {"--save-frequency"}, "N",
2664 string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
2665 [](common_params & params, int value) {
2666 params.n_save_freq = value;
2667 }
2668 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2669 add_opt(common_arg(
2670 {"--process-output"},
2671 string_format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"),
2672 [](common_params & params) {
2673 params.process_output = true;
2674 }
2675 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2676 add_opt(common_arg(
2677 {"--ppl"},
2678 {"--no-ppl"},
2679 string_format("whether to compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
2680 [](common_params & params, bool value) {
2681 params.compute_ppl = value;
2682 }
2683 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2684 add_opt(common_arg(
2685 {"--chunk", "--from-chunk"}, "N",
2686 string_format("start processing the input from chunk N (default: %d)", params.i_chunk),
2687 [](common_params & params, int value) {
2688 params.i_chunk = value;
2689 }
2690 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2691 add_opt(common_arg(
2692 {"--show-statistics"},
2693 string_format("show imatrix statistics and then exit (default: %s)", params.show_statistics ? "true" : "false"),
2694 [](common_params & params) {
2695 params.show_statistics = true;
2696 }
2697 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2698 add_opt(common_arg(
2699 {"--parse-special"},
2700 string_format("parse special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
2701 [](common_params & params) {
2702 params.parse_special = true;
2703 }
2704 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2705 add_opt(common_arg(
2706 {"-pps"},
2707 string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
2708 [](common_params & params) {
2709 params.is_pp_shared = true;
2710 }
2711 ).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
2712 add_opt(common_arg(
2713 {"-tgs"},
2714 string_format("is the text generation separated across the different sequences (default: %s)", params.is_tg_separate ? "true" : "false"),
2715 [](common_params & params) {
2716 params.is_tg_separate = true;
2717 }
2718 ).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
2719 add_opt(common_arg(
2720 {"-npp"}, "n0,n1,...",
2721 "number of prompt tokens",
2722 [](common_params & params, const std::string & value) {
2723 auto p = string_split<int>(value, ',');
2724 params.n_pp.insert(params.n_pp.end(), p.begin(), p.end());
2725 }
2726 ).set_examples({LLAMA_EXAMPLE_BENCH}));
2727 add_opt(common_arg(
2728 {"-ntg"}, "n0,n1,...",
2729 "number of text generation tokens",
2730 [](common_params & params, const std::string & value) {
2731 auto p = string_split<int>(value, ',');
2732 params.n_tg.insert(params.n_tg.end(), p.begin(), p.end());
2733 }
2734 ).set_examples({LLAMA_EXAMPLE_BENCH}));
2735 add_opt(common_arg(
2736 {"-npl"}, "n0,n1,...",
2737 "number of parallel prompts",
2738 [](common_params & params, const std::string & value) {
2739 auto p = string_split<int>(value, ',');
2740 params.n_pl.insert(params.n_pl.end(), p.begin(), p.end());
2741 }
2742 ).set_examples({LLAMA_EXAMPLE_BENCH}));
2743 add_opt(common_arg(
2744 {"--embd-normalize"}, "N",
2745 string_format("normalisation for embeddings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
2746 [](common_params & params, int value) {
2747 params.embd_normalize = value;
2748 }
2749 ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_DEBUG}));
2750 add_opt(common_arg(
2751 {"--embd-output-format"}, "FORMAT",
2752 "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
2753 [](common_params & params, const std::string & value) {
2754 params.embd_out = value;
2755 }
2756 ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
2757 add_opt(common_arg(
2758 {"--embd-separator"}, "STRING",
2759 "separator of embeddings (default \\n) for example \"<#sep#>\"",
2760 [](common_params & params, const std::string & value) {
2761 params.embd_sep = value;
2762 }
2763 ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
2764 add_opt(common_arg(
2765 {"--cls-separator"}, "STRING",
2766 "separator of classification sequences (default \\t) for example \"<#seq#>\"",
2767 [](common_params & params, const std::string & value) {
2768 params.cls_sep = value;
2769 }
2770 ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
2771 add_opt(common_arg(
2772 {"--host"}, "HOST",
2773 string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
2774 [](common_params & params, const std::string & value) {
2775 params.hostname = value;
2776 }
2777 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST"));
2778 add_opt(common_arg(
2779 {"--port"}, "PORT",
2780 string_format("port to listen (default: %d)", params.port),
2781 [](common_params & params, int value) {
2782 params.port = value;
2783 }
2784 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
2785 add_opt(common_arg(
2786 {"--path"}, "PATH",
2787 string_format("path to serve static files from (default: %s)", params.public_path.c_str()),
2788 [](common_params & params, const std::string & value) {
2789 params.public_path = value;
2790 }
2791 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
2792 add_opt(common_arg(
2793 {"--api-prefix"}, "PREFIX",
2794 string_format("prefix path the server serves from, without the trailing slash (default: %s)", params.api_prefix.c_str()),
2795 [](common_params & params, const std::string & value) {
2796 params.api_prefix = value;
2797 }
2798 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
2799 add_opt(common_arg(
2800 {"--webui-config"}, "JSON",
2801 "JSON that provides default WebUI settings (overrides WebUI defaults)",
2802 [](common_params & params, const std::string & value) {
2803 params.webui_config_json = value;
2804 }
2805 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG"));
2806 add_opt(common_arg(
2807 {"--webui-config-file"}, "PATH",
2808 "JSON file that provides default WebUI settings (overrides WebUI defaults)",
2809 [](common_params & params, const std::string & value) {
2810 params.webui_config_json = read_file(value);
2811 }
2812 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
2813 add_opt(common_arg(
2814 {"--webui"},
2815 {"--no-webui"},
2816 string_format("whether to enable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
2817 [](common_params & params, bool value) {
2818 params.webui = value;
2819 }
2820 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI"));
2821 add_opt(common_arg(
2822 {"--embedding", "--embeddings"},
2823 string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
2824 [](common_params & params) {
2825 params.embedding = true;
2826 }
2827 ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_EMBEDDINGS"));
2828 add_opt(common_arg(
2829 {"--rerank", "--reranking"},
2830 string_format("enable reranking endpoint on server (default: %s)", "disabled"),
2831 [](common_params & params) {
2832 params.embedding = true;
2833 params.pooling_type = LLAMA_POOLING_TYPE_RANK;
2834 }
2835 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
2836 add_opt(common_arg(
2837 {"--api-key"}, "KEY",
2838 "API key to use for authentication, multiple keys can be provided as a comma-separated list (default: none)",
2839 [](common_params & params, const std::string & value) {
2840 for (const auto & key : parse_csv_row(value)) {
2841 if (!key.empty()) {
2842 params.api_keys.push_back(key);
2843 }
2844 }
2845 }
2846 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
2847 add_opt(common_arg(
2848 {"--api-key-file"}, "FNAME",
2849 "path to file containing API keys (default: none)",
2850 [](common_params & params, const std::string & value) {
2851 std::ifstream key_file(value);
2852 if (!key_file) {
2853 throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
2854 }
2855 std::string key;
2856 while (std::getline(key_file, key)) {
2857 if (!key.empty()) {
2858 params.api_keys.push_back(key);
2859 }
2860 }
2861 key_file.close();
2862 }
2863 ).set_examples({LLAMA_EXAMPLE_SERVER}));
2864 add_opt(common_arg(
2865 {"--ssl-key-file"}, "FNAME",
2866 "path to file a PEM-encoded SSL private key",
2867 [](common_params & params, const std::string & value) {
2868 params.ssl_file_key = value;
2869 }
2870 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_KEY_FILE"));
2871 add_opt(common_arg(
2872 {"--ssl-cert-file"}, "FNAME",
2873 "path to file a PEM-encoded SSL certificate",
2874 [](common_params & params, const std::string & value) {
2875 params.ssl_file_cert = value;
2876 }
2877 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
2878 add_opt(common_arg(
2879 {"--chat-template-kwargs"}, "STRING",
2880 "sets additional params for the json template parser, must be a valid json object string, e.g. '{\"key1\":\"value1\",\"key2\":\"value2\"}'",
2881 [](common_params & params, const std::string & value) {
2882 auto parsed = json::parse(value);
2883 for (const auto & item : parsed.items()) {
2884 params.default_template_kwargs[item.key()] = item.value().dump();
2885 }
2886 }
2887 ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
2888 add_opt(common_arg(
2889 {"-to", "--timeout"}, "N",
2890 string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
2891 [](common_params & params, int value) {
2892 params.timeout_read = value;
2893 params.timeout_write = value;
2894 }
2895 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT"));
2896 add_opt(common_arg(
2897 {"--threads-http"}, "N",
2898 string_format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
2899 [](common_params & params, int value) {
2900 params.n_threads_http = value;
2901 }
2902 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
2903 add_opt(common_arg(
2904 {"--cache-prompt"},
2905 {"--no-cache-prompt"},
2906 string_format("whether to enable prompt caching (default: %s)", params.cache_prompt ? "enabled" : "disabled"),
2907 [](common_params & params, bool value) {
2908 params.cache_prompt = value;
2909 }
2910 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_PROMPT"));
2911 add_opt(common_arg(
2912 {"--cache-reuse"}, "N",
2913 string_format(
2914 "min chunk size to attempt reusing from the cache via KV shifting, requires prompt caching to be enabled (default: %d)\n"
2915 "[(card)](https://ggml.ai/f0.png)", params.n_cache_reuse
2916 ),
2917 [](common_params & params, int value) {
2918 params.n_cache_reuse = value;
2919 }
2920 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE"));
2921 add_opt(common_arg(
2922 {"--metrics"},
2923 string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
2924 [](common_params & params) {
2925 params.endpoint_metrics = true;
2926 }
2927 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
2928 add_opt(common_arg(
2929 {"--props"},
2930 string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
2931 [](common_params & params) {
2932 params.endpoint_props = true;
2933 }
2934 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
2935 add_opt(common_arg(
2936 {"--slots"},
2937 {"--no-slots"},
2938 string_format("expose slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
2939 [](common_params & params, bool value) {
2940 params.endpoint_slots = value;
2941 }
2942 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
2943 add_opt(common_arg(
2944 {"--slot-save-path"}, "PATH",
2945 "path to save slot kv cache (default: disabled)",
2946 [](common_params & params, const std::string & value) {
2947 params.slot_save_path = value;
2948 if (!fs_is_directory(params.slot_save_path)) {
2949 throw std::invalid_argument("not a directory: " + value);
2950 }
2951 // if doesn't end with DIRECTORY_SEPARATOR, add it
2952 if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
2953 params.slot_save_path += DIRECTORY_SEPARATOR;
2954 }
2955 }
2956 ).set_examples({LLAMA_EXAMPLE_SERVER}));
2957 add_opt(common_arg(
2958 {"--media-path"}, "PATH",
2959 "directory for loading local media files; files can be accessed via file:// URLs using relative paths (default: disabled)",
2960 [](common_params & params, const std::string & value) {
2961 params.media_path = value;
2962 if (!fs_is_directory(params.media_path)) {
2963 throw std::invalid_argument("not a directory: " + value);
2964 }
2965 // if doesn't end with DIRECTORY_SEPARATOR, add it
2966 if (!params.media_path.empty() && params.media_path[params.media_path.size() - 1] != DIRECTORY_SEPARATOR) {
2967 params.media_path += DIRECTORY_SEPARATOR;
2968 }
2969 }
2970 ).set_examples({LLAMA_EXAMPLE_SERVER}));
2971 add_opt(common_arg(
2972 {"--models-dir"}, "PATH",
2973 "directory containing models for the router server (default: disabled)",
2974 [](common_params & params, const std::string & value) {
2975 params.models_dir = value;
2976 }
2977 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR"));
2978 add_opt(common_arg(
2979 {"--models-preset"}, "PATH",
2980 "path to INI file containing model presets for the router server (default: disabled)",
2981 [](common_params & params, const std::string & value) {
2982 params.models_preset = value;
2983 }
2984 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_PRESET"));
2985 add_opt(common_arg(
2986 {"--models-max"}, "N",
2987 string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.models_max),
2988 [](common_params & params, int value) {
2989 params.models_max = value;
2990 }
2991 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
2992 add_opt(common_arg(
2993 {"--models-autoload"},
2994 {"--no-models-autoload"},
2995 string_format("for router server, whether to automatically load models (default: %s)", params.models_autoload ? "enabled" : "disabled"),
2996 [](common_params & params, bool value) {
2997 params.models_autoload = value;
2998 }
2999 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_AUTOLOAD"));
3000 add_opt(common_arg(
3001 {"--jinja"},
3002 {"--no-jinja"},
3003 string_format("whether to use jinja template engine for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
3004 [](common_params & params, bool value) {
3005 params.use_jinja = value;
3006 }
3007 ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
3008 add_opt(common_arg(
3009 {"--reasoning-format"}, "FORMAT",
3010 "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
3011 "- none: leaves thoughts unparsed in `message.content`\n"
3012 "- deepseek: puts thoughts in `message.reasoning_content`\n"
3013 "- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`\n"
3014 "(default: auto)",
3015 [](common_params & params, const std::string & value) {
3016 params.reasoning_format = common_reasoning_format_from_name(value);
3017 }
3018 ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK"));
3019 add_opt(common_arg(
3020 {"--reasoning-budget"}, "N",
3021 "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
3022 [](common_params & params, int value) {
3023 if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
3024 params.reasoning_budget = value;
3025 }
3026 ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
3027 add_opt(common_arg(
3028 {"--chat-template"}, "JINJA_TEMPLATE",
3029 string_format(
3030 "set custom jinja chat template (default: template taken from model's metadata)\n"
3031 "if suffix/prefix are specified, template will be disabled\n"
3032 "only commonly used templates are accepted (unless --jinja is set before this flag):\n"
3033 "list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
3034 ),
3035 [](common_params & params, const std::string & value) {
3036 params.chat_template = value;
3037 }
3038 ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
3039 add_opt(common_arg(
3040 {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
3041 string_format(
3042 "set custom jinja chat template file (default: template taken from model's metadata)\n"
3043 "if suffix/prefix are specified, template will be disabled\n"
3044 "only commonly used templates are accepted (unless --jinja is set before this flag):\n"
3045 "list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
3046 ),
3047 [](common_params & params, const std::string & value) {
3048 params.chat_template = read_file(value);
3049 }
3050 ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
3051 add_opt(common_arg(
3052 {"--prefill-assistant"},
3053 {"--no-prefill-assistant"},
3054 string_format(
3055 "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
3056 "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
3057 ),
3058 [](common_params & params, bool value) {
3059 params.prefill_assistant = value;
3060 }
3061 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PREFILL_ASSISTANT"));
3062 add_opt(common_arg(
3063 {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
3064 string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
3065 [](common_params & params, const std::string & value) {
3066 params.slot_prompt_similarity = std::stof(value);
3067 }
3068 ).set_examples({LLAMA_EXAMPLE_SERVER}));
3069 add_opt(common_arg(
3070 {"--lora-init-without-apply"},
3071 string_format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"),
3072 [](common_params & params) {
3073 params.lora_init_without_apply = true;
3074 }
3075 ).set_examples({LLAMA_EXAMPLE_SERVER}));
3076 add_opt(common_arg(
3077 {"--sleep-idle-seconds"}, "SECONDS",
3078 string_format("number of seconds of idleness after which the server will sleep (default: %d; -1 = disabled)", params.sleep_idle_seconds),
3079 [](common_params & params, int value) {
3080 if (value == 0 || value < -1) {
3081 throw std::invalid_argument("invalid value: cannot be 0 or less than -1");
3082 }
3083 params.sleep_idle_seconds = value;
3084 }
3085 ).set_examples({LLAMA_EXAMPLE_SERVER}));
3086 add_opt(common_arg(
3087 {"--simple-io"},
3088 "use basic IO for better compatibility in subprocesses and limited consoles",
3089 [](common_params & params) {
3090 params.simple_io = true;
3091 }
3092 ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
3093 add_opt(common_arg(
3094 {"--positive-file"}, "FNAME",
3095 string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
3096 [](common_params & params, const std::string & value) {
3097 params.cvector_positive_file = value;
3098 }
3099 ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
3100 add_opt(common_arg(
3101 {"--negative-file"}, "FNAME",
3102 string_format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()),
3103 [](common_params & params, const std::string & value) {
3104 params.cvector_negative_file = value;
3105 }
3106 ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
3107 add_opt(common_arg(
3108 {"--pca-batch"}, "N",
3109 string_format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch),
3110 [](common_params & params, int value) {
3111 params.n_pca_batch = value;
3112 }
3113 ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
3114 add_opt(common_arg(
3115 {"--pca-iter"}, "N",
3116 string_format("number of iterations used for PCA (default: %d)", params.n_pca_iterations),
3117 [](common_params & params, int value) {
3118 params.n_pca_iterations = value;
3119 }
3120 ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
3121 add_opt(common_arg(
3122 {"--method"}, "{pca, mean}",
3123 "dimensionality reduction method to be used (default: pca)",
3124 [](common_params & params, const std::string & value) {
3125 /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; }
3126 else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; }
3127 else { throw std::invalid_argument("invalid value"); }
3128 }
3129 ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
3130 add_opt(common_arg(
3131 {"--output-format"}, "{md,jsonl}",
3132 "output format for batched-bench results (default: md)",
3133 [](common_params & params, const std::string & value) {
3134 /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
3135 else if (value == "md") { params.batched_bench_output_jsonl = false; }
3136 else { throw std::invalid_argument("invalid value"); }
3137 }
3138 ).set_examples({LLAMA_EXAMPLE_BENCH}));
3139 add_opt(common_arg(
3140 {"--log-disable"},
3141 "Log disable",
3142 [](common_params &) {
3143 common_log_pause(common_log_main());
3144 }
3145 ));
3146 add_opt(common_arg(
3147 {"--log-file"}, "FNAME",
3148 "Log to file",
3149 [](common_params &, const std::string & value) {
3150 common_log_set_file(common_log_main(), value.c_str());
3151 }
3152 ).set_env("LLAMA_LOG_FILE"));
3153 add_opt(common_arg(
3154 {"--log-colors"}, "[on|off|auto]",
3155 "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
3156 "'auto' enables colors when output is to a terminal",
3157 [](common_params &, const std::string & value) {
3158 if (is_truthy(value)) {
3159 common_log_set_colors(common_log_main(), LOG_COLORS_ENABLED);
3160 } else if (is_falsey(value)) {
3161 common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED);
3162 } else if (is_autoy(value)) {
3163 common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
3164 } else {
3165 throw std::invalid_argument(
3166 string_format("error: unknown value for --log-colors: '%s'\n", value.c_str()));
3167 }
3168 }
3169 ).set_env("LLAMA_LOG_COLORS"));
3170 add_opt(common_arg(
3171 {"-v", "--verbose", "--log-verbose"},
3172 "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
3173 [](common_params & params) {
3174 params.verbosity = INT_MAX;
3175 }
3176 ));
3177 add_opt(common_arg(
3178 {"--offline"},
3179 "Offline mode: forces use of cache, prevents network access",
3180 [](common_params & params) {
3181 params.offline = true;
3182 }
3183 ).set_env("LLAMA_OFFLINE"));
3184 add_opt(common_arg(
3185 {"-lv", "--verbosity", "--log-verbosity"}, "N",
3186 string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n"
3187 " - 0: generic output\n"
3188 " - 1: error\n"
3189 " - 2: warning\n"
3190 " - 3: info\n"
3191 " - 4: debug\n"
3192 "(default: %d)\n", params.verbosity),
3193 [](common_params & params, int value) {
3194 params.verbosity = value;
3195 }
3196 ).set_env("LLAMA_LOG_VERBOSITY"));
3197 add_opt(common_arg(
3198 {"--log-prefix"},
3199 "Enable prefix in log messages",
3200 [](common_params &) {
3201 common_log_set_prefix(common_log_main(), true);
3202 }
3203 ).set_env("LLAMA_LOG_PREFIX"));
3204 add_opt(common_arg(
3205 {"--log-timestamps"},
3206 "Enable timestamps in log messages",
3207 [](common_params &) {
3208 common_log_set_timestamps(common_log_main(), true);
3209 }
3210 ).set_env("LLAMA_LOG_TIMESTAMPS"));
3211
3212 // speculative parameters
3213 add_opt(common_arg(
3214 {"-td", "--threads-draft"}, "N",
3215 "number of threads to use during generation (default: same as --threads)",
3216 [](common_params & params, int value) {
3217 params.speculative.cpuparams.n_threads = value;
3218 if (params.speculative.cpuparams.n_threads <= 0) {
3219 params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency();
3220 }
3221 }
3222 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
3223 add_opt(common_arg(
3224 {"-tbd", "--threads-batch-draft"}, "N",
3225 "number of threads to use during batch and prompt processing (default: same as --threads-draft)",
3226 [](common_params & params, int value) {
3227 params.speculative.cpuparams_batch.n_threads = value;
3228 if (params.speculative.cpuparams_batch.n_threads <= 0) {
3229 params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
3230 }
3231 }
3232 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
3233 add_opt(common_arg(
3234 {"-Cd", "--cpu-mask-draft"}, "M",
3235 "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
3236 [](common_params & params, const std::string & mask) {
3237 params.speculative.cpuparams.mask_valid = true;
3238 if (!parse_cpu_mask(mask, params.speculative.cpuparams.cpumask)) {
3239 throw std::invalid_argument("invalid cpumask");
3240 }
3241 }
3242 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
3243 add_opt(common_arg(
3244 {"-Crd", "--cpu-range-draft"}, "lo-hi",
3245 "Ranges of CPUs for affinity. Complements --cpu-mask-draft",
3246 [](common_params & params, const std::string & range) {
3247 params.speculative.cpuparams.mask_valid = true;
3248 if (!parse_cpu_range(range, params.speculative.cpuparams.cpumask)) {
3249 throw std::invalid_argument("invalid range");
3250 }
3251 }
3252 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
3253 add_opt(common_arg(
3254 {"--cpu-strict-draft"}, "<0|1>",
3255 "Use strict CPU placement for draft model (default: same as --cpu-strict)",
3256 [](common_params & params, int value) {
3257 params.speculative.cpuparams.strict_cpu = value;
3258 }
3259 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
3260 add_opt(common_arg(
3261 {"--prio-draft"}, "N",
3262 string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams.priority),
3263 [](common_params & params, int prio) {
3264 if (prio < 0 || prio > 3) {
3265 throw std::invalid_argument("invalid value");
3266 }
3267 params.speculative.cpuparams.priority = (enum ggml_sched_priority) prio;
3268 }
3269 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
3270 add_opt(common_arg(
3271 {"--poll-draft"}, "<0|1>",
3272 "Use polling to wait for draft model work (default: same as --poll])",
3273 [](common_params & params, int value) {
3274 params.speculative.cpuparams.poll = value;
3275 }
3276 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
3277 add_opt(common_arg(
3278 {"-Cbd", "--cpu-mask-batch-draft"}, "M",
3279 "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
3280 [](common_params & params, const std::string & mask) {
3281 params.speculative.cpuparams_batch.mask_valid = true;
3282 if (!parse_cpu_mask(mask, params.speculative.cpuparams_batch.cpumask)) {
3283 throw std::invalid_argument("invalid cpumask");
3284 }
3285 }
3286 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
3287 add_opt(common_arg(
3288 {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
3289 "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)",
3290 [](common_params & params, const std::string & range) {
3291 params.speculative.cpuparams_batch.mask_valid = true;
3292 if (!parse_cpu_range(range, params.speculative.cpuparams_batch.cpumask)) {
3293 throw std::invalid_argument("invalid cpumask");
3294 }
3295 }
3296 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
3297 add_opt(common_arg(
3298 {"--cpu-strict-batch-draft"}, "<0|1>",
3299 "Use strict CPU placement for draft model (default: --cpu-strict-draft)",
3300 [](common_params & params, int value) {
3301 params.speculative.cpuparams_batch.strict_cpu = value;
3302 }
3303 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
3304 add_opt(common_arg(
3305 {"--prio-batch-draft"}, "N",
3306 string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams_batch.priority),
3307 [](common_params & params, int prio) {
3308 if (prio < 0 || prio > 3) {
3309 throw std::invalid_argument("invalid value");
3310 }
3311 params.speculative.cpuparams_batch.priority = (enum ggml_sched_priority) prio;
3312 }
3313 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
3314 add_opt(common_arg(
3315 {"--poll-batch-draft"}, "<0|1>",
3316 "Use polling to wait for draft model work (default: --poll-draft)",
3317 [](common_params & params, int value) {
3318 params.speculative.cpuparams_batch.poll = value;
3319 }
3320 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
3321 add_opt(common_arg(
3322 {"--draft", "--draft-n", "--draft-max"}, "N",
3323 string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max),
3324 [](common_params & params, int value) {
3325 params.speculative.n_max = value;
3326 }
3327 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MAX"));
3328 add_opt(common_arg(
3329 {"--draft-min", "--draft-n-min"}, "N",
3330 string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
3331 [](common_params & params, int value) {
3332 params.speculative.n_min = value;
3333 }
3334 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MIN"));
3335 add_opt(common_arg(
3336 {"--draft-p-split"}, "P",
3337 string_format("speculative decoding split probability (default: %.2f)", (double)params.speculative.p_split),
3338 [](common_params & params, const std::string & value) {
3339 params.speculative.p_split = std::stof(value);
3340 }
3341 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
3342 add_opt(common_arg(
3343 {"--draft-p-min"}, "P",
3344 string_format("minimum speculative decoding probability (greedy) (default: %.2f)", (double)params.speculative.p_min),
3345 [](common_params & params, const std::string & value) {
3346 params.speculative.p_min = std::stof(value);
3347 }
3348 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
3349 add_opt(common_arg(
3350 {"-cd", "--ctx-size-draft"}, "N",
3351 string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
3352 [](common_params & params, int value) {
3353 params.speculative.n_ctx = value;
3354 }
3355 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
3356 add_opt(common_arg(
3357 {"-devd", "--device-draft"}, "<dev1,dev2,..>",
3358 "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
3359 "use --list-devices to see a list of available devices",
3360 [](common_params & params, const std::string & value) {
3361 params.speculative.devices = parse_device_list(value);
3362 }
3363 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
3364 GGML_ASSERT(params.speculative.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0
3365 add_opt(common_arg(
3366 {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
3367 string_format("max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)",
3368 params.speculative.n_gpu_layers == -1 ? "auto" : "all"),
3369 [](common_params & params, const std::string & value) {
3370 if (value == "auto") {
3371 params.speculative.n_gpu_layers = -1;
3372 } else if (value == "all") {
3373 params.speculative.n_gpu_layers = -2;
3374 } else {
3375 params.speculative.n_gpu_layers = std::stoi(value);
3376 }
3377 if (!llama_supports_gpu_offload()) {
3378 fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
3379 fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
3380 fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
3381 }
3382 }
3383 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
3384 add_opt(common_arg(
3385 {"-md", "--model-draft"}, "FNAME",
3386 "draft model for speculative decoding (default: unused)",
3387 [](common_params & params, const std::string & value) {
3388 params.speculative.mparams_dft.path = value;
3389 }
3390 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_MODEL_DRAFT"));
3391 add_opt(common_arg(
3392 {"--spec-replace"}, "TARGET", "DRAFT",
3393 "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
3394 [](common_params & params, const std::string & tgt, const std::string & dft) {
3395 params.speculative.replacements.push_back({ tgt, dft });
3396 }
3397 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
3398 add_opt(common_arg(
3399 {"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]",
3400 string_format("type of speculative decoding to use when no draft model is provided (default: %s)\n",
3401 common_speculative_type_to_str(params.speculative.type).c_str()),
3402 [](common_params & params, const std::string & value) {
3403 if (value == "none") {
3404 params.speculative.type = COMMON_SPECULATIVE_TYPE_NONE;
3405 } else if (value == "ngram-cache") {
3406 params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_CACHE;
3407 } else if (value == "ngram-simple") {
3408 params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE;
3409 } else if (value == "ngram-map-k") {
3410 params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K;
3411 } else if (value == "ngram-map-k4v") {
3412 params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V;
3413 } else if (value == "ngram-mod") {
3414 params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
3415 } else {
3416 throw std::invalid_argument("unknown speculative decoding type without draft model");
3417 }
3418 }
3419 ).set_examples({LLAMA_EXAMPLE_SERVER}));
3420 add_opt(common_arg(
3421 {"--spec-ngram-size-n"}, "N",
3422 string_format("ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram (default: %d)", params.speculative.ngram_size_n),
3423 [](common_params & params, int value) {
3424 if (value < 1 || value > 1024) {
3425 throw std::invalid_argument("ngram size N must be between 1 and 1024 inclusive");
3426 }
3427 params.speculative.ngram_size_n = value;
3428 }
3429 ).set_examples({LLAMA_EXAMPLE_SERVER}));
3430 add_opt(common_arg(
3431 {"--spec-ngram-size-m"}, "N",
3432 string_format("ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: %d)", params.speculative.ngram_size_m),
3433 [](common_params & params, int value) {
3434 if (value < 1 || value > 1024) {
3435 throw std::invalid_argument("ngram size M must be between 1 and 1024 inclusive");
3436 }
3437 params.speculative.ngram_size_m = value;
3438 }
3439 ).set_examples({LLAMA_EXAMPLE_SERVER}));
3440 add_opt(common_arg(
3441 {"--spec-ngram-min-hits"}, "N",
3442 string_format("minimum hits for ngram-map speculative decoding (default: %d)", params.speculative.ngram_min_hits),
3443 [](common_params & params, int value) {
3444 if (value < 1) {
3445 throw std::invalid_argument("ngram min hits must be at least 1");
3446 }
3447 params.speculative.ngram_min_hits = value;
3448 }
3449 ).set_examples({LLAMA_EXAMPLE_SERVER}));
3450 add_opt(common_arg(
3451 {"-ctkd", "--cache-type-k-draft"}, "TYPE",
3452 string_format(
3453 "KV cache data type for K for the draft model\n"
3454 "allowed values: %s\n"
3455 "(default: %s)",
3456 get_all_kv_cache_types().c_str(),
3457 ggml_type_name(params.speculative.cache_type_k)
3458 ),
3459 [](common_params & params, const std::string & value) {
3460 params.speculative.cache_type_k = kv_cache_type_from_str(value);
3461 }
3462 ).set_env("LLAMA_ARG_CACHE_TYPE_K_DRAFT"));
3463 add_opt(common_arg(
3464 {"-ctvd", "--cache-type-v-draft"}, "TYPE",
3465 string_format(
3466 "KV cache data type for V for the draft model\n"
3467 "allowed values: %s\n"
3468 "(default: %s)",
3469 get_all_kv_cache_types().c_str(),
3470 ggml_type_name(params.speculative.cache_type_v)
3471 ),
3472 [](common_params & params, const std::string & value) {
3473 params.speculative.cache_type_v = kv_cache_type_from_str(value);
3474 }
3475 ).set_env("LLAMA_ARG_CACHE_TYPE_V_DRAFT"));
3476
3477 add_opt(common_arg(
3478 {"-mv", "--model-vocoder"}, "FNAME",
3479 "vocoder model for audio generation (default: unused)",
3480 [](common_params & params, const std::string & value) {
3481 params.vocoder.model.path = value;
3482 }
3483 ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
3484 add_opt(common_arg(
3485 {"--tts-use-guide-tokens"},
3486 "Use guide tokens to improve TTS word recall",
3487 [](common_params & params) {
3488 params.vocoder.use_guide_tokens = true;
3489 }
3490 ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
3491 add_opt(common_arg(
3492 {"--tts-speaker-file"}, "FNAME",
3493 "speaker file path for audio generation",
3494 [](common_params & params, const std::string & value) {
3495 params.vocoder.speaker_file = value;
3496 }
3497 ).set_examples({LLAMA_EXAMPLE_TTS}));
3498
3499 add_opt(common_arg(
3500 {"--diffusion-steps"}, "N",
3501 string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
3502 [](common_params & params, int value) { params.diffusion.steps = value; }
3503 ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3504 add_opt(common_arg(
3505 {"--diffusion-visual"},
3506 string_format("enable visual diffusion mode (show progressive generation) (default: %s)", params.diffusion.visual_mode ? "true" : "false"),
3507 [](common_params & params) { params.diffusion.visual_mode = true; }
3508 ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3509 add_opt(common_arg(
3510 {"--diffusion-eps"}, "F",
3511 string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
3512 [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
3513 ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3514 add_opt(common_arg(
3515 {"--diffusion-algorithm"}, "N",
3516 string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)", params.diffusion.algorithm),
3517 [](common_params & params, int value) { params.diffusion.algorithm = value; }
3518 ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3519 add_opt(common_arg(
3520 {"--diffusion-alg-temp"}, "F",
3521 string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
3522 [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
3523 ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3524 add_opt(common_arg(
3525 {"--diffusion-block-length"}, "N",
3526 string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
3527 [](common_params & params, int value) { params.diffusion.block_length = value; }
3528 ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3529 add_opt(common_arg(
3530 {"--diffusion-cfg-scale"}, "F",
3531 string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
3532 [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
3533 ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3534 add_opt(common_arg(
3535 {"--diffusion-add-gumbel-noise"}, "F",
3536 string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
3537 [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
3538 ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3539 add_opt(common_arg(
3540 { "-lr", "--learning-rate" }, "ALPHA",
3541 string_format("adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)", (double) params.lr.lr0),
3542 [](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); }
3543 ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
3544 add_opt(common_arg({ "-lr-min", "--learning-rate-min" }, "ALPHA",
3545 string_format("(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)",
3546 (double) params.lr.lr_min),
3547 [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); }
3548 ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
3549 add_opt(common_arg(
3550 {"-decay-epochs", "--learning-rate-decay-epochs"}, "ALPHA",
3551 string_format("(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)", (double) params.lr.decay_epochs),
3552 [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); }
3553 ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
3554 add_opt(common_arg(
3555 {"-wd", "--weight-decay"}, "WD",
3556 string_format("adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).", (double) params.lr.wd),
3557 [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); }
3558 ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
3559 add_opt(common_arg(
3560 {"-val-split", "--val-split"}, "FRACTION",
3561 string_format("fraction of data to use as validation set for training (default: %.2g).", (double) params.val_split),
3562 [](common_params & params, const std::string & value) { params.val_split = std::stof(value); }
3563 ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
3564 add_opt(common_arg(
3565 {"-epochs", "--epochs"}, "N",
3566 string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
3567 [](common_params & params, int epochs) { params.lr.epochs = epochs; }
3568 ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
3569 add_opt(common_arg(
3570 {"-opt", "--optimizer"}, "sgd|adamw", "adamw or sgd",
3571 [](common_params & params, const std::string & name) {
3572 params.optimizer = common_opt_get_optimizer(name.c_str());
3573 if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
3574 throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
3575 }
3576 }
3577 ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
3578 add_opt(common_arg(
3579 {"--save-logits"},
3580 string_format("save final logits to files for verification (default: %s)", params.save_logits ? "true" : "false"),
3581 [](common_params & params) {
3582 params.save_logits = true;
3583 }
3584 ).set_examples({LLAMA_EXAMPLE_DEBUG}));
3585 add_opt(common_arg(
3586 {"--logits-output-dir"}, "PATH",
3587 string_format("directory for saving logits output files (default: %s)", params.logits_output_dir.c_str()),
3588 [](common_params & params, const std::string & value) {
3589 params.logits_output_dir = value;
3590 }
3591 ).set_examples({LLAMA_EXAMPLE_DEBUG}));
3592 add_opt(common_arg(
3593 {"--tensor-filter"}, "REGEX",
3594 "filter tensor names for debug output (regex pattern, can be specified multiple times)",
3595 [](common_params & params, const std::string & value) {
3596 params.tensor_filter.push_back(value);
3597 }
3598 ).set_examples({LLAMA_EXAMPLE_DEBUG}));
3599
3600 // presets
3601 add_opt(common_arg(
3602 {"--tts-oute-default"},
3603 string_format("use default OuteTTS models (note: can download weights from the internet)"),
3604 [](common_params & params) {
3605 params.model.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
3606 params.model.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
3607 params.vocoder.model.hf_repo = "ggml-org/WavTokenizer";
3608 params.vocoder.model.hf_file = "WavTokenizer-Large-75-F16.gguf";
3609 }
3610 ).set_examples({LLAMA_EXAMPLE_TTS}));
3611
3612 add_opt(common_arg(
3613 {"--embd-gemma-default"},
3614 string_format("use default EmbeddingGemma model (note: can download weights from the internet)"),
3615 [](common_params & params) {
3616 params.model.hf_repo = "ggml-org/embeddinggemma-300M-qat-q4_0-GGUF";
3617 params.model.hf_file = "embeddinggemma-300M-qat-Q4_0.gguf";
3618 params.port = 8011;
3619 params.n_ubatch = 2048;
3620 params.n_batch = 2048;
3621 params.n_parallel = 32;
3622 params.n_ctx = 2048*params.n_parallel;
3623 params.verbose_prompt = true;
3624 params.embedding = true;
3625 }
3626 ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
3627
3628 add_opt(common_arg(
3629 {"--fim-qwen-1.5b-default"},
3630 string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"),
3631 [](common_params & params) {
3632 params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
3633 params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
3634 params.port = 8012;
3635 params.n_ubatch = 1024;
3636 params.n_batch = 1024;
3637 params.n_ctx = 0;
3638 params.n_cache_reuse = 256;
3639 }
3640 ).set_examples({LLAMA_EXAMPLE_SERVER}));
3641
3642 add_opt(common_arg(
3643 {"--fim-qwen-3b-default"},
3644 string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"),
3645 [](common_params & params) {
3646 params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
3647 params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
3648 params.port = 8012;
3649 params.n_ubatch = 1024;
3650 params.n_batch = 1024;
3651 params.n_ctx = 0;
3652 params.n_cache_reuse = 256;
3653 }
3654 ).set_examples({LLAMA_EXAMPLE_SERVER}));
3655
3656 add_opt(common_arg(
3657 {"--fim-qwen-7b-default"},
3658 string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"),
3659 [](common_params & params) {
3660 params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
3661 params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
3662 params.port = 8012;
3663 params.n_ubatch = 1024;
3664 params.n_batch = 1024;
3665 params.n_ctx = 0;
3666 params.n_cache_reuse = 256;
3667 }
3668 ).set_examples({LLAMA_EXAMPLE_SERVER}));
3669
3670 add_opt(common_arg(
3671 {"--fim-qwen-7b-spec"},
3672 string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
3673 [](common_params & params) {
3674 params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
3675 params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
3676 params.speculative.mparams_dft.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
3677 params.speculative.mparams_dft.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
3678 params.port = 8012;
3679 params.n_ubatch = 1024;
3680 params.n_batch = 1024;
3681 params.n_ctx = 0;
3682 params.n_cache_reuse = 256;
3683 }
3684 ).set_examples({LLAMA_EXAMPLE_SERVER}));
3685
3686 add_opt(common_arg(
3687 {"--fim-qwen-14b-spec"},
3688 string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
3689 [](common_params & params) {
3690 params.model.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
3691 params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
3692 params.speculative.mparams_dft.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
3693 params.speculative.mparams_dft.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
3694 params.port = 8012;
3695 params.n_ubatch = 1024;
3696 params.n_batch = 1024;
3697 params.n_ctx = 0;
3698 params.n_cache_reuse = 256;
3699 }
3700 ).set_examples({LLAMA_EXAMPLE_SERVER}));
3701
3702 add_opt(common_arg(
3703 {"--fim-qwen-30b-default"},
3704 string_format("use default Qwen 3 Coder 30B A3B Instruct (note: can download weights from the internet)"),
3705 [](common_params & params) {
3706 params.model.hf_repo = "ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF";
3707 params.model.hf_file = "qwen3-coder-30b-a3b-instruct-q8_0.gguf";
3708 params.port = 8012;
3709 params.n_ubatch = 1024;
3710 params.n_batch = 1024;
3711 params.n_ctx = 0;
3712 params.n_cache_reuse = 256;
3713 }
3714 ).set_examples({LLAMA_EXAMPLE_SERVER}));
3715
3716 add_opt(common_arg(
3717 {"--gpt-oss-20b-default"},
3718 string_format("use gpt-oss-20b (note: can download weights from the internet)"),
3719 [](common_params & params) {
3720 params.model.hf_repo = "ggml-org/gpt-oss-20b-GGUF";
3721 params.model.hf_file = "gpt-oss-20b-mxfp4.gguf";
3722 params.port = 8013;
3723 params.n_ubatch = 2048;
3724 params.n_batch = 32768;
3725 params.n_parallel = 2;
3726 params.n_ctx = 131072*params.n_parallel;
3727 params.sampling.temp = 1.0f;
3728 params.sampling.top_p = 1.0f;
3729 params.sampling.top_k = 0;
3730 params.sampling.min_p = 0.01f;
3731 params.use_jinja = true;
3732 //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
3733 }
3734 ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
3735
3736 add_opt(common_arg(
3737 {"--gpt-oss-120b-default"},
3738 string_format("use gpt-oss-120b (note: can download weights from the internet)"),
3739 [](common_params & params) {
3740 params.model.hf_repo = "ggml-org/gpt-oss-120b-GGUF";
3741 params.port = 8013;
3742 params.n_ubatch = 2048;
3743 params.n_batch = 32768;
3744 params.n_parallel = 2;
3745 params.n_ctx = 131072*params.n_parallel;
3746 params.sampling.temp = 1.0f;
3747 params.sampling.top_p = 1.0f;
3748 params.sampling.top_k = 0;
3749 params.sampling.min_p = 0.01f;
3750 params.use_jinja = true;
3751 //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
3752 }
3753 ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
3754
3755 add_opt(common_arg(
3756 {"--vision-gemma-4b-default"},
3757 string_format("use Gemma 3 4B QAT (note: can download weights from the internet)"),
3758 [](common_params & params) {
3759 params.model.hf_repo = "ggml-org/gemma-3-4b-it-qat-GGUF";
3760 params.port = 8014;
3761 params.n_ctx = 0;
3762 params.use_jinja = true;
3763 }
3764 ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
3765
3766 add_opt(common_arg(
3767 {"--vision-gemma-12b-default"},
3768 string_format("use Gemma 3 12B QAT (note: can download weights from the internet)"),
3769 [](common_params & params) {
3770 params.model.hf_repo = "ggml-org/gemma-3-12b-it-qat-GGUF";
3771 params.port = 8014;
3772 params.n_ctx = 0;
3773 params.use_jinja = true;
3774 }
3775 ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
3776
3777 return ctx_arg;
3778}
3779
3780void common_params_add_preset_options(std::vector<common_arg> & args) {
3781 // arguments below won't be treated as CLI args, only preset options
3782 args.push_back(common_arg(
3783 {"load-on-startup"}, "NAME",
3784 "in server router mode, autoload this model on startup",
3785 [](common_params &, const std::string &) { /* unused */ }
3786 ).set_env(COMMON_ARG_PRESET_LOAD_ON_STARTUP).set_preset_only());
3787
3788 args.push_back(common_arg(
3789 {"stop-timeout"}, "SECONDS",
3790 "in server router mode, force-kill model instance after this many seconds of graceful shutdown",
3791 [](common_params &, int) { /* unused */ }
3792 ).set_env(COMMON_ARG_PRESET_STOP_TIMEOUT).set_preset_only());
3793
3794 // args.push_back(common_arg(
3795 // {"pin"},
3796 // "in server router mode, do not unload this model if models_max is exceeded",
3797 // [](common_params &) { /* unused */ }
3798 // ).set_preset_only());
3799}