#include "arg.h"

#include "chat.h"
#include "common.h"
#include "download.h"
#include "json-schema-to-grammar.h"
#include "log.h"
#include "sampling.h"
#include "speculative.h"
#include "preset.h"

// fix problem with std::min and std::max
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#   define NOMINMAX
#endif
#include <windows.h>
#endif

#define JSON_ASSERT GGML_ASSERT
#include <nlohmann/json.hpp>

#include <algorithm>
#include <cinttypes>
#include <climits>
#include <cstdarg>
#include <fstream>
#include <list>
#include <regex>
#include <set>
#include <string>
#include <thread> // for hardware_concurrency
#include <vector>

#ifndef __EMSCRIPTEN__
#ifdef __linux__
#include <linux/limits.h>
#elif defined(_WIN32)
#   if !defined(PATH_MAX)
#       define PATH_MAX MAX_PATH
#   endif
#elif defined(_AIX)
#include <sys/limits.h>
#else
#include <sys/syslimits.h>
#endif
#endif

#define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083

extern const char * LICENSES[];

using json = nlohmann::ordered_json;
using namespace common_arg_utils;

static std::initializer_list<enum llama_example> mmproj_examples = {
    LLAMA_EXAMPLE_MTMD,
    LLAMA_EXAMPLE_SERVER,
    LLAMA_EXAMPLE_CLI,
};

static std::string read_file(const std::string & fname) {
    std::ifstream file(fname);
    if (!file) {
        throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
    }
    std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
    file.close();
    return content;
}

static const std::vector<common_arg> & get_common_arg_defs() {
    static const std::vector<common_arg> options = [] {
        common_params params;
        auto ctx = common_params_parser_init(params, LLAMA_EXAMPLE_SERVER, nullptr);
        return ctx.options;
    }();
    return options;
}

common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
    this->examples = examples;
    return *this;
}

common_arg & common_arg::set_excludes(std::initializer_list<enum llama_example> excludes) {
    this->excludes = excludes;
    return *this;
}

common_arg & common_arg::set_env(const char * env) {
    help = help + "\n(env: " + env + ")";
    this->env = env;
    return *this;
}

common_arg & common_arg::set_sparam() {
    is_sparam = true;
    return *this;
}

common_arg & common_arg::set_preset_only() {
    is_preset_only = true;
    return *this;
}

bool common_arg::in_example(enum llama_example ex) {
    return examples.find(ex) != examples.end();
}

bool common_arg::is_exclude(enum llama_example ex) {
    return excludes.find(ex) != excludes.end();
}

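// note: for options that also have negative args, a "LLAMA_ARG_NO_*" variant of the
// env var is honored as well; e.g. (hypothetical name) an option registered with
// .set_env("LLAMA_ARG_FOO") is also controlled by LLAMA_ARG_NO_FOO, which forces "0"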
bool common_arg::get_value_from_env(std::string & output) const {
    if (env == nullptr) return false;
    if (!args_neg.empty()) {
        // for compatibility, we need to check LLAMA_ARG_NO_ env as well
        std::string neg_env = env;
        string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
        char * neg_value = std::getenv(neg_env.c_str());
        if (neg_value) {
            output = "0"; // falsey
            return true;
        }
    }
    char * value = std::getenv(env);
    if (value) {
        output = value;
        return true;
    }
    return false;
}

bool common_arg::has_value_from_env() const {
    if (env != nullptr && !args_neg.empty()) {
        // for compatibility, we need to check LLAMA_ARG_NO_ env as well
        std::string neg_env = env;
        string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
        if (std::getenv(neg_env.c_str())) {
            return true;
        }
    }
    return env != nullptr && std::getenv(env);
}

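// word-wrap helper used for help text;
// e.g. break_str_into_lines("lorem ipsum dolor", 10) returns {"lorem", "ipsum", "dolor"}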
static std::vector<std::string> break_str_into_lines(std::string input, size_t max_char_per_line) {
    std::vector<std::string> result;
    std::istringstream iss(input);
    std::string line;
    auto add_line = [&](const std::string & l) {
        if (l.length() <= max_char_per_line) {
            result.push_back(l);
        } else {
            std::istringstream line_stream(l);
            std::string word, current_line;
            while (line_stream >> word) {
                // the `!current_line.empty()` term accounts for the joining space
                if (current_line.length() + !current_line.empty() + word.length() > max_char_per_line) {
                    if (!current_line.empty()) result.push_back(current_line);
                    current_line = word;
                } else {
                    current_line += (!current_line.empty() ? " " : "") + word;
                }
            }
            if (!current_line.empty()) result.push_back(current_line);
        }
    };
    while (std::getline(iss, line)) {
        add_line(line);
    }
    return result;
}

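// render a single option for the help listing; e.g. the -h option prints (approximately):
// "-h,    --help, --usage                  print usage and exit"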
std::string common_arg::to_string() const {
    // params for printing to console
    const static int n_leading_spaces = 40;
    const static int n_char_per_line_help = 70; // TODO: detect this based on current console
    std::string leading_spaces(n_leading_spaces, ' ');

    std::ostringstream ss;
    auto all_args = get_args(); // also contains args_neg
    for (const auto & arg : all_args) {
        if (arg == all_args.front()) {
            if (all_args.size() == 1) {
                ss << arg;
            } else {
                // first arg is usually abbreviation, we need padding to make it more beautiful
                auto tmp = std::string(arg) + ", ";
                auto spaces = std::string(std::max(0, 7 - (int)tmp.size()), ' ');
                ss << tmp << spaces;
            }
        } else {
            ss << arg << (arg != all_args.back() ? ", " : "");
        }
    }
    if (value_hint) ss << " " << value_hint;
    if (value_hint_2) ss << " " << value_hint_2;
    if (ss.tellp() > n_leading_spaces - 3) {
        // current line is too long, add new line
        ss << "\n" << leading_spaces;
    } else {
        // padding between arg and help, same line
        ss << std::string(leading_spaces.size() - ss.tellp(), ' ');
    }
    const auto help_lines = break_str_into_lines(help, n_char_per_line_help);
    for (const auto & line : help_lines) {
        ss << (&line == &help_lines.front() ? "" : leading_spaces) << line << "\n";
    }
    return ss.str();
}

std::vector<std::string> common_arg::get_args() const {
    std::vector<std::string> result;
    for (const auto & arg : args) {
        result.push_back(std::string(arg));
    }
    for (const auto & arg : args_neg) {
        result.push_back(std::string(arg));
    }
    return result;
}

std::vector<std::string> common_arg::get_env() const {
    std::vector<std::string> result;
    if (env) {
        result.push_back(std::string(env));
    }
    if (!args_neg.empty() && env) {
        // for compatibility, we need to add LLAMA_ARG_NO_ variant
        std::string neg_env = env;
        string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
        result.push_back(neg_env);
    }
    return result;
}

//
// utils
//

// Helper function to parse tensor buffer override strings
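// expected input is a comma-separated list of "<tensor name>=<buffer type>" entries,
// e.g. (illustrative names only): "blk\.0\.ffn_up=TYPE_A,blk\.1\.ffn_up=TYPE_B"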
static void parse_tensor_buffer_overrides(const std::string & value, std::vector<llama_model_tensor_buft_override> & overrides) {
    std::map<std::string, ggml_backend_buffer_type_t> buft_list;
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        auto * dev = ggml_backend_dev_get(i);
        auto * buft = ggml_backend_dev_buffer_type(dev);
        if (buft) {
            buft_list[ggml_backend_buft_name(buft)] = buft;
        }
    }

    for (const auto & override : string_split<std::string>(value, ',')) {
        std::string::size_type pos = override.find('=');
        if (pos == std::string::npos) {
            throw std::invalid_argument("invalid value");
        }
        std::string tensor_name = override.substr(0, pos);
        std::string buffer_type = override.substr(pos + 1);

        if (buft_list.find(buffer_type) == buft_list.end()) {
            printf("Available buffer types:\n");
            for (const auto & it : buft_list) {
                printf("  %s\n", ggml_backend_buft_name(it.second));
            }
            throw std::invalid_argument("unknown buffer type");
        }
        // keep strings alive and avoid leaking memory by storing them in a static list
        static std::list<std::string> buft_overrides;
        buft_overrides.push_back(tensor_name);
        overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
    }
}

static std::string clean_file_name(const std::string & fname) {
    std::string clean_fname = fname;
    string_replace_all(clean_fname, "\\", "_");
    string_replace_all(clean_fname, "/", "_");
    return clean_fname;
}

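// the remote preset is an optional preset.ini at the root of the HF repo; a hypothetical
// example of its contents (section names correspond to repo tags, keys to CLI options):
//   [default]
//   ctx-size = 8192
//   [my-tag]
//   ctx-size = 4096
// "-hf user/repo:my-tag" would apply the [my-tag] section, cascaded with the global keys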
static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
    GGML_ASSERT(!params.model.hf_repo.empty());

    // the returned hf_repo is without tag
    auto [hf_repo, hf_tag] = common_download_split_repo_tag(params.model.hf_repo);

    // "latest" tag (default if not specified) is translated to "default" preset
    if (hf_tag == "latest") {
        hf_tag = "default";
    }

    const bool offline = params.offline;
    std::string model_endpoint = get_model_endpoint();
    auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";

    // prepare local path for caching
    auto preset_fname = clean_file_name(hf_repo + "_preset.ini");
    auto preset_path = fs_get_cache_file(preset_fname);
    const int status = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
    const bool has_preset = status >= 200 && status < 400;

    // remote preset is optional, so we don't error out if not found
    if (has_preset) {
        LOG_INF("applying remote preset from %s\n", preset_url.c_str());
        common_preset_context ctx(ex, /* only_remote_allowed */ true);
        common_preset global;
        auto remote_presets = ctx.load_from_ini(preset_path, global);
        remote_presets = ctx.cascade(global, remote_presets);
        if (remote_presets.find(hf_tag) != remote_presets.end()) {
            common_preset preset = remote_presets.at(hf_tag);
            LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline
            preset.apply_to_params(params);
        } else {
            throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section");
        }
    } else {
        LOG_INF("%s", "no remote preset found, skipping\n");
    }

    return has_preset;
}

struct handle_model_result {
    bool found_mmproj = false;
    common_params_model mmproj;
};

static handle_model_result common_params_handle_model(
        struct common_params_model & model,
        const std::string & bearer_token,
        bool offline) {
    handle_model_result result;
    // handle pre-fill default model path and url based on hf_repo and hf_file
    {
        if (!model.docker_repo.empty()) { // Handle Docker URLs by resolving them to local paths
            model.path = common_docker_resolve_model(model.docker_repo);
            model.name = model.docker_repo; // set name for consistency
        } else if (!model.hf_repo.empty()) {
            // short-hand to avoid specifying --hf-file -> default it to --model
            if (model.hf_file.empty()) {
                if (model.path.empty()) {
                    auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
                    if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
                        exit(1); // error message already printed
                    }
                    model.name    = model.hf_repo;      // repo name with tag
                    model.hf_repo = auto_detected.repo; // repo name without tag
                    model.hf_file = auto_detected.ggufFile;
                    if (!auto_detected.mmprojFile.empty()) {
                        result.found_mmproj   = true;
                        result.mmproj.hf_repo = model.hf_repo;
                        result.mmproj.hf_file = auto_detected.mmprojFile;
                    }
                } else {
                    model.hf_file = model.path;
                }
            }

            std::string model_endpoint = get_model_endpoint();
            model.url = model_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file;
            // make sure model path is present (for caching purposes)
            if (model.path.empty()) {
                // this is to avoid different repo having same file name, or same file name in different subdirs
                std::string filename = clean_file_name(model.hf_repo + "_" + model.hf_file);
                model.path = fs_get_cache_file(filename);
            }

        } else if (!model.url.empty()) {
            if (model.path.empty()) {
                auto f = string_split<std::string>(model.url, '#').front();
                f = string_split<std::string>(f, '?').front();
                model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
            }

        }
    }

    // then, download it if needed
    if (!model.url.empty()) {
        bool ok = common_download_model(model, bearer_token, offline);
        if (!ok) {
            LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
            exit(1);
        }
    }

    return result;
}

const std::vector<ggml_type> kv_cache_types = {
    GGML_TYPE_F32,
    GGML_TYPE_F16,
    GGML_TYPE_BF16,
    GGML_TYPE_Q8_0,
    GGML_TYPE_Q4_0,
    GGML_TYPE_Q4_1,
    GGML_TYPE_IQ4_NL,
    GGML_TYPE_Q5_0,
    GGML_TYPE_Q5_1,
};

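// map a type name back to its enum value, e.g. "q8_0" -> GGML_TYPE_Q8_0
// (valid names are whatever ggml_type_name() returns for the types listed above)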
static ggml_type kv_cache_type_from_str(const std::string & s) {
    for (const auto & type : kv_cache_types) {
        if (ggml_type_name(type) == s) {
            return type;
        }
    }
    throw std::runtime_error("Unsupported cache type: " + s);
}

static std::string get_all_kv_cache_types() {
    std::ostringstream msg;
    for (const auto & type : kv_cache_types) {
        msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
    }
    return msg.str();
}

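// e.g. parse_bool_value("on") == true, parse_bool_value("0") == false;
// values that are neither truthy nor falsey (see is_truthy/is_falsey below) throw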
static bool parse_bool_value(const std::string & value) {
    if (is_truthy(value)) {
        return true;
    } else if (is_falsey(value)) {
        return false;
    } else {
        throw std::invalid_argument("invalid boolean value");
    }
}

//
// CLI argument parsing functions
//

static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
    common_params & params = ctx_arg.params;

    std::unordered_map<std::string, std::pair<common_arg *, bool>> arg_to_options;
    for (auto & opt : ctx_arg.options) {
        for (const auto & arg : opt.args) {
            arg_to_options[arg] = {&opt, /* is_positive */ true};
        }
        for (const auto & arg : opt.args_neg) {
            arg_to_options[arg] = {&opt, /* is_positive */ false};
        }
    }

    // handle environment variables
    for (auto & opt : ctx_arg.options) {
        std::string value;
        if (opt.get_value_from_env(value)) {
            try {
                if (opt.handler_void && is_truthy(value)) {
                    opt.handler_void(params);
                }
                if (opt.handler_int) {
                    opt.handler_int(params, std::stoi(value));
                }
                if (opt.handler_bool) {
                    opt.handler_bool(params, parse_bool_value(value));
                }
                if (opt.handler_string) {
                    opt.handler_string(params, value);
                    continue;
                }
            } catch (std::exception & e) {
                throw std::invalid_argument(string_format(
                    "error while handling environment variable \"%s\": %s\n\n", opt.env, e.what()));
            }
        }
    }

    // handle command line arguments
    auto check_arg = [&](int i) {
        if (i+1 >= argc) {
            throw std::invalid_argument("expected value for argument");
        }
    };

    auto parse_cli_args = [&]() {
        std::set<std::string> seen_args;

        for (int i = 1; i < argc; i++) {
            const std::string arg_prefix = "--";

            std::string arg = argv[i];
            if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
                std::replace(arg.begin(), arg.end(), '_', '-');
            }
            if (arg_to_options.find(arg) == arg_to_options.end()) {
                throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
            }
            if (!seen_args.insert(arg).second) {
                LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
            }
            auto & tmp = arg_to_options[arg];
            auto opt = *tmp.first;
            bool is_positive = tmp.second;
            if (opt.has_value_from_env()) {
                fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
            }
            try {
                if (opt.handler_void) {
                    opt.handler_void(params);
                    continue;
                }
                if (opt.handler_bool) {
                    opt.handler_bool(params, is_positive);
                    continue;
                }

                // arg with single value
                check_arg(i);
                std::string val = argv[++i];
                if (opt.handler_int) {
                    opt.handler_int(params, std::stoi(val));
                    continue;
                }
                if (opt.handler_string) {
                    opt.handler_string(params, val);
                    continue;
                }

                // arg with 2 values
                check_arg(i);
                std::string val2 = argv[++i];
                if (opt.handler_str_str) {
                    opt.handler_str_str(params, val, val2);
                    continue;
                }
            } catch (std::exception & e) {
                throw std::invalid_argument(string_format(
                    "error while handling argument \"%s\": %s\n\n"
                    "usage:\n%s\n\nto show complete usage, run with -h",
                    arg.c_str(), e.what(), opt.to_string().c_str()));
            }
        }
    };

    // parse the first time to get -hf option (used for remote preset)
    parse_cli_args();

    // maybe handle remote preset
    if (!params.model.hf_repo.empty()) {
        std::string cli_hf_repo = params.model.hf_repo;
        bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex);

        // special case: if hf_repo explicitly set by preset, we need to preserve it (ignore CLI value)
        // this is useful when we have one HF repo pointing to other HF repos (one model - multiple GGUFs)
        std::string preset_hf_repo = params.model.hf_repo;
        bool preset_has_hf_repo = preset_hf_repo != cli_hf_repo;

        if (has_preset) {
            // re-parse CLI args to override preset values
            parse_cli_args();
        }

        // preserve hf_repo from preset if needed
        if (preset_has_hf_repo) {
            params.model.hf_repo = preset_hf_repo;
        }
    }

    postprocess_cpu_params(params.cpuparams, nullptr);
    postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);

    postprocess_cpu_params(params.speculative.cpuparams, &params.cpuparams);
    postprocess_cpu_params(params.speculative.cpuparams_batch, &params.cpuparams_batch);

    if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
        throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
    }

    // handle model and download
    {
        auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
        if (params.no_mmproj) {
            params.mmproj = {};
        } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
            // optionally, handle mmproj model when -hf is specified
            params.mmproj = res.mmproj;
        }
        // only download mmproj if the current example is using it
        for (const auto & ex : mmproj_examples) {
            if (ctx_arg.ex == ex) {
                common_params_handle_model(params.mmproj, params.hf_token, params.offline);
                break;
            }
        }
        common_params_handle_model(params.speculative.mparams_dft, params.hf_token, params.offline);
        common_params_handle_model(params.vocoder.model, params.hf_token, params.offline);
    }

    // model is required (except for server)
    // TODO @ngxson : maybe show a list of available models in CLI in this case
    if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage && !params.completion) {
        throw std::invalid_argument("error: --model is required\n");
    }

    if (params.escape) {
        string_process_escapes(params.prompt);
        string_process_escapes(params.input_prefix);
        string_process_escapes(params.input_suffix);
        for (auto & antiprompt : params.antiprompt) {
            string_process_escapes(antiprompt);
        }
        for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
            string_process_escapes(seq_breaker);
        }
        for (auto & pair : params.speculative.replacements) {
            string_process_escapes(pair.first);
            string_process_escapes(pair.second);
        }
    }

    if (!params.kv_overrides.empty()) {
        params.kv_overrides.emplace_back();
        params.kv_overrides.back().key[0] = 0;
    }

    // pad tensor_buft_overrides for llama_params_fit:
    const size_t ntbo = llama_max_tensor_buft_overrides();
    while (params.tensor_buft_overrides.size() < ntbo) {
        params.tensor_buft_overrides.push_back({nullptr, nullptr});
    }

    if (!params.speculative.tensor_buft_overrides.empty()) {
        params.speculative.tensor_buft_overrides.push_back({nullptr, nullptr});
    }

    if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
        throw std::runtime_error(string_format(
            "error: the supplied chat template is not supported: %s%s\n",
            params.chat_template.c_str(),
            params.use_jinja ? "" : "\nnote: llama.cpp was started without --jinja, we only support commonly used templates"
        ));
    }

    common_log_set_verbosity_thold(params.verbosity);

    return true;
}

static void common_params_print_usage(common_params_context & ctx_arg) {
    auto print_options = [](std::vector<common_arg *> & options) {
        for (common_arg * opt : options) {
            printf("%s", opt->to_string().c_str());
        }
    };

    std::vector<common_arg *> common_options;
    std::vector<common_arg *> sparam_options;
    std::vector<common_arg *> specific_options;
    for (auto & opt : ctx_arg.options) {
        // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
        if (opt.is_sparam) {
            sparam_options.push_back(&opt);
        } else if (opt.in_example(ctx_arg.ex)) {
            specific_options.push_back(&opt);
        } else {
            common_options.push_back(&opt);
        }
    }
    printf("----- common params -----\n\n");
    print_options(common_options);
    printf("\n\n----- sampling params -----\n\n");
    print_options(sparam_options);
    // TODO: maybe convert enum llama_example to string
    printf("\n\n----- example-specific params -----\n\n");
    print_options(specific_options);
}

static void common_params_print_completion(common_params_context & ctx_arg) {
    std::vector<common_arg *> common_options;
    std::vector<common_arg *> sparam_options;
    std::vector<common_arg *> specific_options;

    for (auto & opt : ctx_arg.options) {
        if (opt.is_sparam) {
            sparam_options.push_back(&opt);
        } else if (opt.in_example(ctx_arg.ex)) {
            specific_options.push_back(&opt);
        } else {
            common_options.push_back(&opt);
        }
    }

    printf("_llama_completions() {\n");
    printf("    local cur prev opts\n");
    printf("    COMPREPLY=()\n");
    printf("    cur=\"${COMP_WORDS[COMP_CWORD]}\"\n");
    printf("    prev=\"${COMP_WORDS[COMP_CWORD-1]}\"\n\n");

    printf("    opts=\"");
    auto print_options = [](const std::vector<common_arg *> & options) {
        for (const common_arg * opt : options) {
            for (const char * arg : opt->args) {
                printf("%s ", arg);
            }
        }
    };

    print_options(common_options);
    print_options(sparam_options);
    print_options(specific_options);
    printf("\"\n\n");

    printf("    case \"$prev\" in\n");
    printf("        --model|-m)\n");
    printf("            COMPREPLY=( $(compgen -f -X '!*.gguf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
    printf("            return 0\n");
    printf("            ;;\n");
    printf("        --grammar-file)\n");
    printf("            COMPREPLY=( $(compgen -f -X '!*.gbnf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
    printf("            return 0\n");
    printf("            ;;\n");
    printf("        --chat-template-file)\n");
    printf("            COMPREPLY=( $(compgen -f -X '!*.jinja' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
    printf("            return 0\n");
    printf("            ;;\n");
    printf("        *)\n");
    printf("            COMPREPLY=( $(compgen -W \"${opts}\" -- \"$cur\") )\n");
    printf("            return 0\n");
    printf("            ;;\n");
    printf("    esac\n");
    printf("}\n\n");

    std::set<std::string> executables = {
        "llama-batched",
        "llama-batched-bench",
        "llama-bench",
        "llama-cli",
        "llama-completion",
        "llama-convert-llama2c-to-ggml",
        "llama-cvector-generator",
        "llama-embedding",
        "llama-eval-callback",
        "llama-export-lora",
        "llama-gen-docs",
        "llama-gguf",
        "llama-gguf-hash",
        "llama-gguf-split",
        "llama-gritlm",
        "llama-imatrix",
        "llama-infill",
        "llama-mtmd-cli",
        "llama-llava-clip-quantize-cli",
        "llama-lookahead",
        "llama-lookup",
        "llama-lookup-create",
        "llama-lookup-merge",
        "llama-lookup-stats",
        "llama-parallel",
        "llama-passkey",
        "llama-perplexity",
        "llama-q8dot",
        "llama-quantize",
        "llama-qwen2vl-cli",
        "llama-retrieval",
        "llama-save-load-state",
        "llama-server",
        "llama-simple",
        "llama-simple-chat",
        "llama-speculative",
        "llama-speculative-simple",
        "llama-tokenize",
        "llama-tts",
        "llama-vdot"
    };

    for (const auto & exe : executables) {
        printf("complete -F _llama_completions %s\n", exe.c_str());
    }
}

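// parse a comma-separated device list, e.g. "CUDA0,CUDA1" (device names are
// backend-dependent; these are illustrative), or "none" to select no devices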
static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & value) {
    std::vector<ggml_backend_dev_t> devices;
    auto dev_names = string_split<std::string>(value, ',');
    if (dev_names.empty()) {
        throw std::invalid_argument("no devices specified");
    }
    if (dev_names.size() == 1 && dev_names[0] == "none") {
        devices.push_back(nullptr);
    } else {
        for (const auto & device : dev_names) {
            auto * dev = ggml_backend_dev_by_name(device.c_str());
            if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
                throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
            }
            devices.push_back(dev);
        }
        devices.push_back(nullptr);
    }
    return devices;
}

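// register RPC backend devices from a comma-separated endpoint list,
// e.g. add_rpc_devices("192.168.0.2:50052,192.168.0.3:50052")
// (the host:port endpoint format and addresses here are illustrative)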
static void add_rpc_devices(const std::string & servers) {
    auto rpc_servers = string_split<std::string>(servers, ',');
    if (rpc_servers.empty()) {
        throw std::invalid_argument("no RPC servers specified");
    }
    ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
    if (!rpc_reg) {
        throw std::invalid_argument("failed to find RPC backend");
    }
    typedef ggml_backend_reg_t (*ggml_backend_rpc_add_server_t)(const char * endpoint);
    ggml_backend_rpc_add_server_t ggml_backend_rpc_add_server_fn = (ggml_backend_rpc_add_server_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_server");
    if (!ggml_backend_rpc_add_server_fn) {
        throw std::invalid_argument("failed to find RPC add server function");
    }
    for (const auto & server : rpc_servers) {
        auto reg = ggml_backend_rpc_add_server_fn(server.c_str());
        ggml_backend_register(reg);
    }
}

bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map) {
    common_params dummy_params;
    common_params_context ctx_arg = common_params_parser_init(dummy_params, ex, nullptr);

    std::unordered_map<std::string, common_arg *> arg_to_options;
    for (auto & opt : ctx_arg.options) {
        for (const auto & arg : opt.args) {
            arg_to_options[arg] = &opt;
        }
        for (const auto & arg : opt.args_neg) {
            arg_to_options[arg] = &opt;
        }
    }

    // TODO @ngxson : find a way to deduplicate this code

    // handle command line arguments
    auto check_arg = [&](int i) {
        if (i+1 >= argc) {
            throw std::invalid_argument("expected value for argument");
        }
    };

    std::set<std::string> seen_args;

    for (int i = 1; i < argc; i++) {
        const std::string arg_prefix = "--";

        std::string arg = argv[i];
        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
            std::replace(arg.begin(), arg.end(), '_', '-');
        }
        if (arg_to_options.find(arg) == arg_to_options.end()) {
            throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
        }
        if (!seen_args.insert(arg).second) {
            LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
        }
        auto opt = *arg_to_options[arg];
        std::string val;
        if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
            // bool arg (need to reverse the meaning for negative args)
            bool is_neg = std::find(opt.args_neg.begin(), opt.args_neg.end(), arg) != opt.args_neg.end();
            val = is_neg ? "0" : "1";
        }
        if (opt.value_hint != nullptr) {
            // arg with single value
            check_arg(i);
            val = argv[++i];
        }
        if (opt.value_hint_2 != nullptr) {
            // TODO: support arg with 2 values
            throw std::invalid_argument("error: argument with 2 values is not yet supported\n");
        }
        out_map[opt] = val;
    }

    return true;
}

bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
    auto ctx_arg = common_params_parser_init(params, ex, print_usage);
    const common_params params_org = ctx_arg.params; // the example can modify the default params

    try {
        if (!common_params_parse_ex(argc, argv, ctx_arg)) {
            ctx_arg.params = params_org;
            return false;
        }
        if (ctx_arg.params.usage) {
            common_params_print_usage(ctx_arg);
            if (ctx_arg.print_usage) {
                ctx_arg.print_usage(argc, argv);
            }
            exit(0);
        }
        if (ctx_arg.params.completion) {
            common_params_print_completion(ctx_arg);
            exit(0);
        }
        params.lr.init();
    } catch (const std::invalid_argument & ex) {
        fprintf(stderr, "%s\n", ex.what());
        ctx_arg.params = params_org;
        return false;
    } catch (std::exception & ex) {
        fprintf(stderr, "%s\n", ex.what());
        exit(1); // for other exceptions, we exit with status code 1
    }

    return true;
}

static std::string list_builtin_chat_templates() {
    std::vector<const char *> supported_tmpl;
    int32_t res = llama_chat_builtin_templates(nullptr, 0);
    supported_tmpl.resize(res);
    res = llama_chat_builtin_templates(supported_tmpl.data(), supported_tmpl.size());
    std::ostringstream msg;
    for (auto & tmpl : supported_tmpl) {
        msg << tmpl << (&tmpl == &supported_tmpl.back() ? "" : ", ");
    }
    return msg.str();
}

bool common_arg_utils::is_truthy(const std::string & value) {
    return value == "on" || value == "enabled" || value == "true" || value == "1";
}

bool common_arg_utils::is_falsey(const std::string & value) {
    return value == "off" || value == "disabled" || value == "false" || value == "0";
}

bool common_arg_utils::is_autoy(const std::string & value) {
    return value == "auto" || value == "-1";
}

// Simple CSV parser that handles quoted fields and escaped quotes
// example:
//   input:  value1,"value, with, commas","value with ""escaped"" quotes",value4
//   output: [value1] [value, with, commas] [value with "escaped" quotes] [value4]
static std::vector<std::string> parse_csv_row(const std::string & input) {
    std::vector<std::string> fields;
    std::string field;
    bool in_quotes = false;

    for (size_t i = 0; i < input.length(); ++i) {
        char ch = input[i];

        if (ch == '"') {
            if (!in_quotes) {
                // start of quoted field (only valid if at beginning of field)
                if (!field.empty()) {
                    // quote appeared in middle of unquoted field, treat as literal
                    field += '"';
                } else {
                    in_quotes = true; // start
                }
            } else {
                if (i + 1 < input.length() && input[i + 1] == '"') {
                    // escaped quote: ""
                    field += '"';
                    ++i; // skip the next quote
                } else {
                    in_quotes = false; // end
                }
            }
        } else if (ch == ',') {
            if (in_quotes) {
                field += ',';
            } else {
                fields.push_back(std::move(field));
                field.clear();
            }
        } else {
            field += ch;
        }
    }

    // add the last field
    fields.push_back(std::move(field));

    return fields;
}

common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
    // per-example default params
    // we define here to make sure it's included in llama-gen-docs
    if (ex == LLAMA_EXAMPLE_COMPLETION) {
        params.use_jinja = false; // disable jinja by default

    } else if (ex == LLAMA_EXAMPLE_MTMD) {
        params.use_jinja = false; // disable jinja by default
        params.sampling.temp = 0.2; // lower temp by default for better quality

    } else if (ex == LLAMA_EXAMPLE_SERVER) {
        params.n_parallel = -1; // auto by default
    }

    params.use_color = tty_can_use_colors();

    // load dynamic backends
    ggml_backend_load_all();

    common_params_context ctx_arg(params);
    ctx_arg.print_usage = print_usage;
    ctx_arg.ex          = ex;

    std::string sampler_type_chars;
    std::string sampler_type_names;
    for (const auto & sampler : params.sampling.samplers) {
        sampler_type_chars += common_sampler_type_to_chr(sampler);
        sampler_type_names += common_sampler_type_to_str(sampler) + ";";
    }
    if (!sampler_type_names.empty()) {
        sampler_type_names.pop_back(); // remove last semicolon
    }

    /**
     * filter options by example
     * rules:
     * - all examples inherit options from LLAMA_EXAMPLE_COMMON
     * - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example
     * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example
     */
    auto add_opt = [&](common_arg arg) {
        if ((arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex)) {
            ctx_arg.options.push_back(std::move(arg));
        }
    };
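    // e.g. with ex == LLAMA_EXAMPLE_SERVER, an option tagged {LLAMA_EXAMPLE_COMMON} or
    // {LLAMA_EXAMPLE_SERVER} is registered, unless it explicitly excludes LLAMA_EXAMPLE_SERVER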
    add_opt(common_arg(
        {"-h", "--help", "--usage"},
        "print usage and exit",
        [](common_params & params) {
            params.usage = true;
        }
    ));
    add_opt(common_arg(
        {"--version"},
        "show version and build info",
        [](common_params &) {
            fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
            fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
            exit(0);
        }
    ));
    add_opt(common_arg(
        {"--license"},
        "show source code license and dependencies",
        [](common_params &) {
            for (int i = 0; LICENSES[i]; ++i) {
                printf("%s\n", LICENSES[i]);
            }
            exit(0);
        }
    ));
    add_opt(common_arg(
        {"-cl", "--cache-list"},
        "show list of models in cache",
        [](common_params &) {
            printf("model cache directory: %s\n", fs_get_cache_directory().c_str());
            auto models = common_list_cached_models();
            printf("number of models in cache: %zu\n", models.size());
            for (size_t i = 0; i < models.size(); i++) {
                auto & model = models[i];
                printf("%4d. %s\n", (int) i + 1, model.to_string().c_str());
            }
            exit(0);
        }
    ));
    add_opt(common_arg(
        {"--completion-bash"},
        "print source-able bash completion script for llama.cpp",
        [](common_params & params) {
            params.completion = true;
        }
    ));
    add_opt(common_arg(
        {"--verbose-prompt"},
        string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
        [](common_params & params) {
            params.verbose_prompt = true;
        }
    ));
    add_opt(common_arg(
        {"--display-prompt"},
        {"--no-display-prompt"},
        string_format("whether to print prompt at generation (default: %s)", params.display_prompt ? "true" : "false"),
        [](common_params & params, bool value) {
            params.display_prompt = value;
        }
    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
    add_opt(common_arg(
        {"-co", "--color"}, "[on|off|auto]",
        "Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')\n"
        "'auto' enables colors when output is to a terminal",
        [](common_params & params, const std::string & value) {
            if (is_truthy(value)) {
                params.use_color = true;
            } else if (is_falsey(value)) {
                params.use_color = false;
            } else if (is_autoy(value)) {
                params.use_color = tty_can_use_colors();
            } else {
                throw std::invalid_argument(
                    string_format("error: unknown value for --color: '%s'\n", value.c_str()));
            }
        }
    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
    add_opt(common_arg(
        {"-t", "--threads"}, "N",
        string_format("number of CPU threads to use during generation (default: %d)", params.cpuparams.n_threads),
        [](common_params & params, int value) {
            params.cpuparams.n_threads = value;
            if (params.cpuparams.n_threads <= 0) {
                params.cpuparams.n_threads = std::thread::hardware_concurrency();
            }
        }
    ).set_env("LLAMA_ARG_THREADS"));
    add_opt(common_arg(
        {"-tb", "--threads-batch"}, "N",
        "number of threads to use during batch and prompt processing (default: same as --threads)",
        [](common_params & params, int value) {
            params.cpuparams_batch.n_threads = value;
            if (params.cpuparams_batch.n_threads <= 0) {
                params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
            }
        }
    ));
    add_opt(common_arg(
        {"-C", "--cpu-mask"}, "M",
        "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")",
        [](common_params & params, const std::string & mask) {
            params.cpuparams.mask_valid = true;
            if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) {
                throw std::invalid_argument("invalid cpumask");
            }
        }
    ));
    add_opt(common_arg(
        {"-Cr", "--cpu-range"}, "lo-hi",
        "range of CPUs for affinity. Complements --cpu-mask",
        [](common_params & params, const std::string & range) {
            params.cpuparams.mask_valid = true;
            if (!parse_cpu_range(range, params.cpuparams.cpumask)) {
                throw std::invalid_argument("invalid range");
            }
        }
    ));
    add_opt(common_arg(
        {"--cpu-strict"}, "<0|1>",
        string_format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu),
        [](common_params & params, const std::string & value) {
            params.cpuparams.strict_cpu = std::stoul(value);
        }
    ));
    add_opt(common_arg(
        {"--prio"}, "N",
        string_format("set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: %d)\n", params.cpuparams.priority),
        [](common_params & params, int prio) {
            if (prio < GGML_SCHED_PRIO_LOW || prio > GGML_SCHED_PRIO_REALTIME) {
                throw std::invalid_argument("invalid value");
            }
            params.cpuparams.priority = (enum ggml_sched_priority) prio;
        }
    ));
    add_opt(common_arg(
        {"--poll"}, "<0...100>",
        string_format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll),
        [](common_params & params, const std::string & value) {
            params.cpuparams.poll = std::stoul(value);
        }
    ));
    add_opt(common_arg(
        {"-Cb", "--cpu-mask-batch"}, "M",
        "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)",
        [](common_params & params, const std::string & mask) {
            params.cpuparams_batch.mask_valid = true;
            if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) {
                throw std::invalid_argument("invalid cpumask");
            }
        }
    ));
    add_opt(common_arg(
        {"-Crb", "--cpu-range-batch"}, "lo-hi",
        "ranges of CPUs for affinity. Complements --cpu-mask-batch",
        [](common_params & params, const std::string & range) {
            params.cpuparams_batch.mask_valid = true;
            if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) {
                throw std::invalid_argument("invalid range");
            }
        }
    ));
    add_opt(common_arg(
        {"--cpu-strict-batch"}, "<0|1>",
        "use strict CPU placement (default: same as --cpu-strict)",
        [](common_params & params, int value) {
            params.cpuparams_batch.strict_cpu = value;
        }
    ));
    add_opt(common_arg(
        {"--prio-batch"}, "N",
        string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority),
        [](common_params & params, int prio) {
            if (prio < 0 || prio > 3) {
                throw std::invalid_argument("invalid value");
            }
            params.cpuparams_batch.priority = (enum ggml_sched_priority) prio;
        }
    ));
    add_opt(common_arg(
        {"--poll-batch"}, "<0|1>",
        "use polling to wait for work (default: same as --poll)",
        [](common_params & params, int value) {
            params.cpuparams_batch.poll = value;
        }
    ));
    add_opt(common_arg(
        {"-lcs", "--lookup-cache-static"}, "FNAME",
        "path to static lookup cache to use for lookup decoding (not updated by generation)",
        [](common_params & params, const std::string & value) {
            params.speculative.lookup_cache_static = value;
        }
    ).set_examples({LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-lcd", "--lookup-cache-dynamic"}, "FNAME",
        "path to dynamic lookup cache to use for lookup decoding (updated by generation)",
        [](common_params & params, const std::string & value) {
            params.speculative.lookup_cache_dynamic = value;
        }
    ).set_examples({LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-c", "--ctx-size"}, "N",
        string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx),
        [](common_params & params, int value) {
            params.n_ctx = value;
            if (value == 0) {
                // disable context reduction in llama_params_fit if the user explicitly requests the full context size:
                params.fit_params_min_ctx = UINT32_MAX;
            }
        }
    ).set_env("LLAMA_ARG_CTX_SIZE"));
    add_opt(common_arg(
        {"-n", "--predict", "--n-predict"}, "N",
        string_format(
            ex == LLAMA_EXAMPLE_COMPLETION
                ? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
                : "number of tokens to predict (default: %d, -1 = infinity)",
            params.n_predict),
        [](common_params & params, int value) {
            params.n_predict = value;
        }
    ).set_env("LLAMA_ARG_N_PREDICT"));
    add_opt(common_arg(
        {"-b", "--batch-size"}, "N",
        string_format("logical maximum batch size (default: %d)", params.n_batch),
        [](common_params & params, int value) {
            params.n_batch = value;
        }
    ).set_env("LLAMA_ARG_BATCH"));
    add_opt(common_arg(
        {"-ub", "--ubatch-size"}, "N",
        string_format("physical maximum batch size (default: %d)", params.n_ubatch),
        [](common_params & params, int value) {
            params.n_ubatch = value;
        }
    ).set_env("LLAMA_ARG_UBATCH"));
    add_opt(common_arg(
        {"--keep"}, "N",
        string_format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep),
        [](common_params & params, int value) {
            params.n_keep = value;
        }
    ));
    add_opt(common_arg(
        {"--swa-full"},
        string_format("use full-size SWA cache (default: %s)\n"
                      "[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)", params.swa_full ? "true" : "false"),
        [](common_params & params) {
            params.swa_full = true;
        }
    ).set_env("LLAMA_ARG_SWA_FULL"));
    add_opt(common_arg(
        {"--ctx-checkpoints", "--swa-checkpoints"}, "N",
        string_format("max number of context checkpoints to create per slot (default: %d)\n"
                      "[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)", params.n_ctx_checkpoints),
        [](common_params & params, int value) {
            params.n_ctx_checkpoints = value;
        }
    ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
    add_opt(common_arg(
        {"-cram", "--cache-ram"}, "N",
        string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n"
                      "[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
        [](common_params & params, int value) {
            params.cache_ram_mib = value;
        }
    ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
    add_opt(common_arg(
        {"-kvu", "--kv-unified"},
        {"-no-kvu", "--no-kv-unified"},
        "use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)",
        [](common_params & params, bool value) {
            params.kv_unified = value;
        }
    ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH}));
    add_opt(common_arg(
        {"--context-shift"},
        {"--no-context-shift"},
        string_format("whether to use context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
        [](common_params & params, bool value) {
            params.ctx_shift = value;
        }
    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
    add_opt(common_arg(
        {"--chunks"}, "N",
        string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
        [](common_params & params, int value) {
            params.n_chunks = value;
        }
    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
    add_opt(common_arg(
        {"-fa", "--flash-attn"}, "[on|off|auto]",
        string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')",
                      llama_flash_attn_type_name(params.flash_attn_type)),
        [](common_params & params, const std::string & value) {
            if (is_truthy(value)) {
                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
            } else if (is_falsey(value)) {
                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
            } else if (is_autoy(value)) {
                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
            } else {
                throw std::runtime_error(
                    string_format("error: unknown value for --flash-attn: '%s'\n", value.c_str()));
            }
        }
    ).set_env("LLAMA_ARG_FLASH_ATTN"));
    add_opt(common_arg(
        {"-p", "--prompt"}, "PROMPT",
        "prompt to start generation with; for system message, use -sys",
        [](common_params & params, const std::string & value) {
            params.prompt = value;
        }
    ).set_excludes({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-sys", "--system-prompt"}, "PROMPT",
        "system prompt to use with model (if applicable, depending on chat template)",
        [](common_params & params, const std::string & value) {
            params.system_prompt = value;
        }
    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_MTMD}));
1350 {"--perf"},
1351 {"--no-perf"},
1352 string_format("whether to enable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
1353 [](common_params & params, bool value) {
1354 params.no_perf = !value;
1355 params.sampling.no_perf = !value;
1356 }
1357 ).set_env("LLAMA_ARG_PERF"));
    add_opt(common_arg(
        {"--show-timings"},
        {"--no-show-timings"},
        string_format("whether to show timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
        [](common_params & params, bool value) {
            params.show_timings = value;
        }
    ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SHOW_TIMINGS"));
    add_opt(common_arg(
        {"-f", "--file"}, "FNAME",
        "a file containing the prompt (default: none)",
        [](common_params & params, const std::string & value) {
            params.prompt = read_file(value);
            // store the external file name in params
            params.prompt_file = value;
            if (!params.prompt.empty() && params.prompt.back() == '\n') {
                params.prompt.pop_back();
            }
        }
    ).set_excludes({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-sysf", "--system-prompt-file"}, "FNAME",
        "a file containing the system prompt (default: none)",
        [](common_params & params, const std::string & value) {
            params.system_prompt = read_file(value);
            if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') {
                params.system_prompt.pop_back();
            }
        }
    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
    add_opt(common_arg(
        {"--in-file"}, "FNAME",
        "an input file (use comma-separated values to specify multiple files)",
        [](common_params & params, const std::string & value) {
            for (const auto & item : parse_csv_row(value)) {
                std::ifstream file(item);
                if (!file) {
                    throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
                }
                params.in_files.push_back(item);
            }
        }
    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
    add_opt(common_arg(
        {"-bf", "--binary-file"}, "FNAME",
        "binary file containing the prompt (default: none)",
        [](common_params & params, const std::string & value) {
            std::ifstream file(value, std::ios::binary);
            if (!file) {
                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
            }
            // store the external file name in params
            params.prompt_file = value;
            std::ostringstream ss;
            ss << file.rdbuf();
            params.prompt = ss.str();
            fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str());
        }
    ).set_excludes({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-e", "--escape"},
        {"--no-escape"},
        string_format("whether to process escape sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
        [](common_params & params, bool value) {
            params.escape = value;
        }
    ));
1425 add_opt(common_arg(
1426 {"-ptc", "--print-token-count"}, "N",
1427 string_format("print token count every N tokens (default: %d)", params.n_print),
1428 [](common_params & params, int value) {
1429 params.n_print = value;
1430 }
1431 ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
1432 add_opt(common_arg(
1433 {"--prompt-cache"}, "FNAME",
1434 "file to cache prompt state for faster startup (default: none)",
1435 [](common_params & params, const std::string & value) {
1436 params.path_prompt_cache = value;
1437 }
1438 ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
1439 add_opt(common_arg(
1440 {"--prompt-cache-all"},
1441 "if specified, saves user input and generations to cache as well\n",
1442 [](common_params & params) {
1443 params.prompt_cache_all = true;
1444 }
1445 ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
1446 add_opt(common_arg(
1447 {"--prompt-cache-ro"},
1448 "if specified, uses the prompt cache but does not update it",
1449 [](common_params & params) {
1450 params.prompt_cache_ro = true;
1451 }
1452 ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
1453 add_opt(common_arg(
1454 {"-r", "--reverse-prompt"}, "PROMPT",
1455 "halt generation at PROMPT, return control in interactive mode\n",
1456 [](common_params & params, const std::string & value) {
1457 params.antiprompt.emplace_back(value);
1458 }
1459 ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
1460 add_opt(common_arg(
1461 {"-sp", "--special"},
1462 string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
1463 [](common_params & params) {
1464 params.special = true;
1465 }
1466 ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
1467 add_opt(common_arg(
1468 {"-cnv", "--conversation"},
1469 {"-no-cnv", "--no-conversation"},
1470 "whether to run in conversation mode:\n"
1471 "- does not print special tokens and suffix/prefix\n"
1472 "- interactive mode is also enabled\n"
1473 "(default: auto enabled if chat template is available)",
1474 [](common_params & params, bool value) {
1475 params.conversation_mode = value ? COMMON_CONVERSATION_MODE_ENABLED : COMMON_CONVERSATION_MODE_DISABLED;
1476 }
1477 ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
1478 add_opt(common_arg(
1479 {"-st", "--single-turn"},
1480 "run conversation for a single turn only, then exit when done\n"
1481 "will not be interactive if first turn is predefined with --prompt\n"
1482 "(default: false)",
1483 [](common_params & params) {
1484 params.single_turn = true;
1485 }
1486 ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
1487 add_opt(common_arg(
1488 {"-i", "--interactive"},
1489 string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
1490 [](common_params & params) {
1491 params.interactive = true;
1492 }
1493 ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
1494 add_opt(common_arg(
1495 {"-if", "--interactive-first"},
1496 string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
1497 [](common_params & params) {
1498 params.interactive_first = true;
1499 }
1500 ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
1501 add_opt(common_arg(
1502 {"-mli", "--multiline-input"},
1503 "allows you to write or paste multiple lines without ending each in '\\'",
1504 [](common_params & params) {
1505 params.multiline_input = true;
1506 }
1507 ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
1508 add_opt(common_arg(
1509 {"--in-prefix-bos"},
1510 "prefix BOS to user inputs, preceding the `--in-prefix` string",
1511 [](common_params & params) {
1512 params.input_prefix_bos = true;
1513 params.enable_chat_template = false;
1514 }
1515 ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
1516 add_opt(common_arg(
1517 {"--in-prefix"}, "STRING",
1518 "string to prefix user inputs with (default: empty)",
1519 [](common_params & params, const std::string & value) {
1520 params.input_prefix = value;
1521 params.enable_chat_template = false;
1522 }
1523 ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
1524 add_opt(common_arg(
1525 {"--in-suffix"}, "STRING",
1526 "string to suffix after user inputs with (default: empty)",
1527 [](common_params & params, const std::string & value) {
1528 params.input_suffix = value;
1529 params.enable_chat_template = false;
1530 }
1531 ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
    add_opt(common_arg(
        {"--warmup"},
        {"--no-warmup"},
        string_format("whether to perform warmup with an empty run (default: %s)", params.warmup ? "enabled" : "disabled"),
        [](common_params & params, bool value) {
            params.warmup = value;
        }
    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_DEBUG}));
    add_opt(common_arg(
        {"--spm-infill"},
        string_format(
            "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)",
            params.spm_infill ? "enabled" : "disabled"
        ),
        [](common_params & params) {
            params.spm_infill = true;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--samplers"}, "SAMPLERS",
        string_format("samplers that will be used for generation, in order, separated by ';'\n(default: %s)", sampler_type_names.c_str()),
        [](common_params & params, const std::string & value) {
            const auto sampler_names = string_split<std::string>(value, ';');
            params.sampling.samplers = common_sampler_types_from_names(sampler_names, true);
            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS;
        }
    ).set_sparam());
    add_opt(common_arg(
        {"-s", "--seed"}, "SEED",
        string_format("RNG seed (default: %d, use random seed for %d)", params.sampling.seed, LLAMA_DEFAULT_SEED),
        [](common_params & params, const std::string & value) {
            params.sampling.seed = std::stoul(value);
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--sampler-seq", "--sampling-seq"}, "SEQUENCE",
        string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
        [](common_params & params, const std::string & value) {
            params.sampling.samplers = common_sampler_types_from_chars(value);
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--ignore-eos"},
        "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)",
        [](common_params & params) {
            params.sampling.ignore_eos = true;
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--temp"}, "N",
        string_format("temperature (default: %.2f)", (double)params.sampling.temp),
        [](common_params & params, const std::string & value) {
            params.sampling.temp = std::stof(value);
            params.sampling.temp = std::max(params.sampling.temp, 0.0f);
            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP;
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--top-k"}, "N",
        string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k),
        [](common_params & params, int value) {
            params.sampling.top_k = value;
            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K;
        }
    ).set_sparam().set_env("LLAMA_ARG_TOP_K"));
    add_opt(common_arg(
        {"--top-p"}, "N",
        string_format("top-p sampling (default: %.2f, 1.0 = disabled)", (double)params.sampling.top_p),
        [](common_params & params, const std::string & value) {
            params.sampling.top_p = std::stof(value);
            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P;
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--min-p"}, "N",
        string_format("min-p sampling (default: %.2f, 0.0 = disabled)", (double)params.sampling.min_p),
        [](common_params & params, const std::string & value) {
            params.sampling.min_p = std::stof(value);
            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P;
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--top-nsigma"}, "N",
        string_format("top-n-sigma sampling (default: %.2f, -1.0 = disabled)", (double)params.sampling.top_n_sigma),
        [](common_params & params, const std::string & value) {
            params.sampling.top_n_sigma = std::stof(value);
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--xtc-probability"}, "N",
        string_format("xtc probability (default: %.2f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
        [](common_params & params, const std::string & value) {
            params.sampling.xtc_probability = std::stof(value);
            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY;
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--xtc-threshold"}, "N",
        string_format("xtc threshold (default: %.2f, 1.0 = disabled)", (double)params.sampling.xtc_threshold),
        [](common_params & params, const std::string & value) {
            params.sampling.xtc_threshold = std::stof(value);
            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD;
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--typical"}, "N",
        string_format("locally typical sampling, parameter p (default: %.2f, 1.0 = disabled)", (double)params.sampling.typ_p),
        [](common_params & params, const std::string & value) {
            params.sampling.typ_p = std::stof(value);
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--repeat-last-n"}, "N",
        string_format("last n tokens to consider for the repeat penalty (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n),
        [](common_params & params, int value) {
            if (value < -1) {
                throw std::runtime_error(string_format("error: invalid repeat-last-n = %d\n", value));
            }
            params.sampling.penalty_last_n = value;
            params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N;
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--repeat-penalty"}, "N",
        string_format("penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)", (double)params.sampling.penalty_repeat),
        [](common_params & params, const std::string & value) {
            params.sampling.penalty_repeat = std::stof(value);
            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT;
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--presence-penalty"}, "N",
        string_format("repeat alpha presence penalty (default: %.2f, 0.0 = disabled)", (double)params.sampling.penalty_present),
        [](common_params & params, const std::string & value) {
            params.sampling.penalty_present = std::stof(value);
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--frequency-penalty"}, "N",
        string_format("repeat alpha frequency penalty (default: %.2f, 0.0 = disabled)", (double)params.sampling.penalty_freq),
        [](common_params & params, const std::string & value) {
            params.sampling.penalty_freq = std::stof(value);
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--dry-multiplier"}, "N",
        string_format("set DRY sampling multiplier (default: %.2f, 0.0 = disabled)", (double)params.sampling.dry_multiplier),
        [](common_params & params, const std::string & value) {
            params.sampling.dry_multiplier = std::stof(value);
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--dry-base"}, "N",
        string_format("set DRY sampling base value (default: %.2f)", (double)params.sampling.dry_base),
        [](common_params & params, const std::string & value) {
            float potential_base = std::stof(value);
            // values below 1.0 are silently ignored (dry_base must be >= 1.0)
            if (potential_base >= 1.0f) {
                params.sampling.dry_base = potential_base;
            }
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--dry-allowed-length"}, "N",
        string_format("set allowed length for DRY sampling (default: %d)", params.sampling.dry_allowed_length),
        [](common_params & params, int value) {
            params.sampling.dry_allowed_length = value;
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--dry-penalty-last-n"}, "N",
        string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n),
        [](common_params & params, int value) {
            if (value < -1) {
                throw std::runtime_error(string_format("error: invalid dry-penalty-last-n = %d\n", value));
            }
            params.sampling.dry_penalty_last_n = value;
        }
    ).set_sparam());
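    // note on the option below: the first --dry-sequence-breaker on the command
    // line clears the built-in default breakers, and every following occurrence
    // appends to the list; passing "none" leaves the list empty. the clear-once
    // state lives in a function-local static, so it persists for the lifetime
    // of the process.
    // example (assumed invocation):
    //   --dry-sequence-breaker "\n" --dry-sequence-breaker ":"
    //   -> breakers = {"\n", ":"}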
    add_opt(common_arg(
        {"--dry-sequence-breaker"}, "STRING",
        string_format("add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers\n",
            params.sampling.dry_sequence_breakers.empty() ? "none" :
            std::accumulate(std::next(params.sampling.dry_sequence_breakers.begin()),
                params.sampling.dry_sequence_breakers.end(),
                std::string("'") + (params.sampling.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sampling.dry_sequence_breakers[0]) + "'",
                [](const std::string & a, const std::string & b) {
                    std::string formatted_b = (b == "\n") ? "\\n" : b;
                    return a + ", '" + formatted_b + "'";
                }).c_str()),
        [](common_params & params, const std::string & value) {
            static bool defaults_cleared = false;

            if (!defaults_cleared) {
                params.sampling.dry_sequence_breakers.clear();
                defaults_cleared = true;
            }

            if (value == "none") {
                params.sampling.dry_sequence_breakers.clear();
            } else {
                params.sampling.dry_sequence_breakers.emplace_back(value);
            }
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--adaptive-target"}, "N",
        string_format("adaptive-p: select tokens near this probability (valid range 0.0 "
                      "to 1.0; negative = disabled) (default: %.2f)\n"
                      "[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927)",
                      (double)params.sampling.adaptive_target),
        [](common_params & params, const std::string & value) {
            params.sampling.adaptive_target = std::stof(value);
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--adaptive-decay"}, "N",
        string_format("adaptive-p: decay rate for target adaptation over time. lower values "
                      "are more reactive, higher values are more stable.\n"
                      "(valid range 0.0 to 0.99) (default: %.2f)",
                      (double)params.sampling.adaptive_decay),
        [](common_params & params, const std::string & value) {
            params.sampling.adaptive_decay = std::stof(value);
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--dynatemp-range"}, "N",
        string_format("dynamic temperature range (default: %.2f, 0.0 = disabled)", (double)params.sampling.dynatemp_range),
        [](common_params & params, const std::string & value) {
            params.sampling.dynatemp_range = std::stof(value);
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--dynatemp-exp"}, "N",
        string_format("dynamic temperature exponent (default: %.2f)", (double)params.sampling.dynatemp_exponent),
        [](common_params & params, const std::string & value) {
            params.sampling.dynatemp_exponent = std::stof(value);
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--mirostat"}, "N",
        string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n"
                      "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat),
        [](common_params & params, int value) {
            params.sampling.mirostat = value;
            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT;
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--mirostat-lr"}, "N",
        string_format("Mirostat learning rate, parameter eta (default: %.2f)", (double)params.sampling.mirostat_eta),
        [](common_params & params, const std::string & value) {
            params.sampling.mirostat_eta = std::stof(value);
            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA;
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--mirostat-ent"}, "N",
        string_format("Mirostat target entropy, parameter tau (default: %.2f)", (double)params.sampling.mirostat_tau),
        [](common_params & params, const std::string & value) {
            params.sampling.mirostat_tau = std::stof(value);
            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU;
        }
    ).set_sparam());
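    // note on the option below: the value is parsed as <token id><sign><bias>,
    // where the stream first extracts the integer token id, then a single '+'
    // or '-' character, and finally the remainder as a float. fractional biases
    // such as "15043+1.5" are therefore accepted as well.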
    add_opt(common_arg(
        {"-l", "--logit-bias"}, "TOKEN_ID(+/-)BIAS",
        "modifies the likelihood of a token appearing in the completion,\n"
        "e.g. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
        "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'",
        [](common_params & params, const std::string & value) {
            std::stringstream ss(value);
            llama_token key;
            char sign;
            std::string value_str;
            try {
                if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
                    const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
                    params.sampling.logit_bias.push_back({key, bias});
                } else {
                    throw std::invalid_argument("invalid input format");
                }
            } catch (const std::exception &) {
                throw std::invalid_argument("invalid input format");
            }
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--grammar"}, "GRAMMAR",
        string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sampling.grammar.c_str()),
        [](common_params & params, const std::string & value) {
            params.sampling.grammar = value;
        }
    ).set_sparam());
    add_opt(common_arg(
        {"--grammar-file"}, "FNAME",
        "file to read grammar from",
        [](common_params & params, const std::string & value) {
            params.sampling.grammar = read_file(value);
        }
    ).set_sparam());
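    // note on the two options below: the schema is converted to a GBNF grammar
    // via json_schema_to_grammar(), so it constrains generation the same way
    // --grammar does. a minimal (assumed) example:
    //   -j '{"type":"object","required":["name"],"properties":{"name":{"type":"string"}}}'
    // constrains the output to a JSON object with a string "name" field.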
    add_opt(common_arg(
        {"-j", "--json-schema"}, "SCHEMA",
        "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + examples/json_schema_to_grammar.py instead",
        [](common_params & params, const std::string & value) {
            params.sampling.grammar = json_schema_to_grammar(json::parse(value));
        }
    ).set_sparam());
    add_opt(common_arg(
        {"-jf", "--json-schema-file"}, "FILE",
        "File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + examples/json_schema_to_grammar.py instead",
        [](common_params & params, const std::string & value) {
            params.sampling.grammar = json_schema_to_grammar(json::parse(read_file(value)));
        }
    ).set_sparam());
    add_opt(common_arg(
        {"-bs", "--backend-sampling"},
        "enable backend sampling (experimental) (default: disabled)",
        [](common_params & params) {
            params.sampling.backend_sampling = true;
        }
    ).set_sparam().set_env("LLAMA_ARG_BACKEND_SAMPLING"));
    add_opt(common_arg(
        {"--pooling"}, "{none,mean,cls,last,rank}",
        "pooling type for embeddings, use model default if unspecified",
        [](common_params & params, const std::string & value) {
            /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
            else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
            else if (value == "cls")  { params.pooling_type = LLAMA_POOLING_TYPE_CLS;  }
            else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
            else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; }
            else { throw std::invalid_argument("invalid value"); }
        }
    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_POOLING"));
    add_opt(common_arg(
        {"--attention"}, "{causal,non-causal}",
        "attention type for embeddings, use model default if unspecified",
        [](common_params & params, const std::string & value) {
            /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
            else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
            else { throw std::invalid_argument("invalid value"); }
        }
    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
    add_opt(common_arg(
        {"--rope-scaling"}, "{none,linear,yarn}",
        "RoPE frequency scaling method, defaults to linear unless specified by the model",
        [](common_params & params, const std::string & value) {
            /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
            else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
            else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
            else { throw std::invalid_argument("invalid value"); }
        }
    ).set_env("LLAMA_ARG_ROPE_SCALING_TYPE"));
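    // note: --rope-scale N is stored as its reciprocal, so it is equivalent to
    // --rope-freq-scale 1/N; e.g. --rope-scale 2 sets rope_freq_scale = 0.5 and
    // roughly doubles the usable context of a linearly-scaled model.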
    add_opt(common_arg(
        {"--rope-scale"}, "N",
        "RoPE context scaling factor, expands context by a factor of N",
        [](common_params & params, const std::string & value) {
            params.rope_freq_scale = 1.0f / std::stof(value);
        }
    ).set_env("LLAMA_ARG_ROPE_SCALE"));
    add_opt(common_arg(
        {"--rope-freq-base"}, "N",
        "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)",
        [](common_params & params, const std::string & value) {
            params.rope_freq_base = std::stof(value);
        }
    ).set_env("LLAMA_ARG_ROPE_FREQ_BASE"));
    add_opt(common_arg(
        {"--rope-freq-scale"}, "N",
        "RoPE frequency scaling factor, expands context by a factor of 1/N",
        [](common_params & params, const std::string & value) {
            params.rope_freq_scale = std::stof(value);
        }
    ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE"));
    add_opt(common_arg(
        {"--yarn-orig-ctx"}, "N",
        string_format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
        [](common_params & params, int value) {
            params.yarn_orig_ctx = value;
        }
    ).set_env("LLAMA_ARG_YARN_ORIG_CTX"));
    add_opt(common_arg(
        {"--yarn-ext-factor"}, "N",
        string_format("YaRN: extrapolation mix factor (default: %.2f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
        [](common_params & params, const std::string & value) {
            params.yarn_ext_factor = std::stof(value);
        }
    ).set_env("LLAMA_ARG_YARN_EXT_FACTOR"));
    add_opt(common_arg(
        {"--yarn-attn-factor"}, "N",
        string_format("YaRN: scale sqrt(t) or attention magnitude (default: %.2f)", (double)params.yarn_attn_factor),
        [](common_params & params, const std::string & value) {
            params.yarn_attn_factor = std::stof(value);
        }
    ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR"));
    add_opt(common_arg(
        {"--yarn-beta-slow"}, "N",
        string_format("YaRN: high correction dim or alpha (default: %.2f)", (double)params.yarn_beta_slow),
        [](common_params & params, const std::string & value) {
            params.yarn_beta_slow = std::stof(value);
        }
    ).set_env("LLAMA_ARG_YARN_BETA_SLOW"));
    add_opt(common_arg(
        {"--yarn-beta-fast"}, "N",
        string_format("YaRN: low correction dim or beta (default: %.2f)", (double)params.yarn_beta_fast),
        [](common_params & params, const std::string & value) {
            params.yarn_beta_fast = std::stof(value);
        }
    ).set_env("LLAMA_ARG_YARN_BETA_FAST"));
    add_opt(common_arg(
        {"-gan", "--grp-attn-n"}, "N",
        string_format("group-attention factor (default: %d)", params.grp_attn_n),
        [](common_params & params, int value) {
            params.grp_attn_n = value;
        }
    ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_PASSKEY}));
    add_opt(common_arg(
        {"-gaw", "--grp-attn-w"}, "N",
        string_format("group-attention width (default: %d)", params.grp_attn_w),
        [](common_params & params, int value) {
            params.grp_attn_w = value;
        }
    ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_COMPLETION}));
    add_opt(common_arg(
        {"-kvo", "--kv-offload"},
        {"-nkvo", "--no-kv-offload"},
        string_format("whether to enable KV cache offloading (default: %s)", params.no_kv_offload ? "disabled" : "enabled"),
        [](common_params & params, bool value) {
            params.no_kv_offload = !value;
        }
    ).set_env("LLAMA_ARG_KV_OFFLOAD"));
    add_opt(common_arg(
        {"--repack"},
        {"-nr", "--no-repack"},
        string_format("whether to enable weight repacking (default: %s)", params.no_extra_bufts ? "disabled" : "enabled"),
        [](common_params & params, bool value) {
            params.no_extra_bufts = !value;
        }
    ).set_env("LLAMA_ARG_REPACK"));
    add_opt(common_arg(
        {"--no-host"},
        "bypass host buffer allowing extra buffers to be used",
        [](common_params & params) {
            params.no_host = true;
        }
    ).set_env("LLAMA_ARG_NO_HOST"));
    add_opt(common_arg(
        {"-ctk", "--cache-type-k"}, "TYPE",
        string_format(
            "KV cache data type for K\n"
            "allowed values: %s\n"
            "(default: %s)",
            get_all_kv_cache_types().c_str(),
            ggml_type_name(params.cache_type_k)
        ),
        [](common_params & params, const std::string & value) {
            params.cache_type_k = kv_cache_type_from_str(value);
        }
    ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
    add_opt(common_arg(
        {"-ctv", "--cache-type-v"}, "TYPE",
        string_format(
            "KV cache data type for V\n"
            "allowed values: %s\n"
            "(default: %s)",
            get_all_kv_cache_types().c_str(),
            ggml_type_name(params.cache_type_v)
        ),
        [](common_params & params, const std::string & value) {
            params.cache_type_v = kv_cache_type_from_str(value);
        }
    ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
    add_opt(common_arg(
        {"--hellaswag"},
        "compute HellaSwag score over random tasks from datafile supplied with -f",
        [](common_params & params) {
            params.hellaswag = true;
        }
    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
    add_opt(common_arg(
        {"--hellaswag-tasks"}, "N",
        string_format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks),
        [](common_params & params, int value) {
            params.hellaswag_tasks = value;
        }
    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
    add_opt(common_arg(
        {"--winogrande"},
        "compute Winogrande score over random tasks from datafile supplied with -f",
        [](common_params & params) {
            params.winogrande = true;
        }
    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
    add_opt(common_arg(
        {"--winogrande-tasks"}, "N",
        string_format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks),
        [](common_params & params, int value) {
            params.winogrande_tasks = value;
        }
    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
    add_opt(common_arg(
        {"--multiple-choice"},
        "compute multiple choice score over random tasks from datafile supplied with -f",
        [](common_params & params) {
            params.multiple_choice = true;
        }
    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
    add_opt(common_arg(
        {"--multiple-choice-tasks"}, "N",
        string_format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks),
        [](common_params & params, int value) {
            params.multiple_choice_tasks = value;
        }
    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
    add_opt(common_arg(
        {"--kl-divergence"},
        "compute KL-divergence to logits provided via --kl-divergence-base",
        [](common_params & params) {
            params.kl_divergence = true;
        }
    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
    add_opt(common_arg(
        {"--save-all-logits", "--kl-divergence-base"}, "FNAME",
        "set logits file",
        [](common_params & params, const std::string & value) {
            params.logits_file = value;
        }
    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
    add_opt(common_arg(
        {"--ppl-stride"}, "N",
        string_format("stride for perplexity calculation (default: %d)", params.ppl_stride),
        [](common_params & params, int value) {
            params.ppl_stride = value;
        }
    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
    add_opt(common_arg(
        {"--ppl-output-type"}, "<0|1>",
        string_format("output type for perplexity calculation (default: %d)", params.ppl_output_type),
        [](common_params & params, int value) {
            params.ppl_output_type = value;
        }
    ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
    add_opt(common_arg(
        {"-dt", "--defrag-thold"}, "N",
        "KV cache defragmentation threshold (DEPRECATED)",
        [](common_params & params, const std::string & value) {
            GGML_UNUSED(params);
            GGML_UNUSED(value);
            LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
        }
    ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
    if (ex == LLAMA_EXAMPLE_SERVER) {
        // this is to make sure this option appears in the server-specific section of the help message
        add_opt(common_arg(
            {"-np", "--parallel"}, "N",
            string_format("number of server slots (default: %d, -1 = auto)", params.n_parallel),
            [](common_params & params, int value) {
                if (value == 0) {
                    throw std::invalid_argument("error: invalid value for n_parallel\n");
                }
                params.n_parallel = value;
            }
        ).set_env("LLAMA_ARG_N_PARALLEL").set_examples({LLAMA_EXAMPLE_SERVER}));
    } else {
        add_opt(common_arg(
            {"-np", "--parallel"}, "N",
            string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
            [](common_params & params, int value) {
                params.n_parallel = value;
            }
        ).set_env("LLAMA_ARG_N_PARALLEL"));
    }
    add_opt(common_arg(
        {"-ns", "--sequences"}, "N",
        string_format("number of sequences to decode (default: %d)", params.n_sequences),
        [](common_params & params, int value) {
            params.n_sequences = value;
        }
    ).set_examples({LLAMA_EXAMPLE_PARALLEL}));
    add_opt(common_arg(
        {"-cb", "--cont-batching"},
        {"-nocb", "--no-cont-batching"},
        string_format("whether to enable continuous batching (a.k.a. dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
        [](common_params & params, bool value) {
            params.cont_batching = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
    add_opt(common_arg(
        {"-mm", "--mmproj"}, "FILE",
        "path to a multimodal projector file. see tools/mtmd/README.md\n"
        "note: if -hf is used, this argument can be omitted",
        [](common_params & params, const std::string & value) {
            params.mmproj.path = value;
        }
    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
    add_opt(common_arg(
        {"-mmu", "--mmproj-url"}, "URL",
        "URL to a multimodal projector file. see tools/mtmd/README.md",
        [](common_params & params, const std::string & value) {
            params.mmproj.url = value;
        }
    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
    add_opt(common_arg(
        {"--mmproj-auto"},
        {"--no-mmproj", "--no-mmproj-auto"},
        string_format("whether to use the multimodal projector file (if available), useful when using -hf (default: %s)", params.no_mmproj ? "disabled" : "enabled"),
        [](common_params & params, bool value) {
            params.no_mmproj = !value;
        }
    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO"));
    add_opt(common_arg(
        {"--mmproj-offload"},
        {"--no-mmproj-offload"},
        string_format("whether to enable GPU offloading for the multimodal projector (default: %s)", params.mmproj_use_gpu ? "enabled" : "disabled"),
        [](common_params & params, bool value) {
            params.mmproj_use_gpu = value;
        }
    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
    add_opt(common_arg(
        {"--image", "--audio"}, "FILE",
        "path to an image or audio file. use with multimodal models, use comma-separated values for multiple files",
        [](common_params & params, const std::string & value) {
            for (const auto & item : parse_csv_row(value)) {
                params.image.emplace_back(item);
            }
        }
    ).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI}));
    add_opt(common_arg(
        {"--image-min-tokens"}, "N",
        "minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
        [](common_params & params, int value) {
            params.image_min_tokens = value;
        }
    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MIN_TOKENS"));
    add_opt(common_arg(
        {"--image-max-tokens"}, "N",
        "maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
        [](common_params & params, int value) {
            params.image_max_tokens = value;
        }
    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS"));
    if (llama_supports_rpc()) {
        add_opt(common_arg(
            {"--rpc"}, "SERVERS",
            "comma separated list of RPC servers (host:port)",
            [](common_params & params, const std::string & value) {
                add_rpc_devices(value);
                GGML_UNUSED(params);
            }
        ).set_env("LLAMA_ARG_RPC"));
    }
    add_opt(common_arg(
        {"--mlock"},
        "force system to keep model in RAM rather than swapping or compressing",
        [](common_params & params) {
            params.use_mlock = true;
        }
    ).set_env("LLAMA_ARG_MLOCK"));
    add_opt(common_arg(
        {"--mmap"},
        {"--no-mmap"},
        string_format("whether to memory-map the model (if mmap is disabled, loading is slower but may reduce pageouts when not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
        [](common_params & params, bool value) {
            params.use_mmap = value;
        }
    ).set_env("LLAMA_ARG_MMAP"));
    add_opt(common_arg(
        {"-dio", "--direct-io"},
        {"-ndio", "--no-direct-io"},
        string_format("use DirectIO if available (default: %s)", params.use_direct_io ? "enabled" : "disabled"),
        [](common_params & params, bool value) {
            params.use_direct_io = value;
        }
    ).set_env("LLAMA_ARG_DIO"));
    add_opt(common_arg(
        {"--numa"}, "TYPE",
        "attempt optimizations that help on some NUMA systems\n"
        "- distribute: spread execution evenly over all nodes\n"
        "- isolate: only spawn threads on CPUs on the node that execution started on\n"
        "- numactl: use the CPU map provided by numactl\n"
        "if run without this previously, it is recommended to drop the system page cache before using this\n"
        "see https://github.com/ggml-org/llama.cpp/issues/1437",
        [](common_params & params, const std::string & value) {
            /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
            else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
            else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
            else { throw std::invalid_argument("invalid value"); }
        }
    ).set_env("LLAMA_ARG_NUMA"));
    add_opt(common_arg(
        {"-dev", "--device"}, "<dev1,dev2,..>",
        "comma-separated list of devices to use for offloading (none = don't offload)\n"
        "use --list-devices to see a list of available devices",
        [](common_params & params, const std::string & value) {
            params.devices = parse_device_list(value);
        }
    ).set_env("LLAMA_ARG_DEVICE"));
    add_opt(common_arg(
        {"--list-devices"},
        "print list of available devices and exit",
        [](common_params &) {
            std::vector<ggml_backend_dev_t> devices;
            for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
                auto * dev = ggml_backend_dev_get(i);
                if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
                    devices.push_back(dev);
                }
            }
            printf("Available devices:\n");
            for (auto * dev : devices) {
                size_t free, total;
                ggml_backend_dev_memory(dev, &free, &total);
                printf(" %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
            }
            exit(0);
        }
    ));
    add_opt(common_arg(
        {"-ot", "--override-tensor"}, "<tensor name pattern>=<buffer type>,...",
        "override tensor buffer type",
        [](common_params & params, const std::string & value) {
            parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
        }
    ).set_env("LLAMA_ARG_OVERRIDE_TENSOR"));
    add_opt(common_arg(
        {"-otd", "--override-tensor-draft"}, "<tensor name pattern>=<buffer type>,...",
        "override tensor buffer type for draft model",
        [](common_params & params, const std::string & value) {
            parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
    add_opt(common_arg(
        {"-cmoe", "--cpu-moe"},
        "keep all Mixture of Experts (MoE) weights in the CPU",
        [](common_params & params) {
            params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
        }
    ).set_env("LLAMA_ARG_CPU_MOE"));
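    // note on the two --n-cpu-moe* options below: tensor_buft_overrides stores
    // raw `const char *` patterns, so the generated regex strings must outlive
    // argument parsing. a std::list is used (rather than std::vector) because
    // it never relocates its elements, keeping the c_str() pointers stable as
    // more entries are appended.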
    add_opt(common_arg(
        {"-ncmoe", "--n-cpu-moe"}, "N",
        "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
        [](common_params & params, int value) {
            if (value < 0) {
                throw std::invalid_argument("invalid value");
            }
            for (int i = 0; i < value; ++i) {
                // keep strings alive and avoid leaking memory by storing them in a static list
                static std::list<std::string> buft_overrides;
                buft_overrides.push_back(llm_ffn_exps_block_regex(i));
                params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
            }
        }
    ).set_env("LLAMA_ARG_N_CPU_MOE"));
    add_opt(common_arg(
        {"-cmoed", "--cpu-moe-draft"},
        "keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
        [](common_params & params) {
            params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
    add_opt(common_arg(
        {"-ncmoed", "--n-cpu-moe-draft"}, "N",
        "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
        [](common_params & params, int value) {
            if (value < 0) {
                throw std::invalid_argument("invalid value");
            }
            for (int i = 0; i < value; ++i) {
                // same lifetime trick as -ncmoe above
                static std::list<std::string> buft_overrides_draft;
                buft_overrides_draft.push_back(llm_ffn_exps_block_regex(i));
                params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
            }
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
    GGML_ASSERT(params.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0
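    // note on -ngl below: negative values serve as sentinels in n_gpu_layers,
    // -1 meaning "auto" and -2 meaning "all"; an explicit layer count is stored
    // as-is. the assert above guarantees the compiled-in default is one of the
    // two sentinels, so the help text can describe it as 'auto' or 'all'.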
    add_opt(common_arg(
        {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
        string_format("max. number of layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)", params.n_gpu_layers == -1 ? "auto" : "all"),
        [](common_params & params, const std::string & value) {
            if (value == "auto") {
                params.n_gpu_layers = -1;
            } else if (value == "all") {
                params.n_gpu_layers = -2;
            } else {
                params.n_gpu_layers = std::stoi(value);
            }
            if (!llama_supports_gpu_offload()) {
                fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
                fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
                fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
            }
        }
    ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
    add_opt(common_arg(
        {"-sm", "--split-mode"}, "{none,layer,row}",
        "how to split the model across multiple GPUs, one of:\n"
        "- none: use one GPU only\n"
        "- layer (default): split layers and KV across GPUs\n"
        "- row: split rows across GPUs",
        [](common_params & params, const std::string & value) {
            std::string arg_next = value;
            if (arg_next == "none") {
                params.split_mode = LLAMA_SPLIT_MODE_NONE;
            } else if (arg_next == "layer") {
                params.split_mode = LLAMA_SPLIT_MODE_LAYER;
            } else if (arg_next == "row") {
                params.split_mode = LLAMA_SPLIT_MODE_ROW;
            } else {
                throw std::invalid_argument("invalid value");
            }
            if (!llama_supports_gpu_offload()) {
                fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the split mode has no effect.\n");
            }
        }
    ).set_env("LLAMA_ARG_SPLIT_MODE"));
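    // note on --tensor-split below: the values are relative proportions rather
    // than absolute sizes, and both ',' and '/' are accepted as separators.
    // example (assumed invocation): --tensor-split 3,1 assigns roughly 75% of
    // the model to device 0 and 25% to device 1; unspecified devices get 0.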
    add_opt(common_arg(
        {"-ts", "--tensor-split"}, "N0,N1,N2,...",
        "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1",
        [](common_params & params, const std::string & value) {
            std::string arg_next = value;

            // split string by , and /
            const std::regex regex{ R"([,/]+)" };
            std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
            std::vector<std::string> split_arg{ it, {} };
            if (split_arg.size() >= llama_max_devices()) {
                throw std::invalid_argument(
                    string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
                );
            }
            for (size_t i = 0; i < llama_max_devices(); ++i) {
                if (i < split_arg.size()) {
                    params.tensor_split[i] = std::stof(split_arg[i]);
                } else {
                    params.tensor_split[i] = 0.0f;
                }
            }
            if (!llama_supports_gpu_offload()) {
                fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting a tensor split has no effect.\n");
            }
        }
    ).set_env("LLAMA_ARG_TENSOR_SPLIT"));
    add_opt(common_arg(
        {"-mg", "--main-gpu"}, "INDEX",
        string_format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
        [](common_params & params, int value) {
            params.main_gpu = value;
            if (!llama_supports_gpu_offload()) {
                fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n");
            }
        }
    ).set_env("LLAMA_ARG_MAIN_GPU"));
    add_opt(common_arg(
        { "-fit", "--fit" }, "[on|off]",
        string_format("whether to adjust unset arguments to fit in device memory ('on' or 'off', default: '%s')", params.fit_params ? "on" : "off"),
        [](common_params & params, const std::string & value) {
            if (is_truthy(value)) {
                params.fit_params = true;
            } else if (is_falsey(value)) {
                params.fit_params = false;
            } else {
                throw std::runtime_error(
                    string_format("error: unknown value for --fit: '%s'\n", value.c_str()));
            }
        }
    ).set_env("LLAMA_ARG_FIT"));
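    // note on --fit-target below: values are given in MiB and converted to
    // bytes; a single value is broadcast to every device, while a list assigns
    // per-device margins in order. example (assumed invocation):
    //   --fit-target 1024      -> keep ~1 GiB of margin on every device
    //   --fit-target 1024,512  -> 1 GiB margin on device 0, 512 MiB on device 1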
    add_opt(common_arg(
        { "-fitt", "--fit-target" }, "MiB0,MiB1,MiB2,...",
        string_format("target margin per device for --fit, comma-separated list of values, "
                      "single value is broadcast across all devices, default: %zu", params.fit_params_target[0]/(1024*1024)),
        [](common_params & params, const std::string & value) {
            std::string arg_next = value;

            // split string by , and /
            const std::regex regex{ R"([,/]+)" };
            std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
            std::vector<std::string> split_arg{ it, {} };
            if (split_arg.size() >= llama_max_devices()) {
                throw std::invalid_argument(
                    string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
                );
            }
            if (split_arg.size() == 1) {
                std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoul(split_arg[0]) * 1024*1024);
                return;
            }
            for (size_t i = 0; i < split_arg.size(); i++) {
                params.fit_params_target[i] = std::stoul(split_arg[i]) * 1024*1024;
            }
        }
    ).set_env("LLAMA_ARG_FIT_TARGET"));
    add_opt(common_arg(
        { "-fitc", "--fit-ctx" }, "N",
        string_format("minimum ctx size that can be set by --fit option, default: %" PRIu32, params.fit_params_min_ctx),
        [](common_params & params, int value) {
            params.fit_params_min_ctx = value;
        }
    ).set_env("LLAMA_ARG_FIT_CTX"));
    add_opt(common_arg(
        {"--check-tensors"},
        string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
        [](common_params & params) {
            params.check_tensors = true;
        }
    ));
    add_opt(common_arg(
        {"--override-kv"}, "KEY=TYPE:VALUE,...",
        "advanced option to override model metadata by key. to specify multiple overrides, use comma-separated values.\n"
        "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false",
        [](common_params & params, const std::string & value) {
            for (const auto & item : parse_csv_row(value)) {
                if (!string_parse_kv_override(item.c_str(), params.kv_overrides)) {
                    throw std::runtime_error(string_format("error: invalid type for KV override: %s\n", item.c_str()));
                }
            }
        }
    ));
    add_opt(common_arg(
        {"--op-offload"},
        {"--no-op-offload"},
        string_format("whether to offload host tensor operations to device (default: %s)", params.no_op_offload ? "false" : "true"),
        [](common_params & params, bool value) {
            params.no_op_offload = !value;
        }
    ));
    add_opt(common_arg(
        {"--lora"}, "FNAME",
        "path to LoRA adapter (use comma-separated values to load multiple adapters)",
        [](common_params & params, const std::string & value) {
            for (const auto & item : parse_csv_row(value)) {
                params.lora_adapters.push_back({ item, 1.0f, "", "", nullptr });
            }
        }
        // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
    add_opt(common_arg(
        {"--lora-scaled"}, "FNAME:SCALE,...",
        "path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)\n"
        "note: use comma-separated values",
        [](common_params & params, const std::string & value) {
            for (const auto & item : parse_csv_row(value)) {
                auto parts = string_split<std::string>(item, ':');
                if (parts.size() != 2) {
                    throw std::invalid_argument("lora-scaled format: FNAME:SCALE");
                }
                params.lora_adapters.push_back({ parts[0], std::stof(parts[1]), "", "", nullptr });
            }
        }
        // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
    add_opt(common_arg(
        {"--control-vector"}, "FNAME",
        "add a control vector\nnote: use comma-separated values to add multiple control vectors",
        [](common_params & params, const std::string & value) {
            for (const auto & item : parse_csv_row(value)) {
                params.control_vectors.push_back({ 1.0f, item });
            }
        }
    ));
    add_opt(common_arg(
        {"--control-vector-scaled"}, "FNAME:SCALE,...",
        "add a control vector with user defined scaling SCALE\n"
        "note: use comma-separated values (format: FNAME:SCALE,...)",
        [](common_params & params, const std::string & value) {
            for (const auto & item : parse_csv_row(value)) {
                auto parts = string_split<std::string>(item, ':');
                if (parts.size() != 2) {
                    throw std::invalid_argument("control-vector-scaled format: FNAME:SCALE");
                }
                params.control_vectors.push_back({ std::stof(parts[1]), parts[0] });
            }
        }
    ));
    add_opt(common_arg(
        {"--control-vector-layer-range"}, "START", "END",
        "layer range to apply the control vector(s) to, start and end inclusive",
        [](common_params & params, const std::string & start, const std::string & end) {
            params.control_vector_layer_start = std::stoi(start);
            params.control_vector_layer_end = std::stoi(end);
        }
    ));
    add_opt(common_arg(
        {"-a", "--alias"}, "STRING",
        "set alias for model name (to be used by REST API)",
        [](common_params & params, const std::string & value) {
            params.model_alias = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS"));
    add_opt(common_arg(
        {"-m", "--model"}, "FNAME",
        ex == LLAMA_EXAMPLE_EXPORT_LORA
            ? "model path from which to load base model"
            : "model path to load",
        [](common_params & params, const std::string & value) {
            params.model.path = value;
        }
    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
    add_opt(common_arg(
        {"-mu", "--model-url"}, "MODEL_URL",
        "model download url (default: unused)",
        [](common_params & params, const std::string & value) {
            params.model.url = value;
        }
    ).set_env("LLAMA_ARG_MODEL_URL"));
    add_opt(common_arg(
        { "-dr", "--docker-repo" }, "[<repo>/]<model>[:quant]",
        "Docker Hub model repository. repo is optional, defaults to ai/. quant is optional, defaults to :latest.\n"
        "example: gemma3\n"
        "(default: unused)",
        [](common_params & params, const std::string & value) {
            params.model.docker_repo = value;
        }
    ).set_env("LLAMA_ARG_DOCKER_REPO"));
    add_opt(common_arg(
        {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
        "Hugging Face model repository; quant is optional, case-insensitive, defaults to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
        "mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n"
        "example: unsloth/phi-4-GGUF:q4_k_m\n"
        "(default: unused)",
        [](common_params & params, const std::string & value) {
            params.model.hf_repo = value;
        }
    ).set_env("LLAMA_ARG_HF_REPO"));
    add_opt(common_arg(
        {"-hfd", "-hfrd", "--hf-repo-draft"}, "<user>/<model>[:quant]",
        "Same as --hf-repo, but for the draft model (default: unused)",
        [](common_params & params, const std::string & value) {
            params.speculative.mparams_dft.hf_repo = value;
        }
    ).set_env("LLAMA_ARG_HFD_REPO"));
    add_opt(common_arg(
        {"-hff", "--hf-file"}, "FILE",
        "Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
        [](common_params & params, const std::string & value) {
            params.model.hf_file = value;
        }
    ).set_env("LLAMA_ARG_HF_FILE"));
    add_opt(common_arg(
        {"-hfv", "-hfrv", "--hf-repo-v"}, "<user>/<model>[:quant]",
        "Hugging Face model repository for the vocoder model (default: unused)",
        [](common_params & params, const std::string & value) {
            params.vocoder.model.hf_repo = value;
        }
    ).set_env("LLAMA_ARG_HF_REPO_V"));
    add_opt(common_arg(
        {"-hffv", "--hf-file-v"}, "FILE",
        "Hugging Face model file for the vocoder model (default: unused)",
        [](common_params & params, const std::string & value) {
            params.vocoder.model.hf_file = value;
        }
    ).set_env("LLAMA_ARG_HF_FILE_V"));
    add_opt(common_arg(
        {"-hft", "--hf-token"}, "TOKEN",
        "Hugging Face access token (default: value from HF_TOKEN environment variable)",
        [](common_params & params, const std::string & value) {
            params.hf_token = value;
        }
    ).set_env("HF_TOKEN"));
    add_opt(common_arg(
        {"--context-file"}, "FNAME",
        "file to load context from (use comma-separated values to specify multiple files)",
        [](common_params & params, const std::string & value) {
            for (const auto & item : parse_csv_row(value)) {
                std::ifstream file(item, std::ios::binary);
                if (!file) {
                    throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
                }
                params.context_files.push_back(item);
            }
        }
    ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
    add_opt(common_arg(
        {"--chunk-size"}, "N",
        string_format("minimum length of embedded text chunks (default: %d)", params.chunk_size),
        [](common_params & params, int value) {
            params.chunk_size = value;
        }
    ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
    add_opt(common_arg(
        {"--chunk-separator"}, "STRING",
        string_format("separator between chunks (default: '%s')", params.chunk_separator.c_str()),
        [](common_params & params, const std::string & value) {
            params.chunk_separator = value;
        }
    ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
    add_opt(common_arg(
        {"--junk"}, "N",
        string_format("number of times to repeat the junk text (default: %d)", params.n_junk),
        [](common_params & params, int value) {
            params.n_junk = value;
        }
    ).set_examples({LLAMA_EXAMPLE_PASSKEY, LLAMA_EXAMPLE_PARALLEL}));
    add_opt(common_arg(
        {"--pos"}, "N",
        string_format("position of the passkey in the junk text (default: %d)", params.i_pos),
        [](common_params & params, int value) {
            params.i_pos = value;
        }
    ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
    add_opt(common_arg(
        {"-o", "--output", "--output-file"}, "FNAME",
        string_format("output file (default: '%s')", params.out_file.c_str()),
        [](common_params & params, const std::string & value) {
            params.out_file = value;
        }
    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
    add_opt(common_arg(
        {"-ofreq", "--output-frequency"}, "N",
        string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
        [](common_params & params, int value) {
            params.n_out_freq = value;
        }
    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
    add_opt(common_arg(
        {"--output-format"}, "{gguf,dat}",
        string_format("output format for imatrix file (default: %s)", params.imat_dat > 0 ? "dat" : "gguf"),
        [](common_params & params, const std::string & value) {
            /**/ if (value == "gguf") { params.imat_dat = -1; }
            else if (value == "dat")  { params.imat_dat = 1; }
            else { throw std::invalid_argument("invalid output format"); }
        }
    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
    add_opt(common_arg(
        {"--save-frequency"}, "N",
        string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
        [](common_params & params, int value) {
            params.n_save_freq = value;
        }
    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
    add_opt(common_arg(
        {"--process-output"},
        string_format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"),
        [](common_params & params) {
            params.process_output = true;
        }
    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
    add_opt(common_arg(
        {"--ppl"},
        {"--no-ppl"},
        string_format("whether to compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
        [](common_params & params, bool value) {
            params.compute_ppl = value;
        }
    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
    add_opt(common_arg(
        {"--chunk", "--from-chunk"}, "N",
        string_format("start processing the input from chunk N (default: %d)", params.i_chunk),
        [](common_params & params, int value) {
            params.i_chunk = value;
        }
    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
    add_opt(common_arg(
        {"--show-statistics"},
        string_format("show imatrix statistics and then exit (default: %s)", params.show_statistics ? "true" : "false"),
        [](common_params & params) {
            params.show_statistics = true;
        }
    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2698 add_opt(common_arg(
2699 {"--parse-special"},
2700 string_format("parse special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
2701 [](common_params & params) {
2702 params.parse_special = true;
2703 }
2704 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2705 add_opt(common_arg(
2706 {"-pps"},
2707 string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
2708 [](common_params & params) {
2709 params.is_pp_shared = true;
2710 }
2711 ).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
2712 add_opt(common_arg(
2713 {"-tgs"},
2714 string_format("is the text generation separated across the different sequences (default: %s)", params.is_tg_separate ? "true" : "false"),
2715 [](common_params & params) {
2716 params.is_tg_separate = true;
2717 }
2718 ).set_examples({LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
2719 add_opt(common_arg(
2720 {"-npp"}, "n0,n1,...",
2721 "number of prompt tokens",
2722 [](common_params & params, const std::string & value) {
2723 auto p = string_split<int>(value, ',');
2724 params.n_pp.insert(params.n_pp.end(), p.begin(), p.end());
2725 }
2726 ).set_examples({LLAMA_EXAMPLE_BENCH}));
2727 add_opt(common_arg(
2728 {"-ntg"}, "n0,n1,...",
2729 "number of text generation tokens",
2730 [](common_params & params, const std::string & value) {
2731 auto p = string_split<int>(value, ',');
2732 params.n_tg.insert(params.n_tg.end(), p.begin(), p.end());
2733 }
2734 ).set_examples({LLAMA_EXAMPLE_BENCH}));
2735 add_opt(common_arg(
2736 {"-npl"}, "n0,n1,...",
2737 "number of parallel prompts",
2738 [](common_params & params, const std::string & value) {
2739 auto p = string_split<int>(value, ',');
2740 params.n_pl.insert(params.n_pl.end(), p.begin(), p.end());
2741 }
2742 ).set_examples({LLAMA_EXAMPLE_BENCH}));
2743 add_opt(common_arg(
2744 {"--embd-normalize"}, "N",
2745 string_format("normalisation for embeddings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
2746 [](common_params & params, int value) {
2747 params.embd_normalize = value;
2748 }
2749 ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_DEBUG}));
2750 add_opt(common_arg(
2751 {"--embd-output-format"}, "FORMAT",
2752 "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
2753 [](common_params & params, const std::string & value) {
2754 params.embd_out = value;
2755 }
2756 ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
2757 add_opt(common_arg(
2758 {"--embd-separator"}, "STRING",
2759 "separator of embeddings (default \\n) for example \"<#sep#>\"",
2760 [](common_params & params, const std::string & value) {
2761 params.embd_sep = value;
2762 }
2763 ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
2764 add_opt(common_arg(
2765 {"--cls-separator"}, "STRING",
2766 "separator of classification sequences (default \\t) for example \"<#seq#>\"",
2767 [](common_params & params, const std::string & value) {
2768 params.cls_sep = value;
2769 }
2770 ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
2771 add_opt(common_arg(
2772 {"--host"}, "HOST",
2773 string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
2774 [](common_params & params, const std::string & value) {
2775 params.hostname = value;
2776 }
2777 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST"));
2778 add_opt(common_arg(
2779 {"--port"}, "PORT",
2780 string_format("port to listen (default: %d)", params.port),
2781 [](common_params & params, int value) {
2782 params.port = value;
2783 }
2784 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
2785 add_opt(common_arg(
2786 {"--path"}, "PATH",
2787 string_format("path to serve static files from (default: %s)", params.public_path.c_str()),
2788 [](common_params & params, const std::string & value) {
2789 params.public_path = value;
2790 }
2791 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
2792 add_opt(common_arg(
2793 {"--api-prefix"}, "PREFIX",
2794 string_format("prefix path the server serves from, without the trailing slash (default: %s)", params.api_prefix.c_str()),
2795 [](common_params & params, const std::string & value) {
2796 params.api_prefix = value;
2797 }
2798 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
2799 add_opt(common_arg(
2800 {"--webui-config"}, "JSON",
2801 "JSON that provides default WebUI settings (overrides WebUI defaults)",
2802 [](common_params & params, const std::string & value) {
2803 params.webui_config_json = value;
2804 }
2805 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG"));
2806 add_opt(common_arg(
2807 {"--webui-config-file"}, "PATH",
2808 "JSON file that provides default WebUI settings (overrides WebUI defaults)",
2809 [](common_params & params, const std::string & value) {
2810 params.webui_config_json = read_file(value);
2811 }
2812 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
2813 add_opt(common_arg(
2814 {"--webui"},
2815 {"--no-webui"},
2816 string_format("whether to enable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
2817 [](common_params & params, bool value) {
2818 params.webui = value;
2819 }
2820 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI"));
2821 add_opt(common_arg(
2822 {"--embedding", "--embeddings"},
2823 string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
2824 [](common_params & params) {
2825 params.embedding = true;
2826 }
2827 ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_EMBEDDINGS"));
2828 add_opt(common_arg(
2829 {"--rerank", "--reranking"},
2830 string_format("enable reranking endpoint on server (default: %s)", "disabled"),
2831 [](common_params & params) {
2832 params.embedding = true;
2833 params.pooling_type = LLAMA_POOLING_TYPE_RANK;
2834 }
2835 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
2836 add_opt(common_arg(
2837 {"--api-key"}, "KEY",
2838 "API key to use for authentication, multiple keys can be provided as a comma-separated list (default: none)",
2839 [](common_params & params, const std::string & value) {
2840 for (const auto & key : parse_csv_row(value)) {
2841 if (!key.empty()) {
2842 params.api_keys.push_back(key);
2843 }
2844 }
2845 }
2846 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
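    // Illustrative: --api-key sk-one,sk-two registers two keys (placeholders); clients would
    // then authenticate each request with a matching "Authorization: Bearer sk-one" header.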
2847 add_opt(common_arg(
2848 {"--api-key-file"}, "FNAME",
2849 "path to file containing API keys (default: none)",
2850 [](common_params & params, const std::string & value) {
2851 std::ifstream key_file(value);
2852 if (!key_file) {
2853 throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
2854 }
2855 std::string key;
2856 while (std::getline(key_file, key)) {
2857 if (!key.empty()) {
2858 params.api_keys.push_back(key);
2859 }
2860 }
2861 key_file.close();
2862 }
2863 ).set_examples({LLAMA_EXAMPLE_SERVER}));
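    // The loader above reads the file line by line and skips blank lines, so a key file is
    // simply one key per line (keys shown are placeholders), e.g.:
    //   sk-one
    //   sk-two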
2864 add_opt(common_arg(
2865 {"--ssl-key-file"}, "FNAME",
        "path to file containing a PEM-encoded SSL private key",
2867 [](common_params & params, const std::string & value) {
2868 params.ssl_file_key = value;
2869 }
2870 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_KEY_FILE"));
2871 add_opt(common_arg(
2872 {"--ssl-cert-file"}, "FNAME",
        "path to file containing a PEM-encoded SSL certificate",
2874 [](common_params & params, const std::string & value) {
2875 params.ssl_file_cert = value;
2876 }
2877 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
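    // Illustrative HTTPS setup, assuming the server was built with SSL support
    // (key/cert file names are placeholders):
    //   llama-server ... --ssl-key-file server.key --ssl-cert-file server.crt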
2878 add_opt(common_arg(
2879 {"--chat-template-kwargs"}, "STRING",
        "set additional params for the chat template renderer; must be a valid JSON object string, e.g. '{\"key1\":\"value1\",\"key2\":\"value2\"}'",
2881 [](common_params & params, const std::string & value) {
2882 auto parsed = json::parse(value);
2883 for (const auto & item : parsed.items()) {
2884 params.default_template_kwargs[item.key()] = item.value().dump();
2885 }
2886 }
2887 ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
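    // Illustrative: for templates that expose a reasoning-effort kwarg (the kwarg name
    // depends on the template in use):
    //   --chat-template-kwargs '{"reasoning_effort":"high"}'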
2888 add_opt(common_arg(
2889 {"-to", "--timeout"}, "N",
2890 string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
2891 [](common_params & params, int value) {
2892 params.timeout_read = value;
2893 params.timeout_write = value;
2894 }
2895 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT"));
2896 add_opt(common_arg(
2897 {"--threads-http"}, "N",
2898 string_format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
2899 [](common_params & params, int value) {
2900 params.n_threads_http = value;
2901 }
2902 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
2903 add_opt(common_arg(
2904 {"--cache-prompt"},
2905 {"--no-cache-prompt"},
2906 string_format("whether to enable prompt caching (default: %s)", params.cache_prompt ? "enabled" : "disabled"),
2907 [](common_params & params, bool value) {
2908 params.cache_prompt = value;
2909 }
2910 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_PROMPT"));
2911 add_opt(common_arg(
2912 {"--cache-reuse"}, "N",
2913 string_format(
            "min chunk size to attempt reusing from the cache via KV shifting; requires prompt caching to be enabled (default: %d)\n"
2915 "[(card)](https://ggml.ai/f0.png)", params.n_cache_reuse
2916 ),
2917 [](common_params & params, int value) {
2918 params.n_cache_reuse = value;
2919 }
2920 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE"));
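    // Illustrative: with --cache-prompt --cache-reuse 256, cached chunks of at least 256 tokens
    // can be re-used via KV shifting even after earlier parts of the prompt have changed.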
2921 add_opt(common_arg(
2922 {"--metrics"},
2923 string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
2924 [](common_params & params) {
2925 params.endpoint_metrics = true;
2926 }
2927 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
2928 add_opt(common_arg(
2929 {"--props"},
2930 string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
2931 [](common_params & params) {
2932 params.endpoint_props = true;
2933 }
2934 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
2935 add_opt(common_arg(
2936 {"--slots"},
2937 {"--no-slots"},
2938 string_format("expose slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
2939 [](common_params & params, bool value) {
2940 params.endpoint_slots = value;
2941 }
2942 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
2943 add_opt(common_arg(
2944 {"--slot-save-path"}, "PATH",
        "directory to save slot KV cache to (default: disabled)",
2946 [](common_params & params, const std::string & value) {
2947 params.slot_save_path = value;
2948 if (!fs_is_directory(params.slot_save_path)) {
2949 throw std::invalid_argument("not a directory: " + value);
2950 }
            // if the path doesn't end with DIRECTORY_SEPARATOR, append it
2952 if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
2953 params.slot_save_path += DIRECTORY_SEPARATOR;
2954 }
2955 }
2956 ).set_examples({LLAMA_EXAMPLE_SERVER}));
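    // e.g. --slot-save-path ./slots (must be an existing directory; per the check above,
    // a trailing separator is appended automatically if missing)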
2957 add_opt(common_arg(
2958 {"--media-path"}, "PATH",
2959 "directory for loading local media files; files can be accessed via file:// URLs using relative paths (default: disabled)",
2960 [](common_params & params, const std::string & value) {
2961 params.media_path = value;
2962 if (!fs_is_directory(params.media_path)) {
2963 throw std::invalid_argument("not a directory: " + value);
2964 }
            // if the path doesn't end with DIRECTORY_SEPARATOR, append it
2966 if (!params.media_path.empty() && params.media_path[params.media_path.size() - 1] != DIRECTORY_SEPARATOR) {
2967 params.media_path += DIRECTORY_SEPARATOR;
2968 }
2969 }
2970 ).set_examples({LLAMA_EXAMPLE_SERVER}));
2971 add_opt(common_arg(
2972 {"--models-dir"}, "PATH",
2973 "directory containing models for the router server (default: disabled)",
2974 [](common_params & params, const std::string & value) {
2975 params.models_dir = value;
2976 }
2977 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR"));
2978 add_opt(common_arg(
2979 {"--models-preset"}, "PATH",
2980 "path to INI file containing model presets for the router server (default: disabled)",
2981 [](common_params & params, const std::string & value) {
2982 params.models_preset = value;
2983 }
2984 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_PRESET"));
2985 add_opt(common_arg(
2986 {"--models-max"}, "N",
2987 string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.models_max),
2988 [](common_params & params, int value) {
2989 params.models_max = value;
2990 }
2991 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
2992 add_opt(common_arg(
2993 {"--models-autoload"},
2994 {"--no-models-autoload"},
2995 string_format("for router server, whether to automatically load models (default: %s)", params.models_autoload ? "enabled" : "disabled"),
2996 [](common_params & params, bool value) {
2997 params.models_autoload = value;
2998 }
2999 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_AUTOLOAD"));
3000 add_opt(common_arg(
3001 {"--jinja"},
3002 {"--no-jinja"},
3003 string_format("whether to use jinja template engine for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
3004 [](common_params & params, bool value) {
3005 params.use_jinja = value;
3006 }
3007 ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
3008 add_opt(common_arg(
3009 {"--reasoning-format"}, "FORMAT",
3010 "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
3011 "- none: leaves thoughts unparsed in `message.content`\n"
3012 "- deepseek: puts thoughts in `message.reasoning_content`\n"
3013 "- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`\n"
3014 "(default: auto)",
3015 [](common_params & params, const std::string & value) {
3016 params.reasoning_format = common_reasoning_format_from_name(value);
3017 }
3018 ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK"));
3019 add_opt(common_arg(
3020 {"--reasoning-budget"}, "N",
3021 "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
3022 [](common_params & params, int value) {
3023 if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
3024 params.reasoning_budget = value;
3025 }
3026 ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
3027 add_opt(common_arg(
3028 {"--chat-template"}, "JINJA_TEMPLATE",
3029 string_format(
3030 "set custom jinja chat template (default: template taken from model's metadata)\n"
3031 "if suffix/prefix are specified, template will be disabled\n"
3032 "only commonly used templates are accepted (unless --jinja is set before this flag):\n"
3033 "list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
3034 ),
3035 [](common_params & params, const std::string & value) {
3036 params.chat_template = value;
3037 }
3038 ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
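    // Illustrative: --chat-template chatml selects a built-in template by name, while
    // --chat-template-file (below) loads a custom Jinja template from disk.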
3039 add_opt(common_arg(
3040 {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
3041 string_format(
3042 "set custom jinja chat template file (default: template taken from model's metadata)\n"
3043 "if suffix/prefix are specified, template will be disabled\n"
3044 "only commonly used templates are accepted (unless --jinja is set before this flag):\n"
3045 "list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
3046 ),
3047 [](common_params & params, const std::string & value) {
3048 params.chat_template = read_file(value);
3049 }
3050 ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
3051 add_opt(common_arg(
3052 {"--prefill-assistant"},
3053 {"--no-prefill-assistant"},
3054 string_format(
            "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
            "with --no-prefill-assistant, a trailing assistant message is treated as a complete message and is not prefilled\n"
3057 ),
3058 [](common_params & params, bool value) {
3059 params.prefill_assistant = value;
3060 }
3061 ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PREFILL_ASSISTANT"));
3062 add_opt(common_arg(
3063 {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
3064 string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
3065 [](common_params & params, const std::string & value) {
3066 params.slot_prompt_similarity = std::stof(value);
3067 }
3068 ).set_examples({LLAMA_EXAMPLE_SERVER}));
3069 add_opt(common_arg(
3070 {"--lora-init-without-apply"},
3071 string_format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"),
3072 [](common_params & params) {
3073 params.lora_init_without_apply = true;
3074 }
3075 ).set_examples({LLAMA_EXAMPLE_SERVER}));
3076 add_opt(common_arg(
3077 {"--sleep-idle-seconds"}, "SECONDS",
3078 string_format("number of seconds of idleness after which the server will sleep (default: %d; -1 = disabled)", params.sleep_idle_seconds),
3079 [](common_params & params, int value) {
3080 if (value == 0 || value < -1) {
3081 throw std::invalid_argument("invalid value: cannot be 0 or less than -1");
3082 }
3083 params.sleep_idle_seconds = value;
3084 }
3085 ).set_examples({LLAMA_EXAMPLE_SERVER}));
3086 add_opt(common_arg(
3087 {"--simple-io"},
3088 "use basic IO for better compatibility in subprocesses and limited consoles",
3089 [](common_params & params) {
3090 params.simple_io = true;
3091 }
3092 ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
3093 add_opt(common_arg(
3094 {"--positive-file"}, "FNAME",
3095 string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
3096 [](common_params & params, const std::string & value) {
3097 params.cvector_positive_file = value;
3098 }
3099 ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
3100 add_opt(common_arg(
3101 {"--negative-file"}, "FNAME",
3102 string_format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()),
3103 [](common_params & params, const std::string & value) {
3104 params.cvector_negative_file = value;
3105 }
3106 ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
3107 add_opt(common_arg(
3108 {"--pca-batch"}, "N",
3109 string_format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch),
3110 [](common_params & params, int value) {
3111 params.n_pca_batch = value;
3112 }
3113 ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
3114 add_opt(common_arg(
3115 {"--pca-iter"}, "N",
3116 string_format("number of iterations used for PCA (default: %d)", params.n_pca_iterations),
3117 [](common_params & params, int value) {
3118 params.n_pca_iterations = value;
3119 }
3120 ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
3121 add_opt(common_arg(
3122 {"--method"}, "{pca, mean}",
3123 "dimensionality reduction method to be used (default: pca)",
3124 [](common_params & params, const std::string & value) {
3125 /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; }
3126 else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; }
3127 else { throw std::invalid_argument("invalid value"); }
3128 }
3129 ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
3130 add_opt(common_arg(
3131 {"--output-format"}, "{md,jsonl}",
3132 "output format for batched-bench results (default: md)",
3133 [](common_params & params, const std::string & value) {
3134 /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
3135 else if (value == "md") { params.batched_bench_output_jsonl = false; }
3136 else { throw std::invalid_argument("invalid value"); }
3137 }
3138 ).set_examples({LLAMA_EXAMPLE_BENCH}));
3139 add_opt(common_arg(
3140 {"--log-disable"},
        "Disable logging",
3142 [](common_params &) {
3143 common_log_pause(common_log_main());
3144 }
3145 ));
3146 add_opt(common_arg(
3147 {"--log-file"}, "FNAME",
3148 "Log to file",
3149 [](common_params &, const std::string & value) {
3150 common_log_set_file(common_log_main(), value.c_str());
3151 }
3152 ).set_env("LLAMA_LOG_FILE"));
3153 add_opt(common_arg(
3154 {"--log-colors"}, "[on|off|auto]",
3155 "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
3156 "'auto' enables colors when output is to a terminal",
3157 [](common_params &, const std::string & value) {
3158 if (is_truthy(value)) {
3159 common_log_set_colors(common_log_main(), LOG_COLORS_ENABLED);
3160 } else if (is_falsey(value)) {
3161 common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED);
3162 } else if (is_autoy(value)) {
3163 common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
3164 } else {
3165 throw std::invalid_argument(
3166 string_format("error: unknown value for --log-colors: '%s'\n", value.c_str()));
3167 }
3168 }
3169 ).set_env("LLAMA_LOG_COLORS"));
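    // e.g. --log-colors off disables ANSI color codes, which is useful when piping logs to a file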
3170 add_opt(common_arg(
3171 {"-v", "--verbose", "--log-verbose"},
3172 "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
3173 [](common_params & params) {
3174 params.verbosity = INT_MAX;
3175 }
3176 ));
3177 add_opt(common_arg(
3178 {"--offline"},
3179 "Offline mode: forces use of cache, prevents network access",
3180 [](common_params & params) {
3181 params.offline = true;
3182 }
3183 ).set_env("LLAMA_OFFLINE"));
3184 add_opt(common_arg(
3185 {"-lv", "--verbosity", "--log-verbosity"}, "N",
3186 string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n"
3187 " - 0: generic output\n"
3188 " - 1: error\n"
3189 " - 2: warning\n"
3190 " - 3: info\n"
3191 " - 4: debug\n"
3192 "(default: %d)\n", params.verbosity),
3193 [](common_params & params, int value) {
3194 params.verbosity = value;
3195 }
3196 ).set_env("LLAMA_LOG_VERBOSITY"));
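    // e.g. -lv 2 keeps generic output, errors and warnings, and suppresses info and debug messages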
3197 add_opt(common_arg(
3198 {"--log-prefix"},
3199 "Enable prefix in log messages",
3200 [](common_params &) {
3201 common_log_set_prefix(common_log_main(), true);
3202 }
3203 ).set_env("LLAMA_LOG_PREFIX"));
3204 add_opt(common_arg(
3205 {"--log-timestamps"},
3206 "Enable timestamps in log messages",
3207 [](common_params &) {
3208 common_log_set_timestamps(common_log_main(), true);
3209 }
3210 ).set_env("LLAMA_LOG_TIMESTAMPS"));
3211
3212 // speculative parameters
3213 add_opt(common_arg(
3214 {"-td", "--threads-draft"}, "N",
        "number of threads to use for the draft model during generation (default: same as --threads)",
3216 [](common_params & params, int value) {
3217 params.speculative.cpuparams.n_threads = value;
3218 if (params.speculative.cpuparams.n_threads <= 0) {
3219 params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency();
3220 }
3221 }
3222 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
3223 add_opt(common_arg(
3224 {"-tbd", "--threads-batch-draft"}, "N",
        "number of threads to use for the draft model during batch and prompt processing (default: same as --threads-draft)",
3226 [](common_params & params, int value) {
3227 params.speculative.cpuparams_batch.n_threads = value;
3228 if (params.speculative.cpuparams_batch.n_threads <= 0) {
3229 params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
3230 }
3231 }
3232 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
3233 add_opt(common_arg(
3234 {"-Cd", "--cpu-mask-draft"}, "M",
        "Draft model CPU affinity mask. Complements --cpu-range-draft (default: same as --cpu-mask)",
3236 [](common_params & params, const std::string & mask) {
3237 params.speculative.cpuparams.mask_valid = true;
3238 if (!parse_cpu_mask(mask, params.speculative.cpuparams.cpumask)) {
3239 throw std::invalid_argument("invalid cpumask");
3240 }
3241 }
3242 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
3243 add_opt(common_arg(
3244 {"-Crd", "--cpu-range-draft"}, "lo-hi",
3245 "Ranges of CPUs for affinity. Complements --cpu-mask-draft",
3246 [](common_params & params, const std::string & range) {
3247 params.speculative.cpuparams.mask_valid = true;
3248 if (!parse_cpu_range(range, params.speculative.cpuparams.cpumask)) {
3249 throw std::invalid_argument("invalid range");
3250 }
3251 }
3252 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
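    // Illustrative draft-model affinity settings (assuming at least 4 CPUs are available):
    //   --cpu-mask-draft 0xF    pins draft threads to CPUs 0-3 via a hex bitmask
    //   --cpu-range-draft 0-3   expresses the same set as a range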
3253 add_opt(common_arg(
3254 {"--cpu-strict-draft"}, "<0|1>",
3255 "Use strict CPU placement for draft model (default: same as --cpu-strict)",
3256 [](common_params & params, int value) {
3257 params.speculative.cpuparams.strict_cpu = value;
3258 }
3259 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
3260 add_opt(common_arg(
3261 {"--prio-draft"}, "N",
        string_format("set draft process/thread priority: 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams.priority),
3263 [](common_params & params, int prio) {
3264 if (prio < 0 || prio > 3) {
3265 throw std::invalid_argument("invalid value");
3266 }
3267 params.speculative.cpuparams.priority = (enum ggml_sched_priority) prio;
3268 }
3269 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
3270 add_opt(common_arg(
3271 {"--poll-draft"}, "<0|1>",
        "Use polling to wait for draft model work (default: same as --poll)",
3273 [](common_params & params, int value) {
3274 params.speculative.cpuparams.poll = value;
3275 }
3276 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
3277 add_opt(common_arg(
3278 {"-Cbd", "--cpu-mask-batch-draft"}, "M",
        "Draft model CPU affinity mask for batch processing. Complements --cpu-range-batch-draft (default: same as --cpu-mask-draft)",
3280 [](common_params & params, const std::string & mask) {
3281 params.speculative.cpuparams_batch.mask_valid = true;
3282 if (!parse_cpu_mask(mask, params.speculative.cpuparams_batch.cpumask)) {
3283 throw std::invalid_argument("invalid cpumask");
3284 }
3285 }
3286 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
3287 add_opt(common_arg(
3288 {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
        "Ranges of CPUs for affinity. Complements --cpu-mask-batch-draft",
3290 [](common_params & params, const std::string & range) {
3291 params.speculative.cpuparams_batch.mask_valid = true;
3292 if (!parse_cpu_range(range, params.speculative.cpuparams_batch.cpumask)) {
                throw std::invalid_argument("invalid range");
3294 }
3295 }
3296 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
3297 add_opt(common_arg(
3298 {"--cpu-strict-batch-draft"}, "<0|1>",
        "Use strict CPU placement for draft model batch processing (default: same as --cpu-strict-draft)",
3300 [](common_params & params, int value) {
3301 params.speculative.cpuparams_batch.strict_cpu = value;
3302 }
3303 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
3304 add_opt(common_arg(
3305 {"--prio-batch-draft"}, "N",
        string_format("set draft process/thread priority: 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams_batch.priority),
3307 [](common_params & params, int prio) {
3308 if (prio < 0 || prio > 3) {
3309 throw std::invalid_argument("invalid value");
3310 }
3311 params.speculative.cpuparams_batch.priority = (enum ggml_sched_priority) prio;
3312 }
3313 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
3314 add_opt(common_arg(
3315 {"--poll-batch-draft"}, "<0|1>",
        "Use polling to wait for draft model work during batch processing (default: same as --poll-draft)",
3317 [](common_params & params, int value) {
3318 params.speculative.cpuparams_batch.poll = value;
3319 }
3320 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
3321 add_opt(common_arg(
3322 {"--draft", "--draft-n", "--draft-max"}, "N",
3323 string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max),
3324 [](common_params & params, int value) {
3325 params.speculative.n_max = value;
3326 }
3327 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MAX"));
3328 add_opt(common_arg(
3329 {"--draft-min", "--draft-n-min"}, "N",
3330 string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
3331 [](common_params & params, int value) {
3332 params.speculative.n_min = value;
3333 }
3334 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MIN"));
3335 add_opt(common_arg(
3336 {"--draft-p-split"}, "P",
3337 string_format("speculative decoding split probability (default: %.2f)", (double)params.speculative.p_split),
3338 [](common_params & params, const std::string & value) {
3339 params.speculative.p_split = std::stof(value);
3340 }
3341 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
3342 add_opt(common_arg(
3343 {"--draft-p-min"}, "P",
3344 string_format("minimum speculative decoding probability (greedy) (default: %.2f)", (double)params.speculative.p_min),
3345 [](common_params & params, const std::string & value) {
3346 params.speculative.p_min = std::stof(value);
3347 }
3348 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
3349 add_opt(common_arg(
3350 {"-cd", "--ctx-size-draft"}, "N",
3351 string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
3352 [](common_params & params, int value) {
3353 params.speculative.n_ctx = value;
3354 }
3355 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
3356 add_opt(common_arg(
3357 {"-devd", "--device-draft"}, "<dev1,dev2,..>",
3358 "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
3359 "use --list-devices to see a list of available devices",
3360 [](common_params & params, const std::string & value) {
3361 params.speculative.devices = parse_device_list(value);
3362 }
3363 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
3364 GGML_ASSERT(params.speculative.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0
3365 add_opt(common_arg(
3366 {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
3367 string_format("max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)",
3368 params.speculative.n_gpu_layers == -1 ? "auto" : "all"),
3369 [](common_params & params, const std::string & value) {
3370 if (value == "auto") {
3371 params.speculative.n_gpu_layers = -1;
3372 } else if (value == "all") {
3373 params.speculative.n_gpu_layers = -2;
3374 } else {
3375 params.speculative.n_gpu_layers = std::stoi(value);
3376 }
3377 if (!llama_supports_gpu_offload()) {
3378 fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
3379 fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
3380 fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
3381 }
3382 }
3383 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
3384 add_opt(common_arg(
3385 {"-md", "--model-draft"}, "FNAME",
3386 "draft model for speculative decoding (default: unused)",
3387 [](common_params & params, const std::string & value) {
3388 params.speculative.mparams_dft.path = value;
3389 }
3390 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_MODEL_DRAFT"));
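    // Illustrative speculative-decoding setup (model file names are placeholders):
    //   llama-server -m target.gguf -md draft.gguf --draft-max 16 --draft-min 1 --draft-p-min 0.8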
3391 add_opt(common_arg(
3392 {"--spec-replace"}, "TARGET", "DRAFT",
3393 "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
3394 [](common_params & params, const std::string & tgt, const std::string & dft) {
3395 params.speculative.replacements.push_back({ tgt, dft });
3396 }
3397 ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
3398 add_opt(common_arg(
3399 {"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]",
3400 string_format("type of speculative decoding to use when no draft model is provided (default: %s)\n",
3401 common_speculative_type_to_str(params.speculative.type).c_str()),
3402 [](common_params & params, const std::string & value) {
3403 if (value == "none") {
3404 params.speculative.type = COMMON_SPECULATIVE_TYPE_NONE;
3405 } else if (value == "ngram-cache") {
3406 params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_CACHE;
3407 } else if (value == "ngram-simple") {
3408 params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE;
3409 } else if (value == "ngram-map-k") {
3410 params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K;
3411 } else if (value == "ngram-map-k4v") {
3412 params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V;
3413 } else if (value == "ngram-mod") {
3414 params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
3415 } else {
3416 throw std::invalid_argument("unknown speculative decoding type without draft model");
3417 }
3418 }
3419 ).set_examples({LLAMA_EXAMPLE_SERVER}));
3420 add_opt(common_arg(
3421 {"--spec-ngram-size-n"}, "N",
3422 string_format("ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram (default: %d)", params.speculative.ngram_size_n),
3423 [](common_params & params, int value) {
3424 if (value < 1 || value > 1024) {
3425 throw std::invalid_argument("ngram size N must be between 1 and 1024 inclusive");
3426 }
3427 params.speculative.ngram_size_n = value;
3428 }
3429 ).set_examples({LLAMA_EXAMPLE_SERVER}));
3430 add_opt(common_arg(
3431 {"--spec-ngram-size-m"}, "N",
3432 string_format("ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: %d)", params.speculative.ngram_size_m),
3433 [](common_params & params, int value) {
3434 if (value < 1 || value > 1024) {
3435 throw std::invalid_argument("ngram size M must be between 1 and 1024 inclusive");
3436 }
3437 params.speculative.ngram_size_m = value;
3438 }
3439 ).set_examples({LLAMA_EXAMPLE_SERVER}));
3440 add_opt(common_arg(
3441 {"--spec-ngram-min-hits"}, "N",
3442 string_format("minimum hits for ngram-map speculative decoding (default: %d)", params.speculative.ngram_min_hits),
3443 [](common_params & params, int value) {
3444 if (value < 1) {
3445 throw std::invalid_argument("ngram min hits must be at least 1");
3446 }
3447 params.speculative.ngram_min_hits = value;
3448 }
3449 ).set_examples({LLAMA_EXAMPLE_SERVER}));
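    // Illustrative draft-model-free speculation, pairing --spec-type with the n-gram knobs above:
    //   llama-server -m model.gguf --spec-type ngram-simple --spec-ngram-size-n 3 --spec-ngram-size-m 8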
3450 add_opt(common_arg(
3451 {"-ctkd", "--cache-type-k-draft"}, "TYPE",
3452 string_format(
3453 "KV cache data type for K for the draft model\n"
3454 "allowed values: %s\n"
3455 "(default: %s)",
3456 get_all_kv_cache_types().c_str(),
3457 ggml_type_name(params.speculative.cache_type_k)
3458 ),
3459 [](common_params & params, const std::string & value) {
3460 params.speculative.cache_type_k = kv_cache_type_from_str(value);
3461 }
3462 ).set_env("LLAMA_ARG_CACHE_TYPE_K_DRAFT"));
3463 add_opt(common_arg(
3464 {"-ctvd", "--cache-type-v-draft"}, "TYPE",
3465 string_format(
3466 "KV cache data type for V for the draft model\n"
3467 "allowed values: %s\n"
3468 "(default: %s)",
3469 get_all_kv_cache_types().c_str(),
3470 ggml_type_name(params.speculative.cache_type_v)
3471 ),
3472 [](common_params & params, const std::string & value) {
3473 params.speculative.cache_type_v = kv_cache_type_from_str(value);
3474 }
3475 ).set_env("LLAMA_ARG_CACHE_TYPE_V_DRAFT"));
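    // e.g. quantize the draft model's KV cache to save VRAM (whether the V cache can be
    // quantized may depend on the build and flash-attention support):
    //   -ctkd q8_0 -ctvd q8_0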
3476
3477 add_opt(common_arg(
3478 {"-mv", "--model-vocoder"}, "FNAME",
3479 "vocoder model for audio generation (default: unused)",
3480 [](common_params & params, const std::string & value) {
3481 params.vocoder.model.path = value;
3482 }
3483 ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
3484 add_opt(common_arg(
3485 {"--tts-use-guide-tokens"},
3486 "Use guide tokens to improve TTS word recall",
3487 [](common_params & params) {
3488 params.vocoder.use_guide_tokens = true;
3489 }
3490 ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
3491 add_opt(common_arg(
3492 {"--tts-speaker-file"}, "FNAME",
3493 "speaker file path for audio generation",
3494 [](common_params & params, const std::string & value) {
3495 params.vocoder.speaker_file = value;
3496 }
3497 ).set_examples({LLAMA_EXAMPLE_TTS}));
3498
3499 add_opt(common_arg(
3500 {"--diffusion-steps"}, "N",
3501 string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
3502 [](common_params & params, int value) { params.diffusion.steps = value; }
3503 ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3504 add_opt(common_arg(
3505 {"--diffusion-visual"},
3506 string_format("enable visual diffusion mode (show progressive generation) (default: %s)", params.diffusion.visual_mode ? "true" : "false"),
3507 [](common_params & params) { params.diffusion.visual_mode = true; }
3508 ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3509 add_opt(common_arg(
3510 {"--diffusion-eps"}, "F",
3511 string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
3512 [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
3513 ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3514 add_opt(common_arg(
3515 {"--diffusion-algorithm"}, "N",
3516 string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)", params.diffusion.algorithm),
3517 [](common_params & params, int value) { params.diffusion.algorithm = value; }
3518 ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3519 add_opt(common_arg(
3520 {"--diffusion-alg-temp"}, "F",
3521 string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
3522 [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
3523 ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3524 add_opt(common_arg(
3525 {"--diffusion-block-length"}, "N",
3526 string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
3527 [](common_params & params, int value) { params.diffusion.block_length = value; }
3528 ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3529 add_opt(common_arg(
3530 {"--diffusion-cfg-scale"}, "F",
3531 string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
3532 [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
3533 ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3534 add_opt(common_arg(
3535 {"--diffusion-add-gumbel-noise"}, "F",
3536 string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
        [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value) != 0.0f; } // non-zero enables
3538 ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
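    // Illustrative diffusion run, assuming the llama-diffusion-cli binary from the diffusion
    // example (model file name is a placeholder):
    //   llama-diffusion-cli -m dream.gguf -p "write a haiku" --diffusion-steps 64 --diffusion-visual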
3539 add_opt(common_arg(
3540 { "-lr", "--learning-rate" }, "ALPHA",
3541 string_format("adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)", (double) params.lr.lr0),
3542 [](common_params & params, const std::string & value) { params.lr.lr0 = std::stof(value); }
3543 ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
    add_opt(common_arg(
        {"-lr-min", "--learning-rate-min"}, "ALPHA",
        string_format("(if >0) final learning rate after decay (if -decay-epochs is set, default=%.2g)", (double) params.lr.lr_min),
        [](common_params & params, const std::string & value) { params.lr.lr_min = std::stof(value); }
    ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
3549 add_opt(common_arg(
3550 {"-decay-epochs", "--learning-rate-decay-epochs"}, "ALPHA",
3551 string_format("(if >0) decay learning rate to -lr-min after this many epochs (exponential decay, default=%.2g)", (double) params.lr.decay_epochs),
3552 [](common_params & params, const std::string & value) { params.lr.decay_epochs = std::stof(value); }
3553 ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
3554 add_opt(common_arg(
3555 {"-wd", "--weight-decay"}, "WD",
3556 string_format("adamw or sgd optimizer weight decay (0 is off; recommend very small e.g. 1e-9) (default: %.2g).", (double) params.lr.wd),
3557 [](common_params & params, const std::string & value) { params.lr.wd = std::stof(value); }
3558 ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
3559 add_opt(common_arg(
3560 {"-val-split", "--val-split"}, "FRACTION",
3561 string_format("fraction of data to use as validation set for training (default: %.2g).", (double) params.val_split),
3562 [](common_params & params, const std::string & value) { params.val_split = std::stof(value); }
3563 ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
3564 add_opt(common_arg(
3565 {"-epochs", "--epochs"}, "N",
3566 string_format("optimizer max # of epochs (default: %d)", params.lr.epochs),
3567 [](common_params & params, int epochs) { params.lr.epochs = epochs; }
3568 ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
3569 add_opt(common_arg(
        {"-opt", "--optimizer"}, "sgd|adamw", "optimizer type: adamw or sgd",
3571 [](common_params & params, const std::string & name) {
3572 params.optimizer = common_opt_get_optimizer(name.c_str());
3573 if (params.optimizer == GGML_OPT_OPTIMIZER_TYPE_COUNT) {
3574 throw std::invalid_argument("invalid --optimizer, valid options: adamw, sgd");
3575 }
3576 }
3577 ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
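    // Illustrative finetune invocation, assuming the llama-finetune example binary
    // (model and data file names are placeholders):
    //   llama-finetune -m base.gguf -f train.txt -lr 1e-4 -opt adamw -epochs 2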
3578 add_opt(common_arg(
3579 {"--save-logits"},
3580 string_format("save final logits to files for verification (default: %s)", params.save_logits ? "true" : "false"),
3581 [](common_params & params) {
3582 params.save_logits = true;
3583 }
3584 ).set_examples({LLAMA_EXAMPLE_DEBUG}));
3585 add_opt(common_arg(
3586 {"--logits-output-dir"}, "PATH",
3587 string_format("directory for saving logits output files (default: %s)", params.logits_output_dir.c_str()),
3588 [](common_params & params, const std::string & value) {
3589 params.logits_output_dir = value;
3590 }
3591 ).set_examples({LLAMA_EXAMPLE_DEBUG}));
3592 add_opt(common_arg(
3593 {"--tensor-filter"}, "REGEX",
3594 "filter tensor names for debug output (regex pattern, can be specified multiple times)",
3595 [](common_params & params, const std::string & value) {
3596 params.tensor_filter.push_back(value);
3597 }
3598 ).set_examples({LLAMA_EXAMPLE_DEBUG}));
3599
3600 // presets
3601 add_opt(common_arg(
3602 {"--tts-oute-default"},
3603 string_format("use default OuteTTS models (note: can download weights from the internet)"),
3604 [](common_params & params) {
3605 params.model.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
3606 params.model.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
3607 params.vocoder.model.hf_repo = "ggml-org/WavTokenizer";
3608 params.vocoder.model.hf_file = "WavTokenizer-Large-75-F16.gguf";
3609 }
3610 ).set_examples({LLAMA_EXAMPLE_TTS}));
3611
3612 add_opt(common_arg(
3613 {"--embd-gemma-default"},
3614 string_format("use default EmbeddingGemma model (note: can download weights from the internet)"),
3615 [](common_params & params) {
3616 params.model.hf_repo = "ggml-org/embeddinggemma-300M-qat-q4_0-GGUF";
3617 params.model.hf_file = "embeddinggemma-300M-qat-Q4_0.gguf";
3618 params.port = 8011;
3619 params.n_ubatch = 2048;
3620 params.n_batch = 2048;
3621 params.n_parallel = 32;
3622 params.n_ctx = 2048*params.n_parallel;
3623 params.verbose_prompt = true;
3624 params.embedding = true;
3625 }
3626 ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
3627
3628 add_opt(common_arg(
3629 {"--fim-qwen-1.5b-default"},
3630 string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"),
3631 [](common_params & params) {
3632 params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
3633 params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
3634 params.port = 8012;
3635 params.n_ubatch = 1024;
3636 params.n_batch = 1024;
3637 params.n_ctx = 0;
3638 params.n_cache_reuse = 256;
3639 }
3640 ).set_examples({LLAMA_EXAMPLE_SERVER}));
3641
3642 add_opt(common_arg(
3643 {"--fim-qwen-3b-default"},
3644 string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"),
3645 [](common_params & params) {
3646 params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
3647 params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
3648 params.port = 8012;
3649 params.n_ubatch = 1024;
3650 params.n_batch = 1024;
3651 params.n_ctx = 0;
3652 params.n_cache_reuse = 256;
3653 }
3654 ).set_examples({LLAMA_EXAMPLE_SERVER}));
3655
3656 add_opt(common_arg(
3657 {"--fim-qwen-7b-default"},
3658 string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"),
3659 [](common_params & params) {
3660 params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
3661 params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
3662 params.port = 8012;
3663 params.n_ubatch = 1024;
3664 params.n_batch = 1024;
3665 params.n_ctx = 0;
3666 params.n_cache_reuse = 256;
3667 }
3668 ).set_examples({LLAMA_EXAMPLE_SERVER}));
3669
3670 add_opt(common_arg(
3671 {"--fim-qwen-7b-spec"},
3672 string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
3673 [](common_params & params) {
3674 params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
3675 params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
3676 params.speculative.mparams_dft.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
3677 params.speculative.mparams_dft.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
3678 params.port = 8012;
3679 params.n_ubatch = 1024;
3680 params.n_batch = 1024;
3681 params.n_ctx = 0;
3682 params.n_cache_reuse = 256;
3683 }
3684 ).set_examples({LLAMA_EXAMPLE_SERVER}));
3685
3686 add_opt(common_arg(
3687 {"--fim-qwen-14b-spec"},
3688 string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
3689 [](common_params & params) {
3690 params.model.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
3691 params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
3692 params.speculative.mparams_dft.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
3693 params.speculative.mparams_dft.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
3694 params.port = 8012;
3695 params.n_ubatch = 1024;
3696 params.n_batch = 1024;
3697 params.n_ctx = 0;
3698 params.n_cache_reuse = 256;
3699 }
3700 ).set_examples({LLAMA_EXAMPLE_SERVER}));
3701
3702 add_opt(common_arg(
3703 {"--fim-qwen-30b-default"},
3704 string_format("use default Qwen 3 Coder 30B A3B Instruct (note: can download weights from the internet)"),
3705 [](common_params & params) {
3706 params.model.hf_repo = "ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF";
3707 params.model.hf_file = "qwen3-coder-30b-a3b-instruct-q8_0.gguf";
3708 params.port = 8012;
3709 params.n_ubatch = 1024;
3710 params.n_batch = 1024;
3711 params.n_ctx = 0;
3712 params.n_cache_reuse = 256;
3713 }
3714 ).set_examples({LLAMA_EXAMPLE_SERVER}));
3715
3716 add_opt(common_arg(
3717 {"--gpt-oss-20b-default"},
3718 string_format("use gpt-oss-20b (note: can download weights from the internet)"),
3719 [](common_params & params) {
3720 params.model.hf_repo = "ggml-org/gpt-oss-20b-GGUF";
3721 params.model.hf_file = "gpt-oss-20b-mxfp4.gguf";
3722 params.port = 8013;
3723 params.n_ubatch = 2048;
3724 params.n_batch = 32768;
3725 params.n_parallel = 2;
3726 params.n_ctx = 131072*params.n_parallel;
3727 params.sampling.temp = 1.0f;
3728 params.sampling.top_p = 1.0f;
3729 params.sampling.top_k = 0;
3730 params.sampling.min_p = 0.01f;
3731 params.use_jinja = true;
3732 //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
3733 }
3734 ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
3735
3736 add_opt(common_arg(
3737 {"--gpt-oss-120b-default"},
3738 string_format("use gpt-oss-120b (note: can download weights from the internet)"),
3739 [](common_params & params) {
3740 params.model.hf_repo = "ggml-org/gpt-oss-120b-GGUF";
3741 params.port = 8013;
3742 params.n_ubatch = 2048;
3743 params.n_batch = 32768;
3744 params.n_parallel = 2;
3745 params.n_ctx = 131072*params.n_parallel;
3746 params.sampling.temp = 1.0f;
3747 params.sampling.top_p = 1.0f;
3748 params.sampling.top_k = 0;
3749 params.sampling.min_p = 0.01f;
3750 params.use_jinja = true;
3751 //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
3752 }
3753 ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
3754
3755 add_opt(common_arg(
3756 {"--vision-gemma-4b-default"},
3757 string_format("use Gemma 3 4B QAT (note: can download weights from the internet)"),
3758 [](common_params & params) {
3759 params.model.hf_repo = "ggml-org/gemma-3-4b-it-qat-GGUF";
3760 params.port = 8014;
3761 params.n_ctx = 0;
3762 params.use_jinja = true;
3763 }
3764 ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
3765
3766 add_opt(common_arg(
3767 {"--vision-gemma-12b-default"},
3768 string_format("use Gemma 3 12B QAT (note: can download weights from the internet)"),
3769 [](common_params & params) {
3770 params.model.hf_repo = "ggml-org/gemma-3-12b-it-qat-GGUF";
3771 params.port = 8014;
3772 params.n_ctx = 0;
3773 params.use_jinja = true;
3774 }
3775 ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
3776
3777 return ctx_arg;
3778}
3779
3780void common_params_add_preset_options(std::vector<common_arg> & args) {
3781 // arguments below won't be treated as CLI args, only preset options
3782 args.push_back(common_arg(
3783 {"load-on-startup"}, "NAME",
3784 "in server router mode, autoload this model on startup",
3785 [](common_params &, const std::string &) { /* unused */ }
3786 ).set_env(COMMON_ARG_PRESET_LOAD_ON_STARTUP).set_preset_only());
3787
3788 args.push_back(common_arg(
3789 {"stop-timeout"}, "SECONDS",
3790 "in server router mode, force-kill model instance after this many seconds of graceful shutdown",
3791 [](common_params &, int) { /* unused */ }
3792 ).set_env(COMMON_ARG_PRESET_STOP_TIMEOUT).set_preset_only());
3793
3794 // args.push_back(common_arg(
3795 // {"pin"},
3796 // "in server router mode, do not unload this model if models_max is exceeded",
3797 // [](common_params &) { /* unused */ }
3798 // ).set_preset_only());
3799}