1#include <algorithm>
   2#include <array>
   3#include <cassert>
   4#include <chrono>
   5#include <cinttypes>
   6#include <clocale>
   7#include <cmath>
   8#include <cstdio>
   9#include <cstdlib>
  10#include <cstring>
  11#include <ctime>
  12#include <iterator>
  13#include <map>
  14#include <numeric>
  15#include <regex>
  16#include <sstream>
  17#include <string>
  18#include <thread>
  19#include <vector>
  20#include <unordered_set>
  21
  22#include "common.h"
  23#include "ggml.h"
  24#include "llama.h"
  25
  26#ifdef _WIN32
  27#    define WIN32_LEAN_AND_MEAN
  28#    ifndef NOMINMAX
  29#        define NOMINMAX
  30#    endif
  31#    include <windows.h>
  32#endif
  33
  34// utils
  35static uint64_t get_time_ns() {
  36    using clock = std::chrono::high_resolution_clock;
  37    return std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
  38}
  39
  40static bool tensor_buft_override_equal(const llama_model_tensor_buft_override& a, const llama_model_tensor_buft_override& b) {
  41    if (a.pattern != b.pattern) {
  42        // cString comparison that may be null
  43        if (a.pattern == nullptr || b.pattern == nullptr) {
  44            return false;
  45        }
  46        if (strcmp(a.pattern, b.pattern) != 0) {
  47            return false;
  48        }
  49    }
  50    if (a.buft != b.buft) {
  51        return false;
  52    }
  53    return true;
  54}
  55
  56static bool vec_tensor_buft_override_equal(const std::vector<llama_model_tensor_buft_override>& a, const std::vector<llama_model_tensor_buft_override>& b) {
  57    if (a.size() != b.size()) {
  58        return false;
  59    }
  60    for (size_t i = 0; i < a.size(); i++) {
  61        if (!tensor_buft_override_equal(a[i], b[i])) {
  62            return false;
  63        }
  64    }
  65    return true;
  66}
  67
  68static bool vec_vec_tensor_buft_override_equal(const std::vector<std::vector<llama_model_tensor_buft_override>>& a, const std::vector<std::vector<llama_model_tensor_buft_override>>& b) {
  69    if (a.size() != b.size()) {
  70        return false;
  71    }
  72    for (size_t i = 0; i < a.size(); i++) {
  73        if (!vec_tensor_buft_override_equal(a[i], b[i])) {
  74            return false;
  75        }
  76    }
  77    return true;
  78}
  79
  80template <class T> static std::string join(const std::vector<T> & values, const std::string & delim) {
  81    std::ostringstream str;
  82    for (size_t i = 0; i < values.size(); i++) {
  83        str << values[i];
  84        if (i < values.size() - 1) {
  85            str << delim;
  86        }
  87    }
  88    return str.str();
  89}
  90
  91template <typename T, typename F> static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
  92    std::vector<std::string> str_values;
  93    std::transform(values.begin(), values.end(), std::back_inserter(str_values), f);
  94    return str_values;
  95}
  96
  97template <typename T> static T avg(const std::vector<T> & v) {
  98    if (v.empty()) {
  99        return 0;
 100    }
 101    T sum = std::accumulate(v.begin(), v.end(), T(0));
 102    return sum / (T) v.size();
 103}
 104
 105template <typename T> static T stdev(const std::vector<T> & v) {
 106    if (v.size() <= 1) {
 107        return 0;
 108    }
 109    T mean   = avg(v);
 110    T sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), T(0));
 111    T stdev  = std::sqrt(sq_sum / (T) (v.size() - 1) - mean * mean * (T) v.size() / (T) (v.size() - 1));
 112    return stdev;
 113}
 114
 115static std::string get_cpu_info() {
 116    std::vector<std::string> cpu_list;
 117    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
 118        auto * dev      = ggml_backend_dev_get(i);
 119        auto   dev_type = ggml_backend_dev_type(dev);
 120        if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
 121            cpu_list.push_back(ggml_backend_dev_description(dev));
 122        }
 123    }
 124    return join(cpu_list, ", ");
 125}
 126
 127static std::string get_gpu_info() {
 128    std::vector<std::string> gpu_list;
 129    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
 130        auto * dev      = ggml_backend_dev_get(i);
 131        auto   dev_type = ggml_backend_dev_type(dev);
 132        if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU || dev_type == GGML_BACKEND_DEVICE_TYPE_IGPU) {
 133            gpu_list.push_back(ggml_backend_dev_description(dev));
 134        }
 135    }
 136    return join(gpu_list, ", ");
 137}
 138
 139static std::vector<ggml_backend_dev_t> parse_devices_arg(const std::string & value) {
 140    std::vector<ggml_backend_dev_t> devices;
 141    std::string                     trimmed = string_strip(value);
 142    if (trimmed.empty()) {
 143        throw std::invalid_argument("no devices specified");
 144    }
 145    if (trimmed == "auto") {
 146        return devices;
 147    }
 148
 149    auto dev_names = string_split<std::string>(trimmed, '/');
 150    if (dev_names.size() == 1 && string_strip(dev_names[0]) == "none") {
 151        devices.push_back(nullptr);
 152        return devices;
 153    }
 154
 155    for (auto & name : dev_names) {
 156        std::string dev_name = string_strip(name);
 157        if (dev_name.empty()) {
 158            throw std::invalid_argument("invalid device specification");
 159        }
 160        auto * dev = ggml_backend_dev_by_name(dev_name.c_str());
 161        if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
 162            throw std::invalid_argument(string_format("invalid device: %s", dev_name.c_str()));
 163        }
 164        devices.push_back(dev);
 165    }
 166
 167    devices.push_back(nullptr);
 168    return devices;
 169}
 170
 171static void register_rpc_server_list(const std::string & servers) {
 172    auto rpc_servers = string_split<std::string>(servers, ',');
 173    if (rpc_servers.empty()) {
 174        throw std::invalid_argument("no RPC servers specified");
 175    }
 176
 177    auto * rpc_reg = ggml_backend_reg_by_name("RPC");
 178    if (!rpc_reg) {
 179        throw std::invalid_argument("failed to find RPC backend");
 180    }
 181
 182    using add_rpc_server_fn = ggml_backend_reg_t (*)(const char * endpoint);
 183    auto * ggml_backend_rpc_add_server_fn = (add_rpc_server_fn) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_server");
 184    if (!ggml_backend_rpc_add_server_fn) {
 185        throw std::invalid_argument("failed to find RPC add server function");
 186    }
 187    for (const auto & server : rpc_servers) {
 188        auto reg = ggml_backend_rpc_add_server_fn(server.c_str());
 189        ggml_backend_register(reg);
 190    }
 191}
 192
 193static std::string devices_to_string(const std::vector<ggml_backend_dev_t> & devices) {
 194    if (devices.empty()) {
 195        return "auto";
 196    }
 197
 198    if (devices.size() == 1 && devices[0] == nullptr) {
 199        return "none";
 200    }
 201
 202    std::vector<std::string> names;
 203    for (auto * dev : devices) {
 204        if (dev == nullptr) {
 205            break;
 206        }
 207        names.push_back(ggml_backend_dev_name(dev));
 208    }
 209
 210    return join(names, "/");
 211}
 212
// command line params
// Output formats supported for the benchmark results (see output_format_str /
// output_format_from_str for the string names).
enum output_formats { NONE, CSV, JSON, JSONL, MARKDOWN, SQL };
 215
 216static const char * output_format_str(output_formats format) {
 217    switch (format) {
 218        case NONE:
 219            return "none";
 220        case CSV:
 221            return "csv";
 222        case JSON:
 223            return "json";
 224        case JSONL:
 225            return "jsonl";
 226        case MARKDOWN:
 227            return "md";
 228        case SQL:
 229            return "sql";
 230        default:
 231            GGML_ABORT("invalid output format");
 232    }
 233}
 234
 235static bool output_format_from_str(const std::string & s, output_formats & format) {
 236    if (s == "none") {
 237        format = NONE;
 238    } else if (s == "csv") {
 239        format = CSV;
 240    } else if (s == "json") {
 241        format = JSON;
 242    } else if (s == "jsonl") {
 243        format = JSONL;
 244    } else if (s == "md") {
 245        format = MARKDOWN;
 246    } else if (s == "sql") {
 247        format = SQL;
 248    } else {
 249        return false;
 250    }
 251    return true;
 252}
 253
 254static const char * split_mode_str(llama_split_mode mode) {
 255    switch (mode) {
 256        case LLAMA_SPLIT_MODE_NONE:
 257            return "none";
 258        case LLAMA_SPLIT_MODE_LAYER:
 259            return "layer";
 260        case LLAMA_SPLIT_MODE_ROW:
 261            return "row";
 262        default:
 263            GGML_ABORT("invalid split mode");
 264    }
 265}
 266
 267static std::string pair_str(const std::pair<int, int> & p) {
 268    static char buf[32];
 269    snprintf(buf, sizeof(buf), "%d,%d", p.first, p.second);
 270    return buf;
 271}
 272
 273static std::vector<int> parse_int_range(const std::string & s) {
 274    // first[-last[(+|*)step]]
 275    std::regex range_regex(R"(^(\d+)(?:-(\d+)(?:([\+|\*])(\d+))?)?(?:,|$))");
 276
 277    std::smatch match;
 278    std::string::const_iterator search_start(s.cbegin());
 279    std::vector<int> result;
 280    while (std::regex_search(search_start, s.cend(), match, range_regex)) {
 281        int  first = std::stoi(match[1]);
 282        int  last  = match[2].matched ? std::stoi(match[2]) : first;
 283        char op    = match[3].matched ? match[3].str()[0] : '+';
 284        int  step  = match[4].matched ? std::stoi(match[4]) : 1;
 285
 286        for (int i = first; i <= last;) {
 287            result.push_back(i);
 288
 289            int prev_i = i;
 290
 291            if (op == '+') {
 292                i += step;
 293            } else if (op == '*') {
 294                i *= step;
 295            } else {
 296                throw std::invalid_argument("invalid range format");
 297            }
 298
 299            if (i <= prev_i) {
 300                throw std::invalid_argument("invalid range");
 301            }
 302        }
 303        search_start = match.suffix().first;
 304    }
 305
 306    if (search_start != s.cend()) {
 307        throw std::invalid_argument("invalid range format");
 308    }
 309
 310    return result;
 311}
 312
// All benchmark settings parsed from the command line. Vector-valued fields
// hold every value requested for that parameter; the benchmark runs the
// cartesian product of them. NOTE: field order is load-bearing — the
// cmd_params_defaults aggregate below initializes these positionally.
struct cmd_params {
    std::vector<std::string>         model;
    std::vector<int>                 n_prompt;
    std::vector<int>                 n_gen;
    std::vector<std::pair<int, int>> n_pg;      // combined prompt+gen test pairs
    std::vector<int>                 n_depth;
    std::vector<int>                 n_batch;
    std::vector<int>                 n_ubatch;
    std::vector<ggml_type>           type_k;    // KV cache K type
    std::vector<ggml_type>           type_v;    // KV cache V type
    std::vector<int>                 n_threads;
    std::vector<std::string>         cpu_mask;
    std::vector<bool>                cpu_strict;
    std::vector<int>                 poll;
    std::vector<int>                 n_gpu_layers;
    std::vector<int>                 n_cpu_moe;
    std::vector<llama_split_mode>    split_mode;
    std::vector<int>                 main_gpu;
    std::vector<bool>                no_kv_offload;
    std::vector<bool>                flash_attn;
    std::vector<std::vector<ggml_backend_dev_t>> devices;
    std::vector<std::vector<float>>  tensor_split;
    std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
    std::vector<bool>                use_mmap;
    std::vector<bool>                use_direct_io;
    std::vector<bool>                embeddings;
    std::vector<bool>                no_op_offload;
    std::vector<bool>                no_host;
    // scalar (non-matrix) settings below
    ggml_numa_strategy               numa;
    int                              reps;      // repetitions per test
    ggml_sched_priority              prio;
    int                              delay;     // seconds between tests
    bool                             verbose;
    bool                             progress;
    bool                             no_warmup;
    output_formats                   output_format;
    output_formats                   output_format_stderr;
};
 351
// Defaults for every cmd_params field, in declaration order. This is
// positional aggregate initialization: keep the /* name */ comments and the
// order in sync with struct cmd_params above.
static const cmd_params cmd_params_defaults = {
    /* model                */ { "models/7B/ggml-model-q4_0.gguf" },
    /* n_prompt             */ { 512 },
    /* n_gen                */ { 128 },
    /* n_pg                 */ {},
    /* n_depth              */ { 0 },
    /* n_batch              */ { 2048 },
    /* n_ubatch             */ { 512 },
    /* type_k               */ { GGML_TYPE_F16 },
    /* type_v               */ { GGML_TYPE_F16 },
    /* n_threads            */ { cpu_get_num_math() },
    /* cpu_mask             */ { "0x0" },
    /* cpu_strict           */ { false },
    /* poll                 */ { 50 },
    /* n_gpu_layers         */ { 99 },
    /* n_cpu_moe            */ { 0 },
    /* split_mode           */ { LLAMA_SPLIT_MODE_LAYER },
    /* main_gpu             */ { 0 },
    /* no_kv_offload        */ { false },
    /* flash_attn           */ { false },
    /* devices              */ { {} },
    /* tensor_split         */ { std::vector<float>(llama_max_devices(), 0.0f) },
    /* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
    /* use_mmap             */ { false },
    /* use_direct_io        */ { false },
    /* embeddings           */ { false },
    /* no_op_offload        */ { false },
    /* no_host              */ { false },
    /* numa                 */ GGML_NUMA_STRATEGY_DISABLED,
    /* reps                 */ 5,
    /* prio                 */ GGML_SCHED_PRIO_NORMAL,
    /* delay                */ 0,
    /* verbose              */ false,
    /* progress             */ false,
    /* no_warmup            */ false,
    /* output_format        */ MARKDOWN,
    /* output_format_stderr */ NONE,
};
 390
// Print the full command-line usage text to stdout. Default values are pulled
// from cmd_params_defaults so the help stays in sync with the actual
// defaults. The RPC option is shown only when the build supports RPC.
static void print_usage(int /* argc */, char ** argv) {
    printf("usage: %s [options]\n", argv[0]);
    printf("\n");
    printf("options:\n");
    printf("  -h, --help\n");
    printf("  --numa <distribute|isolate|numactl>       numa mode (default: disabled)\n");
    printf("  -r, --repetitions <n>                     number of times to repeat each test (default: %d)\n",
           cmd_params_defaults.reps);
    printf("  --prio <-1|0|1|2|3>                          process/thread priority (default: %d)\n",
           cmd_params_defaults.prio);
    printf("  --delay <0...N> (seconds)                 delay between each test (default: %d)\n",
           cmd_params_defaults.delay);
    printf("  -o, --output <csv|json|jsonl|md|sql>      output format printed to stdout (default: %s)\n",
           output_format_str(cmd_params_defaults.output_format));
    printf("  -oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: %s)\n",
           output_format_str(cmd_params_defaults.output_format_stderr));
    printf("  --list-devices                            list available devices and exit\n");
    printf("  -v, --verbose                             verbose output\n");
    printf("  --progress                                print test progress indicators\n");
    printf("  --no-warmup                               skip warmup runs before benchmarking\n");
    if (llama_supports_rpc()) {
        printf("  -rpc, --rpc <rpc_servers>                 register RPC devices (comma separated)\n");
    }
    printf("\n");
    printf("test parameters:\n");
    printf("  -m, --model <filename>                    (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
    printf("  -p, --n-prompt <n>                        (default: %s)\n",
           join(cmd_params_defaults.n_prompt, ",").c_str());
    printf("  -n, --n-gen <n>                           (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
    printf("  -pg <pp,tg>                               (default: %s)\n",
           join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
    printf("  -d, --n-depth <n>                         (default: %s)\n",
           join(cmd_params_defaults.n_depth, ",").c_str());
    printf("  -b, --batch-size <n>                      (default: %s)\n",
           join(cmd_params_defaults.n_batch, ",").c_str());
    printf("  -ub, --ubatch-size <n>                    (default: %s)\n",
           join(cmd_params_defaults.n_ubatch, ",").c_str());
    printf("  -ctk, --cache-type-k <t>                  (default: %s)\n",
           join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
    printf("  -ctv, --cache-type-v <t>                  (default: %s)\n",
           join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
    printf("  -t, --threads <n>                         (default: %s)\n",
           join(cmd_params_defaults.n_threads, ",").c_str());
    printf("  -C, --cpu-mask <hex,hex>                  (default: %s)\n",
           join(cmd_params_defaults.cpu_mask, ",").c_str());
    printf("  --cpu-strict <0|1>                        (default: %s)\n",
           join(cmd_params_defaults.cpu_strict, ",").c_str());
    printf("  --poll <0...100>                          (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
    printf("  -ngl, --n-gpu-layers <n>                  (default: %s)\n",
           join(cmd_params_defaults.n_gpu_layers, ",").c_str());
    printf("  -ncmoe, --n-cpu-moe <n>                   (default: %s)\n",
           join(cmd_params_defaults.n_cpu_moe, ",").c_str());
    printf("  -sm, --split-mode <none|layer|row>        (default: %s)\n",
           join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
    printf("  -mg, --main-gpu <i>                       (default: %s)\n",
           join(cmd_params_defaults.main_gpu, ",").c_str());
    printf("  -nkvo, --no-kv-offload <0|1>              (default: %s)\n",
           join(cmd_params_defaults.no_kv_offload, ",").c_str());
    printf("  -fa, --flash-attn <0|1>                   (default: %s)\n",
           join(cmd_params_defaults.flash_attn, ",").c_str());
    printf("  -dev, --device <dev0/dev1/...>            (default: auto)\n");
    printf("  -mmp, --mmap <0|1>                        (default: %s)\n",
           join(cmd_params_defaults.use_mmap, ",").c_str());
    printf("  -dio, --direct-io <0|1>                   (default: %s)\n",
           join(cmd_params_defaults.use_direct_io, ",").c_str());
    printf("  -embd, --embeddings <0|1>                 (default: %s)\n",
           join(cmd_params_defaults.embeddings, ",").c_str());
    printf("  -ts, --tensor-split <ts0/ts1/..>          (default: 0)\n");
    printf("  -ot --override-tensor <tensor name pattern>=<buffer type>;...\n");
    printf("                                            (default: disabled)\n");
    printf("  -nopo, --no-op-offload <0|1>              (default: 0)\n");
    printf("  --no-host <0|1>                           (default: %s)\n",
           join(cmd_params_defaults.no_host, ",").c_str());
    printf("\n");
    printf(
        "Multiple values can be given for each parameter by separating them with ','\n"
        "or by specifying the parameter multiple times. Ranges can be given as\n"
        "'first-last' or 'first-last+step' or 'first-last*mult'.\n");
}
 470
 471static ggml_type ggml_type_from_name(const std::string & s) {
 472    if (s == "f16") {
 473        return GGML_TYPE_F16;
 474    }
 475    if (s == "bf16") {
 476        return GGML_TYPE_BF16;
 477    }
 478    if (s == "q8_0") {
 479        return GGML_TYPE_Q8_0;
 480    }
 481    if (s == "q4_0") {
 482        return GGML_TYPE_Q4_0;
 483    }
 484    if (s == "q4_1") {
 485        return GGML_TYPE_Q4_1;
 486    }
 487    if (s == "q5_0") {
 488        return GGML_TYPE_Q5_0;
 489    }
 490    if (s == "q5_1") {
 491        return GGML_TYPE_Q5_1;
 492    }
 493    if (s == "iq4_nl") {
 494        return GGML_TYPE_IQ4_NL;
 495    }
 496
 497    return GGML_TYPE_COUNT;
 498}
 499
 500static cmd_params parse_cmd_params(int argc, char ** argv) {
 501    cmd_params        params;
 502    std::string       arg;
 503    bool              invalid_param = false;
 504    const std::string arg_prefix    = "--";
 505    const char        split_delim   = ',';
 506
 507    params.verbose              = cmd_params_defaults.verbose;
 508    params.output_format        = cmd_params_defaults.output_format;
 509    params.output_format_stderr = cmd_params_defaults.output_format_stderr;
 510    params.reps                 = cmd_params_defaults.reps;
 511    params.numa                 = cmd_params_defaults.numa;
 512    params.prio                 = cmd_params_defaults.prio;
 513    params.delay                = cmd_params_defaults.delay;
 514    params.progress             = cmd_params_defaults.progress;
 515    params.no_warmup            = cmd_params_defaults.no_warmup;
 516
 517    for (int i = 1; i < argc; i++) {
 518        arg = argv[i];
 519        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
 520            std::replace(arg.begin(), arg.end(), '_', '-');
 521        }
 522
 523        try {
 524            if (arg == "-h" || arg == "--help") {
 525                print_usage(argc, argv);
 526                exit(0);
 527            } else if (arg == "-m" || arg == "--model") {
 528                if (++i >= argc) {
 529                    invalid_param = true;
 530                    break;
 531                }
 532                auto p = string_split<std::string>(argv[i], split_delim);
 533                params.model.insert(params.model.end(), p.begin(), p.end());
 534            } else if (arg == "-p" || arg == "--n-prompt") {
 535                if (++i >= argc) {
 536                    invalid_param = true;
 537                    break;
 538                }
 539                auto p = parse_int_range(argv[i]);
 540                params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
 541            } else if (arg == "-n" || arg == "--n-gen") {
 542                if (++i >= argc) {
 543                    invalid_param = true;
 544                    break;
 545                }
 546                auto p = parse_int_range(argv[i]);
 547                params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
 548            } else if (arg == "-pg") {
 549                if (++i >= argc) {
 550                    invalid_param = true;
 551                    break;
 552                }
 553                auto p = string_split<std::string>(argv[i], ',');
 554                if (p.size() != 2) {
 555                    invalid_param = true;
 556                    break;
 557                }
 558                params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
 559            } else if (arg == "-d" || arg == "--n-depth") {
 560                if (++i >= argc) {
 561                    invalid_param = true;
 562                    break;
 563                }
 564                auto p = parse_int_range(argv[i]);
 565                params.n_depth.insert(params.n_depth.end(), p.begin(), p.end());
 566            } else if (arg == "-b" || arg == "--batch-size") {
 567                if (++i >= argc) {
 568                    invalid_param = true;
 569                    break;
 570                }
 571                auto p = parse_int_range(argv[i]);
 572                params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
 573            } else if (arg == "-ub" || arg == "--ubatch-size") {
 574                if (++i >= argc) {
 575                    invalid_param = true;
 576                    break;
 577                }
 578                auto p = parse_int_range(argv[i]);
 579                params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
 580            } else if (arg == "-ctk" || arg == "--cache-type-k") {
 581                if (++i >= argc) {
 582                    invalid_param = true;
 583                    break;
 584                }
 585                auto p = string_split<std::string>(argv[i], split_delim);
 586
 587                std::vector<ggml_type> types;
 588                for (const auto & t : p) {
 589                    ggml_type gt = ggml_type_from_name(t);
 590                    if (gt == GGML_TYPE_COUNT) {
 591                        invalid_param = true;
 592                        break;
 593                    }
 594                    types.push_back(gt);
 595                }
 596                if (invalid_param) {
 597                    break;
 598                }
 599                params.type_k.insert(params.type_k.end(), types.begin(), types.end());
 600            } else if (arg == "-ctv" || arg == "--cache-type-v") {
 601                if (++i >= argc) {
 602                    invalid_param = true;
 603                    break;
 604                }
 605                auto p = string_split<std::string>(argv[i], split_delim);
 606
 607                std::vector<ggml_type> types;
 608                for (const auto & t : p) {
 609                    ggml_type gt = ggml_type_from_name(t);
 610                    if (gt == GGML_TYPE_COUNT) {
 611                        invalid_param = true;
 612                        break;
 613                    }
 614                    types.push_back(gt);
 615                }
 616                if (invalid_param) {
 617                    break;
 618                }
 619                params.type_v.insert(params.type_v.end(), types.begin(), types.end());
 620            } else if (arg == "-dev" || arg == "--device") {
 621                if (++i >= argc) {
 622                    invalid_param = true;
 623                    break;
 624                }
 625                auto combos = string_split<std::string>(argv[i], split_delim);
 626                for (const auto & combo : combos) {
 627                    try {
 628                        params.devices.push_back(parse_devices_arg(combo));
 629                    } catch (const std::exception & e) {
 630                        fprintf(stderr, "error: %s\n", e.what());
 631                        invalid_param = true;
 632                        break;
 633                    }
 634                }
 635                if (invalid_param) {
 636                    break;
 637                }
 638            } else if (arg == "--list-devices") {
 639                std::vector<ggml_backend_dev_t> devices;
 640                for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
 641                    auto * dev = ggml_backend_dev_get(i);
 642                    if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
 643                        devices.push_back(dev);
 644                    }
 645                }
 646                printf("Available devices:\n");
 647                if (devices.empty()) {
 648                    printf("  (none)\n");
 649                }
 650                for (auto * dev : devices) {
 651                    size_t free, total;
 652                    ggml_backend_dev_memory(dev, &free, &total);
 653                    printf("  %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
 654                }
 655                exit(0);
 656            } else if (arg == "-t" || arg == "--threads") {
 657                if (++i >= argc) {
 658                    invalid_param = true;
 659                    break;
 660                }
 661                auto p = parse_int_range(argv[i]);
 662                params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
 663            } else if (arg == "-C" || arg == "--cpu-mask") {
 664                if (++i >= argc) {
 665                    invalid_param = true;
 666                    break;
 667                }
 668                auto p = string_split<std::string>(argv[i], split_delim);
 669                params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
 670            } else if (arg == "--cpu-strict") {
 671                if (++i >= argc) {
 672                    invalid_param = true;
 673                    break;
 674                }
 675                auto p = string_split<bool>(argv[i], split_delim);
 676                params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
 677            } else if (arg == "--poll") {
 678                if (++i >= argc) {
 679                    invalid_param = true;
 680                    break;
 681                }
 682                auto p = parse_int_range(argv[i]);
 683                params.poll.insert(params.poll.end(), p.begin(), p.end());
 684            } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
 685                if (++i >= argc) {
 686                    invalid_param = true;
 687                    break;
 688                }
 689                auto p = parse_int_range(argv[i]);
 690                params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
 691            } else if (arg == "-ncmoe" || arg == "--n-cpu-moe") {
 692                if (++i >= argc) {
 693                    invalid_param = true;
 694                    break;
 695                }
 696                auto p = parse_int_range(argv[i]);
 697                params.n_cpu_moe.insert(params.n_cpu_moe.end(), p.begin(), p.end());
 698            } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) {
 699                if (++i >= argc) {
 700                    invalid_param = true;
 701                    break;
 702                }
 703                try {
 704                    register_rpc_server_list(argv[i]);
 705                } catch (const std::exception & e) {
 706                    fprintf(stderr, "error: %s\n", e.what());
 707                    invalid_param = true;
 708                    break;
 709                }
 710            } else if (arg == "-sm" || arg == "--split-mode") {
 711                if (++i >= argc) {
 712                    invalid_param = true;
 713                    break;
 714                }
 715                auto p = string_split<std::string>(argv[i], split_delim);
 716
 717                std::vector<llama_split_mode> modes;
 718                for (const auto & m : p) {
 719                    llama_split_mode mode;
 720                    if (m == "none") {
 721                        mode = LLAMA_SPLIT_MODE_NONE;
 722                    } else if (m == "layer") {
 723                        mode = LLAMA_SPLIT_MODE_LAYER;
 724                    } else if (m == "row") {
 725                        mode = LLAMA_SPLIT_MODE_ROW;
 726                    } else {
 727                        invalid_param = true;
 728                        break;
 729                    }
 730                    modes.push_back(mode);
 731                }
 732                if (invalid_param) {
 733                    break;
 734                }
 735                params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
 736            } else if (arg == "-mg" || arg == "--main-gpu") {
 737                if (++i >= argc) {
 738                    invalid_param = true;
 739                    break;
 740                }
 741                params.main_gpu = parse_int_range(argv[i]);
 742            } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
 743                if (++i >= argc) {
 744                    invalid_param = true;
 745                    break;
 746                }
 747                auto p = string_split<bool>(argv[i], split_delim);
 748                params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
 749            } else if (arg == "--numa") {
 750                if (++i >= argc) {
 751                    invalid_param = true;
 752                    break;
 753                }
 754                std::string value(argv[i]);
 755                if (value == "distribute" || value == "") {
 756                    params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE;
 757                } else if (value == "isolate") {
 758                    params.numa = GGML_NUMA_STRATEGY_ISOLATE;
 759                } else if (value == "numactl") {
 760                    params.numa = GGML_NUMA_STRATEGY_NUMACTL;
 761                } else {
 762                    invalid_param = true;
 763                    break;
 764                }
 765            } else if (arg == "-fa" || arg == "--flash-attn") {
 766                if (++i >= argc) {
 767                    invalid_param = true;
 768                    break;
 769                }
 770                auto p = string_split<bool>(argv[i], split_delim);
 771                params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
 772            } else if (arg == "-mmp" || arg == "--mmap") {
 773                if (++i >= argc) {
 774                    invalid_param = true;
 775                    break;
 776                }
 777                auto p = string_split<bool>(argv[i], split_delim);
 778                params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
 779            } else if (arg == "-dio" || arg == "--direct-io") {
 780                if (++i >= argc) {
 781                    invalid_param = true;
 782                    break;
 783                }
 784                auto p = string_split<bool>(argv[i], split_delim);
 785                params.use_direct_io.insert(params.use_direct_io.end(), p.begin(), p.end());
 786            } else if (arg == "-embd" || arg == "--embeddings") {
 787                if (++i >= argc) {
 788                    invalid_param = true;
 789                    break;
 790                }
 791                auto p = string_split<bool>(argv[i], split_delim);
 792                params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
 793            } else if (arg == "-nopo" || arg == "--no-op-offload") {
 794                if (++i >= argc) {
 795                    invalid_param = true;
 796                    break;
 797                }
 798                auto p = string_split<bool>(argv[i], split_delim);
 799                params.no_op_offload.insert(params.no_op_offload.end(), p.begin(), p.end());
 800            } else if (arg == "--no-host") {
 801                if (++i >= argc) {
 802                    invalid_param = true;
 803                    break;
 804                }
 805                auto p = string_split<bool>(argv[i], split_delim);
 806                params.no_host.insert(params.no_host.end(), p.begin(), p.end());
 807            } else if (arg == "-ts" || arg == "--tensor-split") {
 808                if (++i >= argc) {
 809                    invalid_param = true;
 810                    break;
 811                }
 812                for (auto ts : string_split<std::string>(argv[i], split_delim)) {
 813                    // split string by ; and /
 814                    const std::regex           regex{ R"([;/]+)" };
 815                    std::sregex_token_iterator it{ ts.begin(), ts.end(), regex, -1 };
 816                    std::vector<std::string>   split_arg{ it, {} };
 817                    GGML_ASSERT(split_arg.size() <= llama_max_devices());
 818
 819                    std::vector<float> tensor_split(llama_max_devices());
 820                    for (size_t i = 0; i < llama_max_devices(); ++i) {
 821                        if (i < split_arg.size()) {
 822                            tensor_split[i] = std::stof(split_arg[i]);
 823                        } else {
 824                            tensor_split[i] = 0.0f;
 825                        }
 826                    }
 827                    params.tensor_split.push_back(tensor_split);
 828                }
 829            } else if (arg == "-ot" || arg == "--override-tensor") {
 830                if (++i >= argc) {
 831                    invalid_param = true;
 832                    break;
 833                }
 834                auto * value = argv[i];
 835                /* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
 836                if (buft_list.empty()) {
 837                    // enumerate all the devices and add their buffer types to the list
 838                    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
 839                        auto * dev = ggml_backend_dev_get(i);
 840                        auto * buft = ggml_backend_dev_buffer_type(dev);
 841                        if (buft) {
 842                            buft_list[ggml_backend_buft_name(buft)] = buft;
 843                        }
 844                    }
 845                }
 846                auto override_group_span_len = std::strcspn(value, ",");
 847                bool last_group = false;
 848                do {
 849                    if (override_group_span_len == 0) {
 850                        // Adds an empty override-tensors for an empty span
 851                        params.tensor_buft_overrides.push_back({{}});
 852                        if (value[override_group_span_len] == '\0') {
 853                            value = &value[override_group_span_len];
 854                            last_group = true;
 855                        } else {
 856                            value = &value[override_group_span_len + 1];
 857                            override_group_span_len = std::strcspn(value, ",");
 858                        }
 859                        continue;
 860                    }
 861                    // Stamps null terminators into the argv
 862                    // value for this option to avoid the
 863                    // memory leak present in the implementation
 864                    // over in arg.cpp. Acceptable because we
 865                    // only parse these args once in this program.
 866                    auto * override_group = value;
 867                    if (value[override_group_span_len] == '\0') {
 868                        value = &value[override_group_span_len];
 869                        last_group = true;
 870                    } else {
 871                        value[override_group_span_len] = '\0';
 872                        value = &value[override_group_span_len + 1];
 873                    }
 874                    std::vector<llama_model_tensor_buft_override> group_tensor_buft_overrides{};
 875                    auto override_span_len = std::strcspn(override_group, ";");
 876                    while (override_span_len > 0) {
 877                        auto * override = override_group;
 878                        if (override_group[override_span_len] != '\0') {
 879                            override_group[override_span_len] = '\0';
 880                            override_group = &override_group[override_span_len + 1];
 881                        } else {
 882                            override_group = &override_group[override_span_len];
 883                        }
 884                        auto tensor_name_span_len = std::strcspn(override, "=");
 885                        if (tensor_name_span_len >= override_span_len) {
 886                            invalid_param = true;
 887                            break;
 888                        }
 889                        override[tensor_name_span_len] = '\0';
 890                        auto * tensor_name = override;
 891                        auto * buffer_type = &override[tensor_name_span_len + 1];
 892                        if (buft_list.find(buffer_type) == buft_list.end()) {
 893                            printf("error: unrecognized buffer type '%s'\n", buffer_type);
 894                            printf("Available buffer types:\n");
 895                            for (const auto & it : buft_list) {
 896                                printf("  %s\n", ggml_backend_buft_name(it.second));
 897                            }
 898                            invalid_param = true;
 899                            break;
 900                        }
 901                        group_tensor_buft_overrides.push_back({tensor_name, buft_list.at(buffer_type)});
 902                        override_span_len = std::strcspn(override_group, ";");
 903                    }
 904                    if (invalid_param) {
 905                        break;
 906                    }
 907                    group_tensor_buft_overrides.push_back({nullptr,nullptr});
 908                    params.tensor_buft_overrides.push_back(group_tensor_buft_overrides);
 909                    override_group_span_len = std::strcspn(value, ",");
 910                } while (!last_group);
 911            } else if (arg == "-r" || arg == "--repetitions") {
 912                if (++i >= argc) {
 913                    invalid_param = true;
 914                    break;
 915                }
 916                params.reps = std::stoi(argv[i]);
 917            } else if (arg == "--prio") {
 918                if (++i >= argc) {
 919                    invalid_param = true;
 920                    break;
 921                }
 922                params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
 923            } else if (arg == "--delay") {
 924                if (++i >= argc) {
 925                    invalid_param = true;
 926                    break;
 927                }
 928                params.delay = std::stoi(argv[i]);
 929            } else if (arg == "-o" || arg == "--output") {
 930                if (++i >= argc) {
 931                    invalid_param = true;
 932                    break;
 933                }
 934                invalid_param = !output_format_from_str(argv[i], params.output_format);
 935            } else if (arg == "-oe" || arg == "--output-err") {
 936                if (++i >= argc) {
 937                    invalid_param = true;
 938                    break;
 939                }
 940                invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
 941            } else if (arg == "-v" || arg == "--verbose") {
 942                params.verbose = true;
 943            } else if (arg == "--progress") {
 944                params.progress = true;
 945            } else if (arg == "--no-warmup") {
 946                params.no_warmup = true;
 947            } else {
 948                invalid_param = true;
 949                break;
 950            }
 951        } catch (const std::exception & e) {
 952            fprintf(stderr, "error: %s\n", e.what());
 953            invalid_param = true;
 954            break;
 955        }
 956    }
 957
 958    if (invalid_param) {
 959        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
 960        print_usage(argc, argv);
 961        exit(1);
 962    }
 963
 964    // set defaults
 965    if (params.model.empty()) {
 966        params.model = cmd_params_defaults.model;
 967    }
 968    if (params.n_prompt.empty()) {
 969        params.n_prompt = cmd_params_defaults.n_prompt;
 970    }
 971    if (params.n_gen.empty()) {
 972        params.n_gen = cmd_params_defaults.n_gen;
 973    }
 974    if (params.n_pg.empty()) {
 975        params.n_pg = cmd_params_defaults.n_pg;
 976    }
 977    if (params.n_depth.empty()) {
 978        params.n_depth = cmd_params_defaults.n_depth;
 979    }
 980    if (params.n_batch.empty()) {
 981        params.n_batch = cmd_params_defaults.n_batch;
 982    }
 983    if (params.n_ubatch.empty()) {
 984        params.n_ubatch = cmd_params_defaults.n_ubatch;
 985    }
 986    if (params.type_k.empty()) {
 987        params.type_k = cmd_params_defaults.type_k;
 988    }
 989    if (params.type_v.empty()) {
 990        params.type_v = cmd_params_defaults.type_v;
 991    }
 992    if (params.n_gpu_layers.empty()) {
 993        params.n_gpu_layers = cmd_params_defaults.n_gpu_layers;
 994    }
 995    if (params.n_cpu_moe.empty()) {
 996        params.n_cpu_moe = cmd_params_defaults.n_cpu_moe;
 997    }
 998    if (params.split_mode.empty()) {
 999        params.split_mode = cmd_params_defaults.split_mode;
1000    }
1001    if (params.main_gpu.empty()) {
1002        params.main_gpu = cmd_params_defaults.main_gpu;
1003    }
1004    if (params.no_kv_offload.empty()) {
1005        params.no_kv_offload = cmd_params_defaults.no_kv_offload;
1006    }
1007    if (params.flash_attn.empty()) {
1008        params.flash_attn = cmd_params_defaults.flash_attn;
1009    }
1010    if (params.devices.empty()) {
1011        params.devices = cmd_params_defaults.devices;
1012    }
1013    if (params.tensor_split.empty()) {
1014        params.tensor_split = cmd_params_defaults.tensor_split;
1015    }
1016    if (params.tensor_buft_overrides.empty()) {
1017        params.tensor_buft_overrides = cmd_params_defaults.tensor_buft_overrides;
1018    }
1019    if (params.use_mmap.empty()) {
1020        params.use_mmap = cmd_params_defaults.use_mmap;
1021    }
1022    if (params.use_direct_io.empty()) {
1023        params.use_direct_io = cmd_params_defaults.use_direct_io;
1024    }
1025    if (params.embeddings.empty()) {
1026        params.embeddings = cmd_params_defaults.embeddings;
1027    }
1028    if (params.no_op_offload.empty()) {
1029        params.no_op_offload = cmd_params_defaults.no_op_offload;
1030    }
1031    if (params.no_host.empty()) {
1032        params.no_host = cmd_params_defaults.no_host;
1033    }
1034    if (params.n_threads.empty()) {
1035        params.n_threads = cmd_params_defaults.n_threads;
1036    }
1037    if (params.cpu_mask.empty()) {
1038        params.cpu_mask = cmd_params_defaults.cpu_mask;
1039    }
1040    if (params.cpu_strict.empty()) {
1041        params.cpu_strict = cmd_params_defaults.cpu_strict;
1042    }
1043    if (params.poll.empty()) {
1044        params.poll = cmd_params_defaults.poll;
1045    }
1046
1047    return params;
1048}
1049
// One concrete combination of benchmark parameters: the cross product of all
// multi-valued cmd_params vectors is expanded into a list of these instances,
// each describing a single model load + context configuration to benchmark.
struct cmd_params_instance {
    std::string        model;
    int                n_prompt;
    int                n_gen;
    int                n_depth;
    int                n_batch;
    int                n_ubatch;
    ggml_type          type_k;
    ggml_type          type_v;
    int                n_threads;
    std::string        cpu_mask;
    bool               cpu_strict;
    int                poll;
    int                n_gpu_layers;
    int                n_cpu_moe;
    llama_split_mode   split_mode;
    int                main_gpu;
    bool               no_kv_offload;
    bool               flash_attn;
    std::vector<ggml_backend_dev_t> devices;
    std::vector<float> tensor_split;
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
    bool               use_mmap;
    bool               use_direct_io;
    bool               embeddings;
    bool               no_op_offload;
    bool               no_host;

    // Build the llama_model_params for this instance. The returned struct holds
    // raw pointers into this instance (devices, tensor_split, tensor_buft_overrides)
    // and, when n_cpu_moe > 0, into function-local static vectors — see below.
    llama_model_params to_llama_mparams() const {
        llama_model_params mparams = llama_model_default_params();

        mparams.n_gpu_layers = n_gpu_layers;
        if (!devices.empty()) {
            // llama_model_params wants a mutable pointer; the API does not modify it
            mparams.devices = const_cast<ggml_backend_dev_t *>(devices.data());
        }
        mparams.split_mode    = split_mode;
        mparams.main_gpu      = main_gpu;
        mparams.tensor_split  = tensor_split.data();
        mparams.use_mmap      = use_mmap;
        mparams.use_direct_io = use_direct_io;
        mparams.no_host       = no_host;

        if (n_cpu_moe <= 0) {
            if (tensor_buft_overrides.empty()) {
                mparams.tensor_buft_overrides = nullptr;
            } else {
                // the override list must be terminated with a {nullptr, nullptr} sentinel
                GGML_ASSERT(tensor_buft_overrides.back().pattern == nullptr &&
                            "Tensor buffer overrides not terminated with empty pattern");
                mparams.tensor_buft_overrides = tensor_buft_overrides.data();
            }
        } else {
            // n_cpu_moe > 0: append per-layer overrides that pin the first n_cpu_moe
            // FFN-expert blocks to the CPU buffer type, on top of any user overrides.
            // `merged` holds the combined override list and `patterns` owns the regex
            // strings that merged's C-string pointers reference.
            // NOTE(review): these are function-local statics cleared on every call, so
            // the pointer stored in the returned mparams is invalidated by the next
            // call with n_cpu_moe > 0, and this is not thread-safe — acceptable here
            // presumably because instances are processed one at a time; confirm callers.
            static std::vector<llama_model_tensor_buft_override> merged;
            static std::vector<std::string> patterns;

            merged.clear();
            patterns.clear();

            // copy the user-supplied overrides, dropping their null terminator (if any)
            // so the extra entries can be appended before re-terminating
            auto first = tensor_buft_overrides.begin();
            auto last  = tensor_buft_overrides.end();
            if (first != last && (last - 1)->pattern == nullptr) {
                --last;
            }
            merged.insert(merged.end(), first, last);

            // reserve up front so patterns never reallocates — merged stores
            // c_str() pointers into its elements
            patterns.reserve((size_t) n_cpu_moe);
            merged.reserve(merged.size() + (size_t) n_cpu_moe + 1);

            for (int i = 0; i < n_cpu_moe; ++i) {
                // llm_ffn_exps_block_regex(i) presumably yields a pattern matching
                // layer i's expert FFN tensors — defined elsewhere in the project
                patterns.push_back(llm_ffn_exps_block_regex(i));
                merged.push_back({ patterns.back().c_str(),
                                ggml_backend_cpu_buffer_type() });
            }

            // re-terminate the merged list with the sentinel entry
            merged.push_back({ nullptr, nullptr });

            mparams.tensor_buft_overrides = merged.data();
        }

        return mparams;
    }

    // True when `other` would produce an equivalent llama_model_params, i.e. the
    // already-loaded model can be reused instead of reloading it from disk.
    bool equal_mparams(const cmd_params_instance & other) const {
        return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe &&
               split_mode == other.split_mode &&
               main_gpu == other.main_gpu && tensor_split == other.tensor_split &&
               use_mmap == other.use_mmap && use_direct_io == other.use_direct_io &&
               devices == other.devices &&
               no_host == other.no_host &&
               vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
    }

    // Build the llama_context_params for this instance (no pointers into `this`).
    llama_context_params to_llama_cparams() const {
        llama_context_params cparams = llama_context_default_params();

        // context must fit the prompt, the generated tokens, and the depth prefix
        cparams.n_ctx           = n_prompt + n_gen + n_depth;
        cparams.n_batch         = n_batch;
        cparams.n_ubatch        = n_ubatch;
        cparams.type_k          = type_k;
        cparams.type_v          = type_v;
        cparams.offload_kqv     = !no_kv_offload;
        cparams.flash_attn_type = flash_attn ? LLAMA_FLASH_ATTN_TYPE_ENABLED : LLAMA_FLASH_ATTN_TYPE_DISABLED;
        cparams.embeddings      = embeddings;
        cparams.op_offload      = !no_op_offload;
        cparams.swa_full        = false;

        return cparams;
    }
};
1158
1159static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
1160    std::vector<cmd_params_instance> instances;
1161
1162    // this ordering minimizes the number of times that each model needs to be reloaded
1163    // clang-format off
1164    for (const auto & m : params.model)
1165    for (const auto & nl : params.n_gpu_layers)
1166    for (const auto & ncmoe : params.n_cpu_moe)
1167    for (const auto & sm : params.split_mode)
1168    for (const auto & mg : params.main_gpu)
1169    for (const auto & devs : params.devices)
1170    for (const auto & ts : params.tensor_split)
1171    for (const auto & ot : params.tensor_buft_overrides)
1172    for (const auto & mmp : params.use_mmap)
1173    for (const auto & dio : params.use_direct_io)
1174    for (const auto & noh : params.no_host)
1175    for (const auto & embd : params.embeddings)
1176    for (const auto & nopo : params.no_op_offload)
1177    for (const auto & nb : params.n_batch)
1178    for (const auto & nub : params.n_ubatch)
1179    for (const auto & tk : params.type_k)
1180    for (const auto & tv : params.type_v)
1181    for (const auto & nkvo : params.no_kv_offload)
1182    for (const auto & fa : params.flash_attn)
1183    for (const auto & nt : params.n_threads)
1184    for (const auto & cm : params.cpu_mask)
1185    for (const auto & cs : params.cpu_strict)
1186    for (const auto & nd : params.n_depth)
1187    for (const auto & pl : params.poll) {
1188        for (const auto & n_prompt : params.n_prompt) {
1189            if (n_prompt == 0) {
1190                continue;
1191            }
1192            cmd_params_instance instance = {
1193                /* .model        = */ m,
1194                /* .n_prompt     = */ n_prompt,
1195                /* .n_gen        = */ 0,
1196                /* .n_depth      = */ nd,
1197                /* .n_batch      = */ nb,
1198                /* .n_ubatch     = */ nub,
1199                /* .type_k       = */ tk,
1200                /* .type_v       = */ tv,
1201                /* .n_threads    = */ nt,
1202                /* .cpu_mask     = */ cm,
1203                /* .cpu_strict   = */ cs,
1204                /* .poll         = */ pl,
1205                /* .n_gpu_layers = */ nl,
1206                /* .n_cpu_moe    = */ ncmoe,
1207                /* .split_mode   = */ sm,
1208                /* .main_gpu     = */ mg,
1209                /* .no_kv_offload= */ nkvo,
1210                /* .flash_attn   = */ fa,
1211                /* .devices      = */ devs,
1212                /* .tensor_split = */ ts,
1213                /* .tensor_buft_overrides = */ ot,
1214                /* .use_mmap     = */ mmp,
1215                /* .use_direct_io= */ dio,
1216                /* .embeddings   = */ embd,
1217                /* .no_op_offload= */ nopo,
1218                /* .no_host      = */ noh,
1219            };
1220            instances.push_back(instance);
1221        }
1222
1223        for (const auto & n_gen : params.n_gen) {
1224            if (n_gen == 0) {
1225                continue;
1226            }
1227            cmd_params_instance instance = {
1228                /* .model        = */ m,
1229                /* .n_prompt     = */ 0,
1230                /* .n_gen        = */ n_gen,
1231                /* .n_depth      = */ nd,
1232                /* .n_batch      = */ nb,
1233                /* .n_ubatch     = */ nub,
1234                /* .type_k       = */ tk,
1235                /* .type_v       = */ tv,
1236                /* .n_threads    = */ nt,
1237                /* .cpu_mask     = */ cm,
1238                /* .cpu_strict   = */ cs,
1239                /* .poll         = */ pl,
1240                /* .n_gpu_layers = */ nl,
1241                /* .n_cpu_moe    = */ ncmoe,
1242                /* .split_mode   = */ sm,
1243                /* .main_gpu     = */ mg,
1244                /* .no_kv_offload= */ nkvo,
1245                /* .flash_attn   = */ fa,
1246                /* .devices      = */ devs,
1247                /* .tensor_split = */ ts,
1248                /* .tensor_buft_overrides = */ ot,
1249                /* .use_mmap     = */ mmp,
1250                /* .use_direct_io= */ dio,
1251                /* .embeddings   = */ embd,
1252                /* .no_op_offload= */ nopo,
1253                /* .no_host      = */ noh,
1254            };
1255            instances.push_back(instance);
1256        }
1257
1258        for (const auto & n_pg : params.n_pg) {
1259            if (n_pg.first == 0 && n_pg.second == 0) {
1260                continue;
1261            }
1262            cmd_params_instance instance = {
1263                /* .model        = */ m,
1264                /* .n_prompt     = */ n_pg.first,
1265                /* .n_gen        = */ n_pg.second,
1266                /* .n_depth      = */ nd,
1267                /* .n_batch      = */ nb,
1268                /* .n_ubatch     = */ nub,
1269                /* .type_k       = */ tk,
1270                /* .type_v       = */ tv,
1271                /* .n_threads    = */ nt,
1272                /* .cpu_mask     = */ cm,
1273                /* .cpu_strict   = */ cs,
1274                /* .poll         = */ pl,
1275                /* .n_gpu_layers = */ nl,
1276                /* .n_cpu_moe    = */ ncmoe,
1277                /* .split_mode   = */ sm,
1278                /* .main_gpu     = */ mg,
1279                /* .no_kv_offload= */ nkvo,
1280                /* .flash_attn   = */ fa,
1281                /* .devices      = */ devs,
1282                /* .tensor_split = */ ts,
1283                /* .tensor_buft_overrides = */ ot,
1284                /* .use_mmap     = */ mmp,
1285                /* .use_direct_io= */ dio,
1286                /* .embeddings   = */ embd,
1287                /* .no_op_offload= */ nopo,
1288                /* .no_host      = */ noh,
1289            };
1290            instances.push_back(instance);
1291        }
1292    }
1293    // clang-format on
1294
1295    return instances;
1296}
1297
1298struct test {
1299    static const std::string build_commit;
1300    static const int         build_number;
1301    const std::string        cpu_info;
1302    const std::string        gpu_info;
1303    std::string              model_filename;
1304    std::string              model_type;
1305    uint64_t                 model_size;
1306    uint64_t                 model_n_params;
1307    int                      n_batch;
1308    int                      n_ubatch;
1309    int                      n_threads;
1310    std::string              cpu_mask;
1311    bool                     cpu_strict;
1312    int                      poll;
1313    ggml_type                type_k;
1314    ggml_type                type_v;
1315    int                      n_gpu_layers;
1316    int                      n_cpu_moe;
1317    llama_split_mode         split_mode;
1318    int                      main_gpu;
1319    bool                     no_kv_offload;
1320    bool                     flash_attn;
1321    std::vector<ggml_backend_dev_t> devices;
1322    std::vector<float>       tensor_split;
1323    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
1324    bool                     use_mmap;
1325    bool                     use_direct_io;
1326    bool                     embeddings;
1327    bool                     no_op_offload;
1328    bool                     no_host;
1329    int                      n_prompt;
1330    int                      n_gen;
1331    int                      n_depth;
1332    std::string              test_time;
1333    std::vector<uint64_t>    samples_ns;
1334
1335    test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) :
1336        cpu_info(get_cpu_info()),
1337        gpu_info(get_gpu_info()) {
1338
1339        model_filename = inst.model;
1340        char buf[128];
1341        llama_model_desc(lmodel, buf, sizeof(buf));
1342        model_type     = buf;
1343        model_size     = llama_model_size(lmodel);
1344        model_n_params = llama_model_n_params(lmodel);
1345        n_batch        = inst.n_batch;
1346        n_ubatch       = inst.n_ubatch;
1347        n_threads      = inst.n_threads;
1348        cpu_mask       = inst.cpu_mask;
1349        cpu_strict     = inst.cpu_strict;
1350        poll           = inst.poll;
1351        type_k         = inst.type_k;
1352        type_v         = inst.type_v;
1353        n_gpu_layers   = inst.n_gpu_layers;
1354        n_cpu_moe      = inst.n_cpu_moe;
1355        split_mode     = inst.split_mode;
1356        main_gpu       = inst.main_gpu;
1357        no_kv_offload  = inst.no_kv_offload;
1358        flash_attn     = inst.flash_attn;
1359        devices        = inst.devices;
1360        tensor_split   = inst.tensor_split;
1361        tensor_buft_overrides = inst.tensor_buft_overrides;
1362        use_mmap       = inst.use_mmap;
1363        use_direct_io  = inst.use_direct_io;
1364        embeddings     = inst.embeddings;
1365        no_op_offload  = inst.no_op_offload;
1366        no_host        = inst.no_host;
1367        n_prompt       = inst.n_prompt;
1368        n_gen          = inst.n_gen;
1369        n_depth        = inst.n_depth;
1370        // RFC 3339 date-time format
1371        time_t t       = time(NULL);
1372        std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
1373        test_time = buf;
1374
1375        (void) ctx;
1376    }
1377
1378    uint64_t avg_ns() const { return ::avg(samples_ns); }
1379
1380    uint64_t stdev_ns() const { return ::stdev(samples_ns); }
1381
1382    std::vector<double> get_ts() const {
1383        int                 n_tokens = n_prompt + n_gen;
1384        std::vector<double> ts;
1385        std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts),
1386                       [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; });
1387        return ts;
1388    }
1389
1390    double avg_ts() const { return ::avg(get_ts()); }
1391
1392    double stdev_ts() const { return ::stdev(get_ts()); }
1393
1394    static std::string get_backend() {
1395        std::vector<std::string> backends;
1396        bool                     rpc_used = false;
1397        for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
1398            auto *      reg  = ggml_backend_reg_get(i);
1399            std::string name = ggml_backend_reg_name(reg);
1400            if (string_starts_with(name, "RPC")) {
1401                if (ggml_backend_reg_dev_count(reg) > 0) {
1402                    rpc_used = true;
1403                }
1404            } else {
1405                if (name != "CPU") {
1406                    backends.push_back(ggml_backend_reg_name(reg));
1407                }
1408            }
1409        }
1410        if (rpc_used) {
1411            backends.push_back("RPC");
1412        }
1413        return backends.empty() ? "CPU" : join(backends, ",");
1414    }
1415
1416    static const std::vector<std::string> & get_fields() {
1417        static const std::vector<std::string> fields = {
1418            "build_commit",   "build_number",   "cpu_info",      "gpu_info",       "backends",
1419            "model_filename", "model_type",     "model_size",    "model_n_params", "n_batch",
1420            "n_ubatch",       "n_threads",      "cpu_mask",      "cpu_strict",     "poll",
1421            "type_k",         "type_v",         "n_gpu_layers",  "n_cpu_moe",      "split_mode",
1422            "main_gpu",       "no_kv_offload",  "flash_attn",    "devices",        "tensor_split",
1423            "tensor_buft_overrides",            "use_mmap",      "use_direct_io",  "embeddings",
1424            "no_op_offload",  "no_host",        "n_prompt",      "n_gen",          "n_depth",
1425            "test_time",      "avg_ns",         "stddev_ns",     "avg_ts",         "stddev_ts"
1426        };
1427        return fields;
1428    }
1429
1430    enum field_type { STRING, BOOL, INT, FLOAT };
1431
1432    static field_type get_field_type(const std::string & field) {
1433        if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
1434            field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
1435            field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || field == "avg_ns" ||
1436            field == "stddev_ns" || field == "no_op_offload" || field == "n_cpu_moe") {
1437            return INT;
1438        }
1439        if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
1440            field == "use_mmap" || field == "use_direct_io" || field == "embeddings" || field == "no_host") {
1441            return BOOL;
1442        }
1443        if (field == "avg_ts" || field == "stddev_ts") {
1444            return FLOAT;
1445        }
1446        return STRING;
1447    }
1448
    // Render this test result as one string per column, in the exact order of
    // get_fields(); the two lists must stay index-aligned.
    std::vector<std::string> get_values() const {
        std::string tensor_split_str;
        std::string tensor_buft_overrides_str;
        // find the last device with a non-zero split fraction so that trailing
        // zero entries are omitted from the printed "a/b/c" string
        int         max_nonzero = 0;
        for (size_t i = 0; i < llama_max_devices(); i++) {
            if (tensor_split[i] > 0) {
                max_nonzero = i;
            }
        }
        // join split fractions 0..max_nonzero with '/' separators
        for (int i = 0; i <= max_nonzero; i++) {
            char buf[32];
            snprintf(buf, sizeof(buf), "%.2f", tensor_split[i]);
            tensor_split_str += buf;
            if (i < max_nonzero) {
                tensor_split_str += "/";
            }
        }
        if (tensor_buft_overrides.size() == 1) {
            // Last element of tensor_buft_overrides is always a null pattern
            // so if it is only one element long, it must be a null pattern.
            GGML_ASSERT(tensor_buft_overrides[0].pattern == nullptr);
            tensor_buft_overrides_str += "none";
        } else {
            // join "pattern=buft" pairs with ';', skipping the null terminator entry
            for (size_t i = 0; i < tensor_buft_overrides.size()-1; i++) {
                // Last element of tensor_buft_overrides is always a null pattern
                if (tensor_buft_overrides[i].pattern == nullptr) {
                    tensor_buft_overrides_str += "none";
                } else {
                    tensor_buft_overrides_str += tensor_buft_overrides[i].pattern;
                    tensor_buft_overrides_str += "=";
                    tensor_buft_overrides_str += ggml_backend_buft_name(tensor_buft_overrides[i].buft);
                }
                // i + 2 because the terminating null entry is never printed
                if (i + 2 < tensor_buft_overrides.size()) {
                    tensor_buft_overrides_str += ";";
                }
            }
        }
        // one entry per field, same order as get_fields()
        std::vector<std::string> values = { build_commit,
                                            std::to_string(build_number),
                                            cpu_info,
                                            gpu_info,
                                            get_backend(),
                                            model_filename,
                                            model_type,
                                            std::to_string(model_size),
                                            std::to_string(model_n_params),
                                            std::to_string(n_batch),
                                            std::to_string(n_ubatch),
                                            std::to_string(n_threads),
                                            cpu_mask,
                                            std::to_string(cpu_strict),
                                            std::to_string(poll),
                                            ggml_type_name(type_k),
                                            ggml_type_name(type_v),
                                            std::to_string(n_gpu_layers),
                                            std::to_string(n_cpu_moe),
                                            split_mode_str(split_mode),
                                            std::to_string(main_gpu),
                                            std::to_string(no_kv_offload),
                                            std::to_string(flash_attn),
                                            devices_to_string(devices),
                                            tensor_split_str,
                                            tensor_buft_overrides_str,
                                            std::to_string(use_mmap),
                                            std::to_string(use_direct_io),
                                            std::to_string(embeddings),
                                            std::to_string(no_op_offload),
                                            std::to_string(no_host),
                                            std::to_string(n_prompt),
                                            std::to_string(n_gen),
                                            std::to_string(n_depth),
                                            test_time,
                                            std::to_string(avg_ns()),
                                            std::to_string(stdev_ns()),
                                            std::to_string(avg_ts()),
                                            std::to_string(stdev_ts()) };
        return values;
    }
1527
1528    std::map<std::string, std::string> get_map() const {
1529        std::map<std::string, std::string> map;
1530        auto                               fields = get_fields();
1531        auto                               values = get_values();
1532        std::transform(fields.begin(), fields.end(), values.begin(), std::inserter(map, map.end()),
1533                       std::make_pair<const std::string &, const std::string &>);
1534        return map;
1535    }
1536};
1537
// out-of-class definitions for the build metadata declared in struct test;
// populated at compile time from the build system
const std::string test::build_commit = LLAMA_COMMIT;
const int         test::build_number = LLAMA_BUILD_NUMBER;
1540
1541struct printer {
1542    virtual ~printer() {}
1543
1544    FILE * fout;
1545
1546    virtual void print_header(const cmd_params & params) { (void) params; }
1547
1548    virtual void print_test(const test & t) = 0;
1549
1550    virtual void print_footer() {}
1551};
1552
1553struct csv_printer : public printer {
1554    static std::string escape_csv(const std::string & field) {
1555        std::string escaped = "\"";
1556        for (auto c : field) {
1557            if (c == '"') {
1558                escaped += "\"";
1559            }
1560            escaped += c;
1561        }
1562        escaped += "\"";
1563        return escaped;
1564    }
1565
1566    void print_header(const cmd_params & params) override {
1567        std::vector<std::string> fields = test::get_fields();
1568        fprintf(fout, "%s\n", join(fields, ",").c_str());
1569        (void) params;
1570    }
1571
1572    void print_test(const test & t) override {
1573        std::vector<std::string> values = t.get_values();
1574        std::transform(values.begin(), values.end(), values.begin(), escape_csv);
1575        fprintf(fout, "%s\n", join(values, ",").c_str());
1576    }
1577};
1578
// Escape a string for inclusion in a JSON string literal: '"' and '\' are
// backslash-escaped and control characters (0x00-0x1f) become \u00XX.
//
// Bytes are processed as unsigned: with a signed `char`, any byte >= 0x80
// (i.e. every UTF-8 multi-byte sequence) would compare <= 0x1f and be mangled
// into a truncated "\uffffff.." escape instead of passing through unchanged.
static std::string escape_json(const std::string & value) {
    std::string escaped;
    for (const unsigned char c : value) {
        if (c == '"') {
            escaped += "\\\"";
        } else if (c == '\\') {
            escaped += "\\\\";
        } else if (c <= 0x1f) {
            char buf[8];
            snprintf(buf, sizeof(buf), "\\u%04x", c);
            escaped += buf;
        } else {
            escaped += (char) c;
        }
    }
    return escaped;
}
1596
1597static std::string format_json_value(const std::string & field, const std::string & value) {
1598    switch (test::get_field_type(field)) {
1599        case test::STRING:
1600            return "\"" + escape_json(value) + "\"";
1601        case test::BOOL:
1602            return value == "0" ? "false" : "true";
1603        default:
1604            return value;
1605    }
1606}
1607
1608struct json_printer : public printer {
1609    bool first = true;
1610
1611    void print_header(const cmd_params & params) override {
1612        fprintf(fout, "[\n");
1613        (void) params;
1614    }
1615
1616    void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
1617        assert(fields.size() == values.size());
1618        for (size_t i = 0; i < fields.size(); i++) {
1619            fprintf(fout, "    \"%s\": %s,\n", fields.at(i).c_str(),
1620                    format_json_value(fields.at(i), values.at(i)).c_str());
1621        }
1622    }
1623
1624    void print_test(const test & t) override {
1625        if (first) {
1626            first = false;
1627        } else {
1628            fprintf(fout, ",\n");
1629        }
1630        fprintf(fout, "  {\n");
1631        print_fields(test::get_fields(), t.get_values());
1632        fprintf(fout, "    \"samples_ns\": [ %s ],\n", join(t.samples_ns, ", ").c_str());
1633        fprintf(fout, "    \"samples_ts\": [ %s ]\n", join(t.get_ts(), ", ").c_str());
1634        fprintf(fout, "  }");
1635        fflush(fout);
1636    }
1637
1638    void print_footer() override { fprintf(fout, "\n]\n"); }
1639};
1640
1641struct jsonl_printer : public printer {
1642    void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
1643        assert(fields.size() == values.size());
1644        for (size_t i = 0; i < fields.size(); i++) {
1645            fprintf(fout, "\"%s\": %s, ", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
1646        }
1647    }
1648
1649    void print_test(const test & t) override {
1650        fprintf(fout, "{");
1651        print_fields(test::get_fields(), t.get_values());
1652        fprintf(fout, "\"samples_ns\": [ %s ],", join(t.samples_ns, ", ").c_str());
1653        fprintf(fout, "\"samples_ts\": [ %s ]", join(t.get_ts(), ", ").c_str());
1654        fprintf(fout, "}\n");
1655        fflush(fout);
1656    }
1657};
1658
// Human-readable markdown table output. The set of columns is chosen in
// print_header() based on which parameters differ from their defaults, so the
// table only shows what actually varies across the benchmarked instances.
struct markdown_printer : public printer {
    // columns selected for this run, in display order
    std::vector<std::string> fields;

    // Column width in characters for printf's "%*s"; a negative value means
    // left-aligned (printf convention). Widths are hand-tuned per column.
    static int get_field_width(const std::string & field) {
        if (field == "model") {
            return -30;
        }
        if (field == "t/s") {
            return 20;
        }
        if (field == "size" || field == "params") {
            return 10;
        }
        if (field == "n_gpu_layers") {
            return 3;
        }
        if (field == "n_threads") {
            return 7;
        }
        if (field == "n_batch") {
            return 7;
        }
        if (field == "n_ubatch") {
            return 8;
        }
        if (field == "type_k" || field == "type_v") {
            return 6;
        }
        if (field == "split_mode") {
            return 5;
        }
        if (field == "flash_attn") {
            return 2;
        }
        if (field == "devices") {
            return -12;
        }
        if (field == "use_mmap") {
            return 4;
        }
        if (field == "use_direct_io") {
            return 3;
        }
        if (field == "test") {
            return 15;
        }
        if (field == "no_op_offload") {
            return 4;
        }
        if (field == "no_host") {
            return 4;
        }

        // fallback: at least 10 characters, left-aligned for string fields
        int width = std::max((int) field.length(), 10);

        if (test::get_field_type(field) == test::STRING) {
            return -width;
        }
        return width;
    }

    // Short header label displayed in the table for a given field name.
    static std::string get_field_display_name(const std::string & field) {
        if (field == "n_gpu_layers") {
            return "ngl";
        }
        if (field == "split_mode") {
            return "sm";
        }
        if (field == "n_threads") {
            return "threads";
        }
        if (field == "no_kv_offload") {
            return "nkvo";
        }
        if (field == "flash_attn") {
            return "fa";
        }
        if (field == "use_mmap") {
            return "mmap";
        }
        if (field == "use_direct_io") {
            return "dio";
        }
        if (field == "embeddings") {
            return "embd";
        }
        if (field == "no_op_offload") {
            return "nopo";
        }
        if (field == "no_host") {
            return "noh";
        }
        if (field == "devices") {
            return "dev";
        }
        if (field == "tensor_split") {
            return "ts";
        }
        if (field == "tensor_buft_overrides") {
            return "ot";
        }
        return field;
    }

    // Decide which columns to show (always model/size/params/backend plus any
    // parameter that varies or differs from its default), then print the
    // markdown header row and the alignment separator row.
    void print_header(const cmd_params & params) override {
        // select fields to print
        fields.emplace_back("model");
        fields.emplace_back("size");
        fields.emplace_back("params");
        fields.emplace_back("backend");
        // CPU-only backends have no meaningful ngl column but always show threads
        bool is_cpu_backend = test::get_backend().find("CPU") != std::string::npos ||
                              test::get_backend().find("BLAS") != std::string::npos ||
                              test::get_backend().find("ZenDNN") != std::string::npos;
        if (!is_cpu_backend) {
            fields.emplace_back("n_gpu_layers");
        }
        if (params.n_cpu_moe.size() > 1) {
            fields.emplace_back("n_cpu_moe");
        }
        if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
            fields.emplace_back("n_threads");
        }
        if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) {
            fields.emplace_back("cpu_mask");
        }
        if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) {
            fields.emplace_back("cpu_strict");
        }
        if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) {
            fields.emplace_back("poll");
        }
        if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
            fields.emplace_back("n_batch");
        }
        if (params.n_ubatch.size() > 1 || params.n_ubatch != cmd_params_defaults.n_ubatch) {
            fields.emplace_back("n_ubatch");
        }
        if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) {
            fields.emplace_back("type_k");
        }
        if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
            fields.emplace_back("type_v");
        }
        if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
            fields.emplace_back("main_gpu");
        }
        if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) {
            fields.emplace_back("split_mode");
        }
        if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
            fields.emplace_back("no_kv_offload");
        }
        if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) {
            fields.emplace_back("flash_attn");
        }
        if (params.devices.size() > 1 || params.devices != cmd_params_defaults.devices) {
            fields.emplace_back("devices");
        }
        if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
            fields.emplace_back("tensor_split");
        }
        if (params.tensor_buft_overrides.size() > 1 || !vec_vec_tensor_buft_override_equal(params.tensor_buft_overrides, cmd_params_defaults.tensor_buft_overrides)) {
            fields.emplace_back("tensor_buft_overrides");
        }
        if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
            fields.emplace_back("use_mmap");
        }
        if (params.use_direct_io.size() > 1 || params.use_direct_io != cmd_params_defaults.use_direct_io) {
            fields.emplace_back("use_direct_io");
        }
        if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
            fields.emplace_back("embeddings");
        }
        if (params.no_op_offload.size() > 1 || params.no_op_offload != cmd_params_defaults.no_op_offload) {
            fields.emplace_back("no_op_offload");
        }
        if (params.no_host.size() > 1 || params.no_host != cmd_params_defaults.no_host) {
            fields.emplace_back("no_host");
        }
        fields.emplace_back("test");
        fields.emplace_back("t/s");

        // header row with display names
        fprintf(fout, "|");
        for (const auto & field : fields) {
            fprintf(fout, " %*s |", get_field_width(field), get_field_display_name(field).c_str());
        }
        fprintf(fout, "\n");
        // separator row; trailing ':' marks right-aligned numeric columns
        fprintf(fout, "|");
        for (const auto & field : fields) {
            int width = get_field_width(field);
            fprintf(fout, " %s%s |", std::string(std::abs(width) - 1, '-').c_str(), width > 0 ? ":" : "-");
        }
        fprintf(fout, "\n");
    }

    // Print one table row; synthesized columns (size, params, test, t/s) are
    // formatted here, everything else comes from the field->value map.
    void print_test(const test & t) override {
        std::map<std::string, std::string> vmap = t.get_map();

        fprintf(fout, "|");
        for (const auto & field : fields) {
            std::string value;
            char        buf[128];
            if (field == "model") {
                value = t.model_type;
            } else if (field == "size") {
                if (t.model_size < 1024 * 1024 * 1024) {
                    snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0);
                } else {
                    snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0);
                }
                value = buf;
            } else if (field == "params") {
                if (t.model_n_params < 1000 * 1000 * 1000) {
                    snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6);
                } else {
                    snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9);
                }
                value = buf;
            } else if (field == "backend") {
                value = test::get_backend();
            } else if (field == "test") {
                // test label: ppN (prompt), tgN (generation), or ppN+tgN (mixed)
                if (t.n_prompt > 0 && t.n_gen == 0) {
                    snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
                } else if (t.n_gen > 0 && t.n_prompt == 0) {
                    snprintf(buf, sizeof(buf), "tg%d", t.n_gen);
                } else {
                    snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
                }
                if (t.n_depth > 0) {
                    int len = strlen(buf);
                    snprintf(buf + len, sizeof(buf) - len, " @ d%d", t.n_depth);
                }
                value = buf;
            } else if (field == "t/s") {
                snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
                value = buf;
            } else if (vmap.find(field) != vmap.end()) {
                value = vmap.at(field);
            } else {
                // every selected field must be printable; this is a programmer error
                assert(false);
                exit(1);
            }

            int width = get_field_width(field);
            if (field == "t/s") {
                // HACK: the utf-8 character is 2 bytes
                width += 1;
            }
            fprintf(fout, " %*s |", width, value.c_str());
        }
        fprintf(fout, "\n");
    }

    void print_footer() override {
        fprintf(fout, "\nbuild: %s (%d)\n", test::build_commit.c_str(), test::build_number);
    }
};
1916
1917struct sql_printer : public printer {
1918    static std::string get_sql_field_type(const std::string & field) {
1919        switch (test::get_field_type(field)) {
1920            case test::STRING:
1921                return "TEXT";
1922            case test::BOOL:
1923            case test::INT:
1924                return "INTEGER";
1925            case test::FLOAT:
1926                return "REAL";
1927            default:
1928                assert(false);
1929                exit(1);
1930        }
1931    }
1932
1933    void print_header(const cmd_params & params) override {
1934        std::vector<std::string> fields = test::get_fields();
1935        fprintf(fout, "CREATE TABLE IF NOT EXISTS llama_bench (\n");
1936        for (size_t i = 0; i < fields.size(); i++) {
1937            fprintf(fout, "  %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(),
1938                    i < fields.size() - 1 ? "," : "");
1939        }
1940        fprintf(fout, ");\n");
1941        fprintf(fout, "\n");
1942        (void) params;
1943    }
1944
1945    void print_test(const test & t) override {
1946        fprintf(fout, "INSERT INTO llama_bench (%s) ", join(test::get_fields(), ", ").c_str());
1947        fprintf(fout, "VALUES (");
1948        std::vector<std::string> values = t.get_values();
1949        for (size_t i = 0; i < values.size(); i++) {
1950            fprintf(fout, "'%s'%s", values.at(i).c_str(), i < values.size() - 1 ? ", " : "");
1951        }
1952        fprintf(fout, ");\n");
1953    }
1954};
1955
// Cached llama_context state, used to avoid re-processing the same prompt
// prefix when consecutive benchmark repetitions run at the same depth.
struct ctx_state {
    int depth = 0; // in tokens

    std::vector<uint8_t> buf; // the llama_context state buffer
};
1961
1962static bool test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) {
1963    llama_set_n_threads(ctx, n_threads, n_threads);
1964
1965    const llama_model * model   = llama_get_model(ctx);
1966    const llama_vocab * vocab   = llama_model_get_vocab(model);
1967    const int32_t       n_vocab = llama_vocab_n_tokens(vocab);
1968
1969    std::vector<llama_token> tokens(n_batch);
1970
1971    int n_processed = 0;
1972
1973    while (n_processed < n_prompt) {
1974        int n_tokens = std::min(n_prompt - n_processed, n_batch);
1975        tokens[0]    = n_processed == 0 && llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab;
1976        for (int i = 1; i < n_tokens; i++) {
1977            tokens[i] = std::rand() % n_vocab;
1978        }
1979        int res = llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens));
1980        if (res != 0) {
1981            fprintf(stderr, "%s: failed to decode prompt batch, res = %d\n", __func__, res);
1982            return false;
1983        }
1984        n_processed += n_tokens;
1985    }
1986
1987    llama_synchronize(ctx);
1988    return true;
1989}
1990
1991static bool test_gen(llama_context * ctx, int n_gen, int n_threads) {
1992    llama_set_n_threads(ctx, n_threads, n_threads);
1993
1994    const llama_model * model   = llama_get_model(ctx);
1995    const llama_vocab * vocab   = llama_model_get_vocab(model);
1996    const int32_t       n_vocab = llama_vocab_n_tokens(vocab);
1997
1998    llama_token token = llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab;
1999
2000    for (int i = 0; i < n_gen; i++) {
2001        int res = llama_decode(ctx, llama_batch_get_one(&token, 1));
2002        if (res != 0) {
2003            fprintf(stderr, "%s: failed to decode generation batch, res = %d\n", __func__, res);
2004            return false;
2005        }
2006        llama_synchronize(ctx);
2007        token = std::rand() % n_vocab;
2008    }
2009    return true;
2010}
2011
2012static void llama_null_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
2013    (void) level;
2014    (void) text;
2015    (void) user_data;
2016}
2017
2018static std::unique_ptr<printer> create_printer(output_formats format) {
2019    switch (format) {
2020        case NONE:
2021            return nullptr;
2022        case CSV:
2023            return std::unique_ptr<printer>(new csv_printer());
2024        case JSON:
2025            return std::unique_ptr<printer>(new json_printer());
2026        case JSONL:
2027            return std::unique_ptr<printer>(new jsonl_printer());
2028        case MARKDOWN:
2029            return std::unique_ptr<printer>(new markdown_printer());
2030        case SQL:
2031            return std::unique_ptr<printer>(new sql_printer());
2032    }
2033    GGML_ABORT("fatal error");
2034}
2035
2036int main(int argc, char ** argv) {
2037    // try to set locale for unicode characters in markdown
2038    setlocale(LC_CTYPE, ".UTF-8");
2039
2040#if !defined(NDEBUG)
2041    fprintf(stderr, "warning: asserts enabled, performance may be affected\n");
2042#endif
2043
2044#if (defined(_MSC_VER) && defined(_DEBUG)) || (!defined(_MSC_VER) && !defined(__OPTIMIZE__))
2045    fprintf(stderr, "warning: debug build, performance may be affected\n");
2046#endif
2047
2048#if defined(__SANITIZE_ADDRESS__) || defined(__SANITIZE_THREAD__)
2049    fprintf(stderr, "warning: sanitizer enabled, performance may be affected\n");
2050#endif
2051
2052    // initialize backends
2053    ggml_backend_load_all();
2054
2055    cmd_params params = parse_cmd_params(argc, argv);
2056
2057    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
2058    if (!cpu_dev) {
2059        fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);
2060        return 1;
2061    }
2062    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
2063    auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new");
2064    auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free");
2065
2066    // initialize llama.cpp
2067    if (!params.verbose) {
2068        llama_log_set(llama_null_log_callback, NULL);
2069    }
2070    llama_backend_init();
2071    llama_numa_init(params.numa);
2072
2073    if (!set_process_priority(params.prio)) {
2074        fprintf(stderr, "%s: error: failed to set process priority\n", __func__);
2075        return 1;
2076    }
2077
2078    // initialize printer
2079    std::unique_ptr<printer> p     = create_printer(params.output_format);
2080    std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
2081
2082    if (p) {
2083        p->fout = stdout;
2084        p->print_header(params);
2085    }
2086
2087    if (p_err) {
2088        p_err->fout = stderr;
2089        p_err->print_header(params);
2090    }
2091
2092    std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);
2093
2094    llama_model *               lmodel    = nullptr;
2095    const cmd_params_instance * prev_inst = nullptr;
2096
2097    // store the llama_context state at the previous depth that we performed a test
2098    // ref: https://github.com/ggml-org/llama.cpp/pull/16944#issuecomment-3478151721
2099    ctx_state cstate;
2100
2101    int  params_idx   = 0;
2102    auto params_count = params_instances.size();
2103    for (const auto & inst : params_instances) {
2104        params_idx++;
2105        if (params.progress) {
2106            fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count);
2107        }
2108        // keep the same model between tests when possible
2109        if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
2110            if (lmodel) {
2111                llama_model_free(lmodel);
2112            }
2113
2114            lmodel = llama_model_load_from_file(inst.model.c_str(), inst.to_llama_mparams());
2115            if (lmodel == NULL) {
2116                fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
2117                return 1;
2118            }
2119            prev_inst = &inst;
2120        }
2121
2122        llama_context * ctx = llama_init_from_model(lmodel, inst.to_llama_cparams());
2123        if (ctx == NULL) {
2124            fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
2125            llama_model_free(lmodel);
2126            return 1;
2127        }
2128
2129        test t(inst, lmodel, ctx);
2130
2131        llama_memory_clear(llama_get_memory(ctx), false);
2132
2133        // cool off before the test
2134        if (params.delay) {
2135            std::this_thread::sleep_for(std::chrono::seconds(params.delay));
2136        }
2137
2138        struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
2139        if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
2140            fprintf(stderr, "%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
2141            llama_free(ctx);
2142            llama_model_free(lmodel);
2143            exit(1);
2144        }
2145        tpp.strict_cpu = t.cpu_strict;
2146        tpp.poll       = t.poll;
2147        tpp.prio       = params.prio;
2148
2149        struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
2150        if (!threadpool) {
2151            fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
2152            llama_free(ctx);
2153            llama_model_free(lmodel);
2154            exit(1);
2155        }
2156
2157        llama_attach_threadpool(ctx, threadpool, NULL);
2158
2159        // warmup run
2160        if (!params.no_warmup) {
2161            if (t.n_prompt > 0) {
2162                if (params.progress) {
2163                    fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count);
2164                }
                //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
                // full-length prompt warmup; any failure is fatal, so release the
                // context and model before exiting to avoid leaking backend memory
                bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
                if (!res) {
                    fprintf(stderr, "%s: error: failed to run prompt warmup\n", __func__);
                    llama_free(ctx);
                    llama_model_free(lmodel);
                    exit(1);
                }
            }
            if (t.n_gen > 0) {
                if (params.progress) {
                    fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count);
                }
                // generation warmup: a single token is enough to prime the graph/kernels
                bool res = test_gen(ctx, 1, t.n_threads);
                if (!res) {
                    fprintf(stderr, "%s: error: failed to run gen warmup\n", __func__);
                    llama_free(ctx);
                    llama_model_free(lmodel);
                    exit(1);
                }
            }
        }

        // timed repetitions for this test configuration; each rep's wall time
        // (prompt + gen, excluding any depth prefill) is collected in t.samples_ns
        for (int i = 0; i < params.reps; i++) {
            // reset the memory/KV state so reps don't contaminate each other
            // (second arg false: presumably a non-destructive clear that keeps
            // allocated buffers — confirm against llama.h)
            llama_memory_clear(llama_get_memory(ctx), false);

            if (t.n_depth > 0) {
                // benchmark at a given KV-cache depth: prefill n_depth tokens first.
                // cstate caches the context state across reps so the prefill is
                // only computed once per (context, depth) combination.
                bool is_cached = t.n_depth == cstate.depth;

                if (is_cached) {
                    // if previously we have computed at this depth, just restore the state
                    const size_t ret = llama_state_seq_set_data(ctx, cstate.buf.data(), cstate.buf.size(), 0);
                    if (ret == 0) {
                        // if the old state is incompatible with the current context - reprocess from scratch
                        is_cached = false;
                    }
                }

                if (!is_cached) {
                    if (params.progress) {
                        fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d\n", params_idx, params_count,
                                i + 1, params.reps);
                    }
                    // prefill n_depth tokens; not part of the timed section below
                    bool res = test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads);
                    if (!res) {
                        fprintf(stderr, "%s: error: failed to run depth\n", __func__);
                        llama_free(ctx);
                        llama_model_free(lmodel);
                        exit(1);
                    }

                    // store the context state for reuse in later runs
                    cstate.depth = t.n_depth;
                    cstate.buf.resize(llama_state_seq_get_size(ctx, 0));
                    // NOTE(review): return value (bytes written, 0 on failure) is not
                    // checked here — a failed save would only surface as a failed
                    // restore on the next rep; consider verifying it
                    llama_state_seq_get_data(ctx, cstate.buf.data(), cstate.buf.size(), 0);
                } else {
                    if (params.progress) {
                        fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d (cached)\n", params_idx, params_count,
                                i + 1, params.reps);
                    }
                }
            }

            // timed section starts here: depth prefill/restore above is excluded
            uint64_t t_start = get_time_ns();

            if (t.n_prompt > 0) {
                if (params.progress) {
                    fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count,
                            i + 1, params.reps);
                }
                bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
                if (!res) {
                    fprintf(stderr, "%s: error: failed to run prompt\n", __func__);
                    llama_free(ctx);
                    llama_model_free(lmodel);
                    exit(1);
                }
            }
            if (t.n_gen > 0) {
                if (params.progress) {
                    fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count,
                            i + 1, params.reps);
                }
                bool res = test_gen(ctx, t.n_gen, t.n_threads);
                if (!res) {
                    fprintf(stderr, "%s: error: failed to run gen\n", __func__);
                    llama_free(ctx);
                    llama_model_free(lmodel);
                    exit(1);
                }
            }

            // record this rep's wall-clock time
            uint64_t t_ns = get_time_ns() - t_start;
            t.samples_ns.push_back(t_ns);
        }

        // emit this test's row via the primary printer (if any); flush so output
        // is visible even if a later test aborts the process
        if (p) {
            p->print_test(t);
            fflush(p->fout);
        }

        // optional secondary printer (presumably stderr output — set up earlier,
        // outside this view)
        if (p_err) {
            p_err->print_test(t);
            fflush(p_err->fout);
        }

        llama_perf_context_print(ctx);

        // per-test teardown: context first, then its threadpool
        llama_free(ctx);

        ggml_threadpool_free_fn(threadpool);
    }

    // model is shared across all test configurations; freed once after the loop
    llama_model_free(lmodel);

    if (p) {
        p->print_footer();
    }

    if (p_err) {
        p_err->print_footer();
    }

    llama_backend_free();

    return 0;
}