#include <algorithm>
#include <array>
#include <cassert>
#include <chrono>
#include <cinttypes>
#include <clocale>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <iterator>
#include <map>
#include <numeric>
#include <regex>
#include <sstream>
#include <string>
#include <thread>
#include <vector>
#include <unordered_set>

#include "common.h"
#include "ggml.h"
#include "llama.h"

#ifdef _WIN32
#    define WIN32_LEAN_AND_MEAN
#    ifndef NOMINMAX
#        define NOMINMAX
#    endif
#    include <windows.h>
#endif

// utils
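// wall-clock timestamp in nanoseconds; high_resolution_clock has an
// unspecified epoch, so these values are only meaningful as differences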
static uint64_t get_time_ns() {
    using clock = std::chrono::high_resolution_clock;
    return std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
}

static bool tensor_buft_override_equal(const llama_model_tensor_buft_override& a, const llama_model_tensor_buft_override& b) {
    if (a.pattern != b.pattern) {
        // C-string comparison; either pattern may be null
        if (a.pattern == nullptr || b.pattern == nullptr) {
            return false;
        }
        if (strcmp(a.pattern, b.pattern) != 0) {
            return false;
        }
    }
    if (a.buft != b.buft) {
        return false;
    }
    return true;
}

static bool vec_tensor_buft_override_equal(const std::vector<llama_model_tensor_buft_override>& a, const std::vector<llama_model_tensor_buft_override>& b) {
    if (a.size() != b.size()) {
        return false;
    }
    for (size_t i = 0; i < a.size(); i++) {
        if (!tensor_buft_override_equal(a[i], b[i])) {
            return false;
        }
    }
    return true;
}

static bool vec_vec_tensor_buft_override_equal(const std::vector<std::vector<llama_model_tensor_buft_override>>& a, const std::vector<std::vector<llama_model_tensor_buft_override>>& b) {
    if (a.size() != b.size()) {
        return false;
    }
    for (size_t i = 0; i < a.size(); i++) {
        if (!vec_tensor_buft_override_equal(a[i], b[i])) {
            return false;
        }
    }
    return true;
}

template <class T> static std::string join(const std::vector<T> & values, const std::string & delim) {
    std::ostringstream str;
    for (size_t i = 0; i < values.size(); i++) {
        str << values[i];
        if (i < values.size() - 1) {
            str << delim;
        }
    }
    return str.str();
}

template <typename T, typename F> static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
    std::vector<std::string> str_values;
    std::transform(values.begin(), values.end(), std::back_inserter(str_values), f);
    return str_values;
}

template <typename T> static T avg(const std::vector<T> & v) {
    if (v.empty()) {
        return 0;
    }
    T sum = std::accumulate(v.begin(), v.end(), T(0));
    return sum / (T) v.size();
}

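// Bessel-corrected sample standard deviation computed in one pass from the sum
// of squares: variance = (sum(x^2) - n * mean^2) / (n - 1).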
template <typename T> static T stdev(const std::vector<T> & v) {
    if (v.size() <= 1) {
        return 0;
    }
    T mean = avg(v);
    T sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), T(0));
    T stdev = std::sqrt(sq_sum / (T) (v.size() - 1) - mean * mean * (T) v.size() / (T) (v.size() - 1));
    return stdev;
}

static std::string get_cpu_info() {
    std::vector<std::string> cpu_list;
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        auto * dev = ggml_backend_dev_get(i);
        auto dev_type = ggml_backend_dev_type(dev);
        if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
            cpu_list.push_back(ggml_backend_dev_description(dev));
        }
    }
    return join(cpu_list, ", ");
}

static std::string get_gpu_info() {
    std::vector<std::string> gpu_list;
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        auto * dev = ggml_backend_dev_get(i);
        auto dev_type = ggml_backend_dev_type(dev);
        if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU || dev_type == GGML_BACKEND_DEVICE_TYPE_IGPU) {
            gpu_list.push_back(ggml_backend_dev_description(dev));
        }
    }
    return join(gpu_list, ", ");
}

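// parses the -dev/--device argument: "auto" yields an empty vector (backend
// default), "none" yields a single nullptr entry, and otherwise a '/'-separated
// list of device names is resolved and terminated with a nullptr sentinel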
static std::vector<ggml_backend_dev_t> parse_devices_arg(const std::string & value) {
    std::vector<ggml_backend_dev_t> devices;
    std::string trimmed = string_strip(value);
    if (trimmed.empty()) {
        throw std::invalid_argument("no devices specified");
    }
    if (trimmed == "auto") {
        return devices;
    }

    auto dev_names = string_split<std::string>(trimmed, '/');
    if (dev_names.size() == 1 && string_strip(dev_names[0]) == "none") {
        devices.push_back(nullptr);
        return devices;
    }

    for (auto & name : dev_names) {
        std::string dev_name = string_strip(name);
        if (dev_name.empty()) {
            throw std::invalid_argument("invalid device specification");
        }
        auto * dev = ggml_backend_dev_by_name(dev_name.c_str());
        if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
            throw std::invalid_argument(string_format("invalid device: %s", dev_name.c_str()));
        }
        devices.push_back(dev);
    }

    devices.push_back(nullptr);
    return devices;
}

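// registers RPC server endpoints by looking up ggml_backend_rpc_add_server via
// the registry's proc-address mechanism, which avoids a hard link-time
// dependency on the RPC backend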
static void register_rpc_server_list(const std::string & servers) {
    auto rpc_servers = string_split<std::string>(servers, ',');
    if (rpc_servers.empty()) {
        throw std::invalid_argument("no RPC servers specified");
    }

    auto * rpc_reg = ggml_backend_reg_by_name("RPC");
    if (!rpc_reg) {
        throw std::invalid_argument("failed to find RPC backend");
    }

    using add_rpc_server_fn = ggml_backend_reg_t (*)(const char * endpoint);
    auto * ggml_backend_rpc_add_server_fn = (add_rpc_server_fn) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_server");
    if (!ggml_backend_rpc_add_server_fn) {
        throw std::invalid_argument("failed to find RPC add server function");
    }
    for (const auto & server : rpc_servers) {
        auto reg = ggml_backend_rpc_add_server_fn(server.c_str());
        ggml_backend_register(reg);
    }
}

static std::string devices_to_string(const std::vector<ggml_backend_dev_t> & devices) {
    if (devices.empty()) {
        return "auto";
    }

    if (devices.size() == 1 && devices[0] == nullptr) {
        return "none";
    }

    std::vector<std::string> names;
    for (auto * dev : devices) {
        if (dev == nullptr) {
            break;
        }
        names.push_back(ggml_backend_dev_name(dev));
    }

    return join(names, "/");
}

// command line params
enum output_formats { NONE, CSV, JSON, JSONL, MARKDOWN, SQL };

static const char * output_format_str(output_formats format) {
    switch (format) {
        case NONE:
            return "none";
        case CSV:
            return "csv";
        case JSON:
            return "json";
        case JSONL:
            return "jsonl";
        case MARKDOWN:
            return "md";
        case SQL:
            return "sql";
        default:
            GGML_ABORT("invalid output format");
    }
}

static bool output_format_from_str(const std::string & s, output_formats & format) {
    if (s == "none") {
        format = NONE;
    } else if (s == "csv") {
        format = CSV;
    } else if (s == "json") {
        format = JSON;
    } else if (s == "jsonl") {
        format = JSONL;
    } else if (s == "md") {
        format = MARKDOWN;
    } else if (s == "sql") {
        format = SQL;
    } else {
        return false;
    }
    return true;
}

static const char * split_mode_str(llama_split_mode mode) {
    switch (mode) {
        case LLAMA_SPLIT_MODE_NONE:
            return "none";
        case LLAMA_SPLIT_MODE_LAYER:
            return "layer";
        case LLAMA_SPLIT_MODE_ROW:
            return "row";
        default:
            GGML_ABORT("invalid split mode");
    }
}

static std::string pair_str(const std::pair<int, int> & p) {
    static char buf[32];
    snprintf(buf, sizeof(buf), "%d,%d", p.first, p.second);
    return buf;
}

static std::vector<int> parse_int_range(const std::string & s) {
    // first[-last[(+|*)step]]
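    // e.g. "128" -> {128}; "1-4" -> {1, 2, 3, 4}; "32-256*2" -> {32, 64, 128, 256}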
    std::regex range_regex(R"(^(\d+)(?:-(\d+)(?:([+*])(\d+))?)?(?:,|$))");

    std::smatch match;
    std::string::const_iterator search_start(s.cbegin());
    std::vector<int> result;
    while (std::regex_search(search_start, s.cend(), match, range_regex)) {
        int first = std::stoi(match[1]);
        int last = match[2].matched ? std::stoi(match[2]) : first;
        char op = match[3].matched ? match[3].str()[0] : '+';
        int step = match[4].matched ? std::stoi(match[4]) : 1;

        for (int i = first; i <= last;) {
            result.push_back(i);

            int prev_i = i;

            if (op == '+') {
                i += step;
            } else if (op == '*') {
                i *= step;
            } else {
                throw std::invalid_argument("invalid range format");
            }

            if (i <= prev_i) {
                throw std::invalid_argument("invalid range");
            }
        }
        search_start = match.suffix().first;
    }

    if (search_start != s.cend()) {
        throw std::invalid_argument("invalid range format");
    }

    return result;
}

struct cmd_params {
    std::vector<std::string> model;
    std::vector<int> n_prompt;
    std::vector<int> n_gen;
    std::vector<std::pair<int, int>> n_pg;
    std::vector<int> n_depth;
    std::vector<int> n_batch;
    std::vector<int> n_ubatch;
    std::vector<ggml_type> type_k;
    std::vector<ggml_type> type_v;
    std::vector<int> n_threads;
    std::vector<std::string> cpu_mask;
    std::vector<bool> cpu_strict;
    std::vector<int> poll;
    std::vector<int> n_gpu_layers;
    std::vector<int> n_cpu_moe;
    std::vector<llama_split_mode> split_mode;
    std::vector<int> main_gpu;
    std::vector<bool> no_kv_offload;
    std::vector<bool> flash_attn;
    std::vector<std::vector<ggml_backend_dev_t>> devices;
    std::vector<std::vector<float>> tensor_split;
    std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
    std::vector<bool> use_mmap;
    std::vector<bool> use_direct_io;
    std::vector<bool> embeddings;
    std::vector<bool> no_op_offload;
    std::vector<bool> no_host;
    ggml_numa_strategy numa;
    int reps;
    ggml_sched_priority prio;
    int delay;
    bool verbose;
    bool progress;
    bool no_warmup;
    output_formats output_format;
    output_formats output_format_stderr;
};
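// each vector field in cmd_params holds the set of values to benchmark for
// that parameter; get_cmd_params_instances() below expands their cross product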

static const cmd_params cmd_params_defaults = {
    /* model                */ { "models/7B/ggml-model-q4_0.gguf" },
    /* n_prompt             */ { 512 },
    /* n_gen                */ { 128 },
    /* n_pg                 */ {},
    /* n_depth              */ { 0 },
    /* n_batch              */ { 2048 },
    /* n_ubatch             */ { 512 },
    /* type_k               */ { GGML_TYPE_F16 },
    /* type_v               */ { GGML_TYPE_F16 },
    /* n_threads            */ { cpu_get_num_math() },
    /* cpu_mask             */ { "0x0" },
    /* cpu_strict           */ { false },
    /* poll                 */ { 50 },
    /* n_gpu_layers         */ { 99 },
    /* n_cpu_moe            */ { 0 },
    /* split_mode           */ { LLAMA_SPLIT_MODE_LAYER },
    /* main_gpu             */ { 0 },
    /* no_kv_offload        */ { false },
    /* flash_attn           */ { false },
    /* devices              */ { {} },
    /* tensor_split         */ { std::vector<float>(llama_max_devices(), 0.0f) },
    /* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
    /* use_mmap             */ { false },
    /* use_direct_io        */ { false },
    /* embeddings           */ { false },
    /* no_op_offload        */ { false },
    /* no_host              */ { false },
    /* numa                 */ GGML_NUMA_STRATEGY_DISABLED,
    /* reps                 */ 5,
    /* prio                 */ GGML_SCHED_PRIO_NORMAL,
    /* delay                */ 0,
    /* verbose              */ false,
    /* progress             */ false,
    /* no_warmup            */ false,
    /* output_format        */ MARKDOWN,
    /* output_format_stderr */ NONE,
};

static void print_usage(int /* argc */, char ** argv) {
    printf("usage: %s [options]\n", argv[0]);
    printf("\n");
    printf("options:\n");
    printf("  -h, --help\n");
    printf("  --numa <distribute|isolate|numactl>       numa mode (default: disabled)\n");
    printf("  -r, --repetitions <n>                     number of times to repeat each test (default: %d)\n",
           cmd_params_defaults.reps);
    printf("  --prio <-1|0|1|2|3>                       process/thread priority (default: %d)\n",
           cmd_params_defaults.prio);
    printf("  --delay <0...N> (seconds)                 delay between each test (default: %d)\n",
           cmd_params_defaults.delay);
    printf("  -o, --output <csv|json|jsonl|md|sql>      output format printed to stdout (default: %s)\n",
           output_format_str(cmd_params_defaults.output_format));
    printf("  -oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: %s)\n",
           output_format_str(cmd_params_defaults.output_format_stderr));
    printf("  --list-devices                            list available devices and exit\n");
    printf("  -v, --verbose                             verbose output\n");
    printf("  --progress                                print test progress indicators\n");
    printf("  --no-warmup                               skip warmup runs before benchmarking\n");
    if (llama_supports_rpc()) {
        printf("  -rpc, --rpc <rpc_servers>                 register RPC devices (comma separated)\n");
    }
    printf("\n");
    printf("test parameters:\n");
    printf("  -m, --model <filename>                    (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
    printf("  -p, --n-prompt <n>                        (default: %s)\n",
           join(cmd_params_defaults.n_prompt, ",").c_str());
    printf("  -n, --n-gen <n>                           (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
    printf("  -pg <pp,tg>                               (default: %s)\n",
           join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
    printf("  -d, --n-depth <n>                         (default: %s)\n",
           join(cmd_params_defaults.n_depth, ",").c_str());
    printf("  -b, --batch-size <n>                      (default: %s)\n",
           join(cmd_params_defaults.n_batch, ",").c_str());
    printf("  -ub, --ubatch-size <n>                    (default: %s)\n",
           join(cmd_params_defaults.n_ubatch, ",").c_str());
    printf("  -ctk, --cache-type-k <t>                  (default: %s)\n",
           join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
    printf("  -ctv, --cache-type-v <t>                  (default: %s)\n",
           join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
    printf("  -t, --threads <n>                         (default: %s)\n",
           join(cmd_params_defaults.n_threads, ",").c_str());
    printf("  -C, --cpu-mask <hex,hex>                  (default: %s)\n",
           join(cmd_params_defaults.cpu_mask, ",").c_str());
    printf("  --cpu-strict <0|1>                        (default: %s)\n",
           join(cmd_params_defaults.cpu_strict, ",").c_str());
    printf("  --poll <0...100>                          (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
    printf("  -ngl, --n-gpu-layers <n>                  (default: %s)\n",
           join(cmd_params_defaults.n_gpu_layers, ",").c_str());
    printf("  -ncmoe, --n-cpu-moe <n>                   (default: %s)\n",
           join(cmd_params_defaults.n_cpu_moe, ",").c_str());
    printf("  -sm, --split-mode <none|layer|row>        (default: %s)\n",
           join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
    printf("  -mg, --main-gpu <i>                       (default: %s)\n",
           join(cmd_params_defaults.main_gpu, ",").c_str());
    printf("  -nkvo, --no-kv-offload <0|1>              (default: %s)\n",
           join(cmd_params_defaults.no_kv_offload, ",").c_str());
    printf("  -fa, --flash-attn <0|1>                   (default: %s)\n",
           join(cmd_params_defaults.flash_attn, ",").c_str());
    printf("  -dev, --device <dev0/dev1/...>            (default: auto)\n");
    printf("  -mmp, --mmap <0|1>                        (default: %s)\n",
           join(cmd_params_defaults.use_mmap, ",").c_str());
    printf("  -dio, --direct-io <0|1>                   (default: %s)\n",
           join(cmd_params_defaults.use_direct_io, ",").c_str());
    printf("  -embd, --embeddings <0|1>                 (default: %s)\n",
           join(cmd_params_defaults.embeddings, ",").c_str());
    printf("  -ts, --tensor-split <ts0/ts1/..>          (default: 0)\n");
    printf("  -ot --override-tensor <tensor name pattern>=<buffer type>;...\n");
    printf("                                            (default: disabled)\n");
    printf("  -nopo, --no-op-offload <0|1>              (default: 0)\n");
    printf("  --no-host <0|1>                           (default: %s)\n",
           join(cmd_params_defaults.no_host, ",").c_str());
    printf("\n");
    printf(
        "Multiple values can be given for each parameter by separating them with ','\n"
        "or by specifying the parameter multiple times. Ranges can be given as\n"
        "'first-last' or 'first-last+step' or 'first-last*mult'.\n");
}

static ggml_type ggml_type_from_name(const std::string & s) {
    if (s == "f16") {
        return GGML_TYPE_F16;
    }
    if (s == "bf16") {
        return GGML_TYPE_BF16;
    }
    if (s == "q8_0") {
        return GGML_TYPE_Q8_0;
    }
    if (s == "q4_0") {
        return GGML_TYPE_Q4_0;
    }
    if (s == "q4_1") {
        return GGML_TYPE_Q4_1;
    }
    if (s == "q5_0") {
        return GGML_TYPE_Q5_0;
    }
    if (s == "q5_1") {
        return GGML_TYPE_Q5_1;
    }
    if (s == "iq4_nl") {
        return GGML_TYPE_IQ4_NL;
    }

    return GGML_TYPE_COUNT;
}

static cmd_params parse_cmd_params(int argc, char ** argv) {
    cmd_params params;
    std::string arg;
    bool invalid_param = false;
    const std::string arg_prefix = "--";
    const char split_delim = ',';

    params.verbose = cmd_params_defaults.verbose;
    params.output_format = cmd_params_defaults.output_format;
    params.output_format_stderr = cmd_params_defaults.output_format_stderr;
    params.reps = cmd_params_defaults.reps;
    params.numa = cmd_params_defaults.numa;
    params.prio = cmd_params_defaults.prio;
    params.delay = cmd_params_defaults.delay;
    params.progress = cmd_params_defaults.progress;
    params.no_warmup = cmd_params_defaults.no_warmup;

    for (int i = 1; i < argc; i++) {
        arg = argv[i];
        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
            std::replace(arg.begin(), arg.end(), '_', '-');
        }

        try {
            if (arg == "-h" || arg == "--help") {
                print_usage(argc, argv);
                exit(0);
            } else if (arg == "-m" || arg == "--model") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = string_split<std::string>(argv[i], split_delim);
                params.model.insert(params.model.end(), p.begin(), p.end());
            } else if (arg == "-p" || arg == "--n-prompt") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = parse_int_range(argv[i]);
                params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
            } else if (arg == "-n" || arg == "--n-gen") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = parse_int_range(argv[i]);
                params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
            } else if (arg == "-pg") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = string_split<std::string>(argv[i], ',');
                if (p.size() != 2) {
                    invalid_param = true;
                    break;
                }
                params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
            } else if (arg == "-d" || arg == "--n-depth") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = parse_int_range(argv[i]);
                params.n_depth.insert(params.n_depth.end(), p.begin(), p.end());
            } else if (arg == "-b" || arg == "--batch-size") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = parse_int_range(argv[i]);
                params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
            } else if (arg == "-ub" || arg == "--ubatch-size") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = parse_int_range(argv[i]);
                params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
            } else if (arg == "-ctk" || arg == "--cache-type-k") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = string_split<std::string>(argv[i], split_delim);

                std::vector<ggml_type> types;
                for (const auto & t : p) {
                    ggml_type gt = ggml_type_from_name(t);
                    if (gt == GGML_TYPE_COUNT) {
                        invalid_param = true;
                        break;
                    }
                    types.push_back(gt);
                }
                if (invalid_param) {
                    break;
                }
                params.type_k.insert(params.type_k.end(), types.begin(), types.end());
            } else if (arg == "-ctv" || arg == "--cache-type-v") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = string_split<std::string>(argv[i], split_delim);

                std::vector<ggml_type> types;
                for (const auto & t : p) {
                    ggml_type gt = ggml_type_from_name(t);
                    if (gt == GGML_TYPE_COUNT) {
                        invalid_param = true;
                        break;
                    }
                    types.push_back(gt);
                }
                if (invalid_param) {
                    break;
                }
                params.type_v.insert(params.type_v.end(), types.begin(), types.end());
            } else if (arg == "-dev" || arg == "--device") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto combos = string_split<std::string>(argv[i], split_delim);
                for (const auto & combo : combos) {
                    try {
                        params.devices.push_back(parse_devices_arg(combo));
                    } catch (const std::exception & e) {
                        fprintf(stderr, "error: %s\n", e.what());
                        invalid_param = true;
                        break;
                    }
                }
                if (invalid_param) {
                    break;
                }
            } else if (arg == "--list-devices") {
                std::vector<ggml_backend_dev_t> devices;
                for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
                    auto * dev = ggml_backend_dev_get(i);
                    if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
                        devices.push_back(dev);
                    }
                }
                printf("Available devices:\n");
                if (devices.empty()) {
                    printf("  (none)\n");
                }
                for (auto * dev : devices) {
                    size_t free, total;
                    ggml_backend_dev_memory(dev, &free, &total);
                    printf("  %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
                }
                exit(0);
            } else if (arg == "-t" || arg == "--threads") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = parse_int_range(argv[i]);
                params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
            } else if (arg == "-C" || arg == "--cpu-mask") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = string_split<std::string>(argv[i], split_delim);
                params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
            } else if (arg == "--cpu-strict") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = string_split<bool>(argv[i], split_delim);
                params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
            } else if (arg == "--poll") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = parse_int_range(argv[i]);
                params.poll.insert(params.poll.end(), p.begin(), p.end());
            } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = parse_int_range(argv[i]);
                params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
            } else if (arg == "-ncmoe" || arg == "--n-cpu-moe") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = parse_int_range(argv[i]);
                params.n_cpu_moe.insert(params.n_cpu_moe.end(), p.begin(), p.end());
            } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                try {
                    register_rpc_server_list(argv[i]);
                } catch (const std::exception & e) {
                    fprintf(stderr, "error: %s\n", e.what());
                    invalid_param = true;
                    break;
                }
            } else if (arg == "-sm" || arg == "--split-mode") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = string_split<std::string>(argv[i], split_delim);

                std::vector<llama_split_mode> modes;
                for (const auto & m : p) {
                    llama_split_mode mode;
                    if (m == "none") {
                        mode = LLAMA_SPLIT_MODE_NONE;
                    } else if (m == "layer") {
                        mode = LLAMA_SPLIT_MODE_LAYER;
                    } else if (m == "row") {
                        mode = LLAMA_SPLIT_MODE_ROW;
                    } else {
                        invalid_param = true;
                        break;
                    }
                    modes.push_back(mode);
                }
                if (invalid_param) {
                    break;
                }
                params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
            } else if (arg == "-mg" || arg == "--main-gpu") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                params.main_gpu = parse_int_range(argv[i]);
            } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = string_split<bool>(argv[i], split_delim);
                params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
            } else if (arg == "--numa") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                std::string value(argv[i]);
                if (value == "distribute" || value == "") {
                    params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE;
                } else if (value == "isolate") {
                    params.numa = GGML_NUMA_STRATEGY_ISOLATE;
                } else if (value == "numactl") {
                    params.numa = GGML_NUMA_STRATEGY_NUMACTL;
                } else {
                    invalid_param = true;
                    break;
                }
            } else if (arg == "-fa" || arg == "--flash-attn") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = string_split<bool>(argv[i], split_delim);
                params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
            } else if (arg == "-mmp" || arg == "--mmap") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = string_split<bool>(argv[i], split_delim);
                params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
            } else if (arg == "-dio" || arg == "--direct-io") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = string_split<bool>(argv[i], split_delim);
                params.use_direct_io.insert(params.use_direct_io.end(), p.begin(), p.end());
            } else if (arg == "-embd" || arg == "--embeddings") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = string_split<bool>(argv[i], split_delim);
                params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
            } else if (arg == "-nopo" || arg == "--no-op-offload") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = string_split<bool>(argv[i], split_delim);
                params.no_op_offload.insert(params.no_op_offload.end(), p.begin(), p.end());
            } else if (arg == "--no-host") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = string_split<bool>(argv[i], split_delim);
                params.no_host.insert(params.no_host.end(), p.begin(), p.end());
            } else if (arg == "-ts" || arg == "--tensor-split") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                for (auto ts : string_split<std::string>(argv[i], split_delim)) {
                    // split string by ; and /
                    const std::regex regex{ R"([;/]+)" };
                    std::sregex_token_iterator it{ ts.begin(), ts.end(), regex, -1 };
                    std::vector<std::string> split_arg{ it, {} };
                    GGML_ASSERT(split_arg.size() <= llama_max_devices());

                    std::vector<float> tensor_split(llama_max_devices());
                    for (size_t i = 0; i < llama_max_devices(); ++i) {
                        if (i < split_arg.size()) {
                            tensor_split[i] = std::stof(split_arg[i]);
                        } else {
                            tensor_split[i] = 0.0f;
                        }
                    }
                    params.tensor_split.push_back(tensor_split);
                }
            } else if (arg == "-ot" || arg == "--override-tensor") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto * value = argv[i];
                /* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
                if (buft_list.empty()) {
                    // enumerate all the devices and add their buffer types to the list
                    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
                        auto * dev = ggml_backend_dev_get(i);
                        auto * buft = ggml_backend_dev_buffer_type(dev);
                        if (buft) {
                            buft_list[ggml_backend_buft_name(buft)] = buft;
                        }
                    }
                }
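                // syntax: override groups are separated by ',', overrides within
                // a group by ';', and each override is "<pattern>=<buffer type>",
                // e.g. -ot "blk\.0\.ffn_.*=CPU" (buffer type names depend on the
                // backends that are available)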
                auto override_group_span_len = std::strcspn(value, ",");
                bool last_group = false;
                do {
                    if (override_group_span_len == 0) {
                        // add an empty override group for an empty span
                        params.tensor_buft_overrides.push_back({{}});
                        if (value[override_group_span_len] == '\0') {
                            value = &value[override_group_span_len];
                            last_group = true;
                        } else {
                            value = &value[override_group_span_len + 1];
                            override_group_span_len = std::strcspn(value, ",");
                        }
                        continue;
                    }
                    // Stamp null terminators into the argv value for this option
                    // to avoid the memory leak present in the implementation in
                    // arg.cpp. Acceptable because these args are parsed only once
                    // in this program.
                    auto * override_group = value;
                    if (value[override_group_span_len] == '\0') {
                        value = &value[override_group_span_len];
                        last_group = true;
                    } else {
                        value[override_group_span_len] = '\0';
                        value = &value[override_group_span_len + 1];
                    }
                    std::vector<llama_model_tensor_buft_override> group_tensor_buft_overrides{};
                    auto override_span_len = std::strcspn(override_group, ";");
                    while (override_span_len > 0) {
                        auto * override = override_group;
                        if (override_group[override_span_len] != '\0') {
                            override_group[override_span_len] = '\0';
                            override_group = &override_group[override_span_len + 1];
                        } else {
                            override_group = &override_group[override_span_len];
                        }
                        auto tensor_name_span_len = std::strcspn(override, "=");
                        if (tensor_name_span_len >= override_span_len) {
                            invalid_param = true;
                            break;
                        }
                        override[tensor_name_span_len] = '\0';
                        auto * tensor_name = override;
                        auto * buffer_type = &override[tensor_name_span_len + 1];
                        if (buft_list.find(buffer_type) == buft_list.end()) {
                            printf("error: unrecognized buffer type '%s'\n", buffer_type);
                            printf("Available buffer types:\n");
                            for (const auto & it : buft_list) {
                                printf("  %s\n", ggml_backend_buft_name(it.second));
                            }
                            invalid_param = true;
                            break;
                        }
                        group_tensor_buft_overrides.push_back({ tensor_name, buft_list.at(buffer_type) });
                        override_span_len = std::strcspn(override_group, ";");
                    }
                    if (invalid_param) {
                        break;
                    }
                    group_tensor_buft_overrides.push_back({ nullptr, nullptr });
                    params.tensor_buft_overrides.push_back(group_tensor_buft_overrides);
                    override_group_span_len = std::strcspn(value, ",");
                } while (!last_group);
            } else if (arg == "-r" || arg == "--repetitions") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                params.reps = std::stoi(argv[i]);
            } else if (arg == "--prio") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
            } else if (arg == "--delay") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                params.delay = std::stoi(argv[i]);
            } else if (arg == "-o" || arg == "--output") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                invalid_param = !output_format_from_str(argv[i], params.output_format);
            } else if (arg == "-oe" || arg == "--output-err") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
            } else if (arg == "-v" || arg == "--verbose") {
                params.verbose = true;
            } else if (arg == "--progress") {
                params.progress = true;
            } else if (arg == "--no-warmup") {
                params.no_warmup = true;
            } else {
                invalid_param = true;
                break;
            }
        } catch (const std::exception & e) {
            fprintf(stderr, "error: %s\n", e.what());
            invalid_param = true;
            break;
        }
    }

    if (invalid_param) {
        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
        print_usage(argc, argv);
        exit(1);
    }

    // set defaults
    if (params.model.empty()) {
        params.model = cmd_params_defaults.model;
    }
    if (params.n_prompt.empty()) {
        params.n_prompt = cmd_params_defaults.n_prompt;
    }
    if (params.n_gen.empty()) {
        params.n_gen = cmd_params_defaults.n_gen;
    }
    if (params.n_pg.empty()) {
        params.n_pg = cmd_params_defaults.n_pg;
    }
    if (params.n_depth.empty()) {
        params.n_depth = cmd_params_defaults.n_depth;
    }
    if (params.n_batch.empty()) {
        params.n_batch = cmd_params_defaults.n_batch;
    }
    if (params.n_ubatch.empty()) {
        params.n_ubatch = cmd_params_defaults.n_ubatch;
    }
    if (params.type_k.empty()) {
        params.type_k = cmd_params_defaults.type_k;
    }
    if (params.type_v.empty()) {
        params.type_v = cmd_params_defaults.type_v;
    }
    if (params.n_gpu_layers.empty()) {
        params.n_gpu_layers = cmd_params_defaults.n_gpu_layers;
    }
    if (params.n_cpu_moe.empty()) {
        params.n_cpu_moe = cmd_params_defaults.n_cpu_moe;
    }
    if (params.split_mode.empty()) {
        params.split_mode = cmd_params_defaults.split_mode;
    }
    if (params.main_gpu.empty()) {
        params.main_gpu = cmd_params_defaults.main_gpu;
    }
    if (params.no_kv_offload.empty()) {
        params.no_kv_offload = cmd_params_defaults.no_kv_offload;
    }
    if (params.flash_attn.empty()) {
        params.flash_attn = cmd_params_defaults.flash_attn;
    }
    if (params.devices.empty()) {
        params.devices = cmd_params_defaults.devices;
    }
    if (params.tensor_split.empty()) {
        params.tensor_split = cmd_params_defaults.tensor_split;
    }
    if (params.tensor_buft_overrides.empty()) {
        params.tensor_buft_overrides = cmd_params_defaults.tensor_buft_overrides;
    }
    if (params.use_mmap.empty()) {
        params.use_mmap = cmd_params_defaults.use_mmap;
    }
    if (params.use_direct_io.empty()) {
        params.use_direct_io = cmd_params_defaults.use_direct_io;
    }
    if (params.embeddings.empty()) {
        params.embeddings = cmd_params_defaults.embeddings;
    }
    if (params.no_op_offload.empty()) {
        params.no_op_offload = cmd_params_defaults.no_op_offload;
    }
    if (params.no_host.empty()) {
        params.no_host = cmd_params_defaults.no_host;
    }
    if (params.n_threads.empty()) {
        params.n_threads = cmd_params_defaults.n_threads;
    }
    if (params.cpu_mask.empty()) {
        params.cpu_mask = cmd_params_defaults.cpu_mask;
    }
    if (params.cpu_strict.empty()) {
        params.cpu_strict = cmd_params_defaults.cpu_strict;
    }
    if (params.poll.empty()) {
        params.poll = cmd_params_defaults.poll;
    }

    return params;
}

struct cmd_params_instance {
    std::string model;
    int n_prompt;
    int n_gen;
    int n_depth;
    int n_batch;
    int n_ubatch;
    ggml_type type_k;
    ggml_type type_v;
    int n_threads;
    std::string cpu_mask;
    bool cpu_strict;
    int poll;
    int n_gpu_layers;
    int n_cpu_moe;
    llama_split_mode split_mode;
    int main_gpu;
    bool no_kv_offload;
    bool flash_attn;
    std::vector<ggml_backend_dev_t> devices;
    std::vector<float> tensor_split;
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
    bool use_mmap;
    bool use_direct_io;
    bool embeddings;
    bool no_op_offload;
    bool no_host;

    llama_model_params to_llama_mparams() const {
        llama_model_params mparams = llama_model_default_params();

        mparams.n_gpu_layers = n_gpu_layers;
        if (!devices.empty()) {
            mparams.devices = const_cast<ggml_backend_dev_t *>(devices.data());
        }
        mparams.split_mode = split_mode;
        mparams.main_gpu = main_gpu;
        mparams.tensor_split = tensor_split.data();
        mparams.use_mmap = use_mmap;
        mparams.use_direct_io = use_direct_io;
        mparams.no_host = no_host;

        if (n_cpu_moe <= 0) {
            if (tensor_buft_overrides.empty()) {
                mparams.tensor_buft_overrides = nullptr;
            } else {
                GGML_ASSERT(tensor_buft_overrides.back().pattern == nullptr &&
                            "Tensor buffer overrides not terminated with empty pattern");
                mparams.tensor_buft_overrides = tensor_buft_overrides.data();
            }
        } else {
            static std::vector<llama_model_tensor_buft_override> merged;
            static std::vector<std::string> patterns;
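            // static so that the pattern strings (and the override array that
            // points into them) outlive this call: llama_model_params stores
            // only raw pointers into this data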

            merged.clear();
            patterns.clear();

            auto first = tensor_buft_overrides.begin();
            auto last = tensor_buft_overrides.end();
            if (first != last && (last - 1)->pattern == nullptr) {
                --last;
            }
            merged.insert(merged.end(), first, last);

            patterns.reserve((size_t) n_cpu_moe);
            merged.reserve(merged.size() + (size_t) n_cpu_moe + 1);

            for (int i = 0; i < n_cpu_moe; ++i) {
                patterns.push_back(llm_ffn_exps_block_regex(i));
                merged.push_back({ patterns.back().c_str(),
                                   ggml_backend_cpu_buffer_type() });
            }

            merged.push_back({ nullptr, nullptr });

            mparams.tensor_buft_overrides = merged.data();
        }

        return mparams;
    }

    bool equal_mparams(const cmd_params_instance & other) const {
        return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe &&
               split_mode == other.split_mode &&
               main_gpu == other.main_gpu && tensor_split == other.tensor_split &&
               use_mmap == other.use_mmap && use_direct_io == other.use_direct_io &&
               devices == other.devices &&
               no_host == other.no_host &&
               vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
    }

    llama_context_params to_llama_cparams() const {
        llama_context_params cparams = llama_context_default_params();

        cparams.n_ctx = n_prompt + n_gen + n_depth;
        cparams.n_batch = n_batch;
        cparams.n_ubatch = n_ubatch;
        cparams.type_k = type_k;
        cparams.type_v = type_v;
        cparams.offload_kqv = !no_kv_offload;
        cparams.flash_attn_type = flash_attn ? LLAMA_FLASH_ATTN_TYPE_ENABLED : LLAMA_FLASH_ATTN_TYPE_DISABLED;
        cparams.embeddings = embeddings;
        cparams.op_offload = !no_op_offload;
        cparams.swa_full = false;

        return cparams;
    }
};

static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
    std::vector<cmd_params_instance> instances;

    // this ordering minimizes the number of times that each model needs to be reloaded
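    // (model-level parameters form the outer loops, so consecutive instances
    // that share them can reuse the already-loaded model; see equal_mparams)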
    // clang-format off
    for (const auto & m : params.model)
    for (const auto & nl : params.n_gpu_layers)
    for (const auto & ncmoe : params.n_cpu_moe)
    for (const auto & sm : params.split_mode)
    for (const auto & mg : params.main_gpu)
    for (const auto & devs : params.devices)
    for (const auto & ts : params.tensor_split)
    for (const auto & ot : params.tensor_buft_overrides)
    for (const auto & mmp : params.use_mmap)
    for (const auto & dio : params.use_direct_io)
    for (const auto & noh : params.no_host)
    for (const auto & embd : params.embeddings)
    for (const auto & nopo : params.no_op_offload)
    for (const auto & nb : params.n_batch)
    for (const auto & nub : params.n_ubatch)
    for (const auto & tk : params.type_k)
    for (const auto & tv : params.type_v)
    for (const auto & nkvo : params.no_kv_offload)
    for (const auto & fa : params.flash_attn)
    for (const auto & nt : params.n_threads)
    for (const auto & cm : params.cpu_mask)
    for (const auto & cs : params.cpu_strict)
    for (const auto & nd : params.n_depth)
    for (const auto & pl : params.poll) {
        for (const auto & n_prompt : params.n_prompt) {
            if (n_prompt == 0) {
                continue;
            }
            cmd_params_instance instance = {
                /* .model        = */ m,
                /* .n_prompt     = */ n_prompt,
                /* .n_gen        = */ 0,
                /* .n_depth      = */ nd,
                /* .n_batch      = */ nb,
                /* .n_ubatch     = */ nub,
                /* .type_k       = */ tk,
                /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
                /* .cpu_mask     = */ cm,
                /* .cpu_strict   = */ cs,
                /* .poll         = */ pl,
                /* .n_gpu_layers = */ nl,
                /* .n_cpu_moe    = */ ncmoe,
                /* .split_mode   = */ sm,
                /* .main_gpu     = */ mg,
                /* .no_kv_offload= */ nkvo,
                /* .flash_attn   = */ fa,
                /* .devices      = */ devs,
                /* .tensor_split = */ ts,
                /* .tensor_buft_overrides = */ ot,
                /* .use_mmap     = */ mmp,
                /* .use_direct_io= */ dio,
                /* .embeddings   = */ embd,
                /* .no_op_offload= */ nopo,
                /* .no_host      = */ noh,
            };
            instances.push_back(instance);
        }

        for (const auto & n_gen : params.n_gen) {
            if (n_gen == 0) {
                continue;
            }
            cmd_params_instance instance = {
                /* .model        = */ m,
                /* .n_prompt     = */ 0,
                /* .n_gen        = */ n_gen,
                /* .n_depth      = */ nd,
                /* .n_batch      = */ nb,
                /* .n_ubatch     = */ nub,
                /* .type_k       = */ tk,
                /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
                /* .cpu_mask     = */ cm,
                /* .cpu_strict   = */ cs,
                /* .poll         = */ pl,
                /* .n_gpu_layers = */ nl,
                /* .n_cpu_moe    = */ ncmoe,
                /* .split_mode   = */ sm,
                /* .main_gpu     = */ mg,
                /* .no_kv_offload= */ nkvo,
                /* .flash_attn   = */ fa,
                /* .devices      = */ devs,
                /* .tensor_split = */ ts,
                /* .tensor_buft_overrides = */ ot,
                /* .use_mmap     = */ mmp,
                /* .use_direct_io= */ dio,
                /* .embeddings   = */ embd,
                /* .no_op_offload= */ nopo,
                /* .no_host      = */ noh,
            };
            instances.push_back(instance);
        }

        for (const auto & n_pg : params.n_pg) {
            if (n_pg.first == 0 && n_pg.second == 0) {
                continue;
            }
            cmd_params_instance instance = {
                /* .model        = */ m,
                /* .n_prompt     = */ n_pg.first,
                /* .n_gen        = */ n_pg.second,
                /* .n_depth      = */ nd,
                /* .n_batch      = */ nb,
                /* .n_ubatch     = */ nub,
                /* .type_k       = */ tk,
                /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
                /* .cpu_mask     = */ cm,
                /* .cpu_strict   = */ cs,
                /* .poll         = */ pl,
                /* .n_gpu_layers = */ nl,
                /* .n_cpu_moe    = */ ncmoe,
                /* .split_mode   = */ sm,
                /* .main_gpu     = */ mg,
                /* .no_kv_offload= */ nkvo,
                /* .flash_attn   = */ fa,
                /* .devices      = */ devs,
                /* .tensor_split = */ ts,
                /* .tensor_buft_overrides = */ ot,
                /* .use_mmap     = */ mmp,
                /* .use_direct_io= */ dio,
                /* .embeddings   = */ embd,
                /* .no_op_offload= */ nopo,
                /* .no_host      = */ noh,
            };
            instances.push_back(instance);
        }
    }
    // clang-format on

    return instances;
}

struct test {
    static const std::string build_commit;
    static const int build_number;
    const std::string cpu_info;
    const std::string gpu_info;
    std::string model_filename;
    std::string model_type;
    uint64_t model_size;
    uint64_t model_n_params;
    int n_batch;
    int n_ubatch;
    int n_threads;
    std::string cpu_mask;
    bool cpu_strict;
    int poll;
    ggml_type type_k;
    ggml_type type_v;
    int n_gpu_layers;
    int n_cpu_moe;
    llama_split_mode split_mode;
    int main_gpu;
    bool no_kv_offload;
    bool flash_attn;
    std::vector<ggml_backend_dev_t> devices;
    std::vector<float> tensor_split;
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
    bool use_mmap;
    bool use_direct_io;
    bool embeddings;
    bool no_op_offload;
    bool no_host;
    int n_prompt;
    int n_gen;
    int n_depth;
    std::string test_time;
    std::vector<uint64_t> samples_ns;

    test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) :
        cpu_info(get_cpu_info()),
        gpu_info(get_gpu_info()) {

        model_filename = inst.model;
        char buf[128];
        llama_model_desc(lmodel, buf, sizeof(buf));
        model_type = buf;
        model_size = llama_model_size(lmodel);
        model_n_params = llama_model_n_params(lmodel);
        n_batch = inst.n_batch;
        n_ubatch = inst.n_ubatch;
        n_threads = inst.n_threads;
        cpu_mask = inst.cpu_mask;
        cpu_strict = inst.cpu_strict;
        poll = inst.poll;
        type_k = inst.type_k;
        type_v = inst.type_v;
        n_gpu_layers = inst.n_gpu_layers;
        n_cpu_moe = inst.n_cpu_moe;
        split_mode = inst.split_mode;
        main_gpu = inst.main_gpu;
        no_kv_offload = inst.no_kv_offload;
        flash_attn = inst.flash_attn;
        devices = inst.devices;
        tensor_split = inst.tensor_split;
        tensor_buft_overrides = inst.tensor_buft_overrides;
        use_mmap = inst.use_mmap;
        use_direct_io = inst.use_direct_io;
        embeddings = inst.embeddings;
        no_op_offload = inst.no_op_offload;
        no_host = inst.no_host;
        n_prompt = inst.n_prompt;
        n_gen = inst.n_gen;
        n_depth = inst.n_depth;
        // RFC 3339 date-time format
        time_t t = time(NULL);
        std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
        test_time = buf;

        (void) ctx;
    }

    uint64_t avg_ns() const { return ::avg(samples_ns); }

    uint64_t stdev_ns() const { return ::stdev(samples_ns); }

    std::vector<double> get_ts() const {
        int n_tokens = n_prompt + n_gen;
        std::vector<double> ts;
        std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts),
                       [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; });
        return ts;
    }

    double avg_ts() const { return ::avg(get_ts()); }

    double stdev_ts() const { return ::stdev(get_ts()); }

    static std::string get_backend() {
        std::vector<std::string> backends;
        bool rpc_used = false;
        for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
            auto * reg = ggml_backend_reg_get(i);
            std::string name = ggml_backend_reg_name(reg);
            if (string_starts_with(name, "RPC")) {
                if (ggml_backend_reg_dev_count(reg) > 0) {
                    rpc_used = true;
                }
            } else {
                if (name != "CPU") {
                    backends.push_back(ggml_backend_reg_name(reg));
                }
            }
        }
        if (rpc_used) {
            backends.push_back("RPC");
        }
        return backends.empty() ? "CPU" : join(backends, ",");
    }

    static const std::vector<std::string> & get_fields() {
        static const std::vector<std::string> fields = {
            "build_commit", "build_number", "cpu_info", "gpu_info", "backends",
            "model_filename", "model_type", "model_size", "model_n_params", "n_batch",
            "n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll",
            "type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode",
            "main_gpu", "no_kv_offload", "flash_attn", "devices", "tensor_split",
            "tensor_buft_overrides", "use_mmap", "use_direct_io", "embeddings",
            "no_op_offload", "no_host", "n_prompt", "n_gen", "n_depth",
            "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts"
        };
        return fields;
    }

    enum field_type { STRING, BOOL, INT, FLOAT };

    static field_type get_field_type(const std::string & field) {
        if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
            field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
            field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || field == "avg_ns" ||
            field == "stddev_ns" || field == "no_op_offload" || field == "n_cpu_moe") {
            return INT;
        }
        if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
            field == "use_mmap" || field == "use_direct_io" || field == "embeddings" || field == "no_host") {
            return BOOL;
        }
        if (field == "avg_ts" || field == "stddev_ts") {
            return FLOAT;
        }
        return STRING;
    }

    std::vector<std::string> get_values() const {
        std::string tensor_split_str;
        std::string tensor_buft_overrides_str;
        int max_nonzero = 0;
        for (size_t i = 0; i < llama_max_devices(); i++) {
            if (tensor_split[i] > 0) {
                max_nonzero = i;
            }
        }
        for (int i = 0; i <= max_nonzero; i++) {
            char buf[32];
            snprintf(buf, sizeof(buf), "%.2f", tensor_split[i]);
            tensor_split_str += buf;
            if (i < max_nonzero) {
                tensor_split_str += "/";
            }
        }
        if (tensor_buft_overrides.size() == 1) {
            // the last element of tensor_buft_overrides is always a null pattern,
            // so a single-element vector can only contain that null pattern
            GGML_ASSERT(tensor_buft_overrides[0].pattern == nullptr);
            tensor_buft_overrides_str += "none";
        } else {
            for (size_t i = 0; i < tensor_buft_overrides.size() - 1; i++) {
                // the last element of tensor_buft_overrides is always a null pattern
                if (tensor_buft_overrides[i].pattern == nullptr) {
                    tensor_buft_overrides_str += "none";
                } else {
                    tensor_buft_overrides_str += tensor_buft_overrides[i].pattern;
                    tensor_buft_overrides_str += "=";
                    tensor_buft_overrides_str += ggml_backend_buft_name(tensor_buft_overrides[i].buft);
                }
                if (i + 2 < tensor_buft_overrides.size()) {
                    tensor_buft_overrides_str += ";";
                }
            }
        }
        std::vector<std::string> values = { build_commit,
                                            std::to_string(build_number),
                                            cpu_info,
                                            gpu_info,
                                            get_backend(),
                                            model_filename,
                                            model_type,
                                            std::to_string(model_size),
                                            std::to_string(model_n_params),
                                            std::to_string(n_batch),
                                            std::to_string(n_ubatch),
                                            std::to_string(n_threads),
                                            cpu_mask,
                                            std::to_string(cpu_strict),
                                            std::to_string(poll),
                                            ggml_type_name(type_k),
                                            ggml_type_name(type_v),
                                            std::to_string(n_gpu_layers),
                                            std::to_string(n_cpu_moe),
                                            split_mode_str(split_mode),
                                            std::to_string(main_gpu),
                                            std::to_string(no_kv_offload),
                                            std::to_string(flash_attn),
                                            devices_to_string(devices),
                                            tensor_split_str,
                                            tensor_buft_overrides_str,
                                            std::to_string(use_mmap),
                                            std::to_string(use_direct_io),
                                            std::to_string(embeddings),
                                            std::to_string(no_op_offload),
                                            std::to_string(no_host),
                                            std::to_string(n_prompt),
                                            std::to_string(n_gen),
                                            std::to_string(n_depth),
                                            test_time,
                                            std::to_string(avg_ns()),
                                            std::to_string(stdev_ns()),
                                            std::to_string(avg_ts()),
                                            std::to_string(stdev_ts()) };
        return values;
    }

    std::map<std::string, std::string> get_map() const {
        std::map<std::string, std::string> map;
        auto fields = get_fields();
        auto values = get_values();
        std::transform(fields.begin(), fields.end(), values.begin(), std::inserter(map, map.end()),
                       std::make_pair<const std::string &, const std::string &>);
        return map;
    }
};

const std::string test::build_commit = LLAMA_COMMIT;
const int test::build_number = LLAMA_BUILD_NUMBER;

struct printer {
    virtual ~printer() {}

    FILE * fout;

    virtual void print_header(const cmd_params & params) { (void) params; }

    virtual void print_test(const test & t) = 0;

    virtual void print_footer() {}
};

struct csv_printer : public printer {
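    // RFC 4180-style escaping: wrap the field in double quotes and double any
    // embedded quote characters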
    static std::string escape_csv(const std::string & field) {
        std::string escaped = "\"";
        for (auto c : field) {
            if (c == '"') {
                escaped += "\"";
            }
            escaped += c;
        }
        escaped += "\"";
        return escaped;
    }

    void print_header(const cmd_params & params) override {
        std::vector<std::string> fields = test::get_fields();
        fprintf(fout, "%s\n", join(fields, ",").c_str());
        (void) params;
    }

    void print_test(const test & t) override {
        std::vector<std::string> values = t.get_values();
        std::transform(values.begin(), values.end(), values.begin(), escape_csv);
        fprintf(fout, "%s\n", join(values, ",").c_str());
    }
};

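// minimal JSON string escaping: double quotes, backslashes, and control
// characters (emitted as \u00XX); sufficient for the field values produced here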
static std::string escape_json(const std::string & value) {
    std::string escaped;
    for (auto c : value) {
        if (c == '"') {
            escaped += "\\\"";
        } else if (c == '\\') {
            escaped += "\\\\";
        } else if (c <= 0x1f) {
            char buf[8];
            snprintf(buf, sizeof(buf), "\\u%04x", c);
            escaped += buf;
        } else {
            escaped += c;
        }
    }
    return escaped;
}

static std::string format_json_value(const std::string & field, const std::string & value) {
    switch (test::get_field_type(field)) {
        case test::STRING:
            return "\"" + escape_json(value) + "\"";
        case test::BOOL:
            return value == "0" ? "false" : "true";
        default:
            return value;
    }
}

struct json_printer : public printer {
    bool first = true;

    void print_header(const cmd_params & params) override {
        fprintf(fout, "[\n");
        (void) params;
    }

    void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
        assert(fields.size() == values.size());
        for (size_t i = 0; i < fields.size(); i++) {
            fprintf(fout, "    \"%s\": %s,\n", fields.at(i).c_str(),
                    format_json_value(fields.at(i), values.at(i)).c_str());
        }
    }

    void print_test(const test & t) override {
        if (first) {
            first = false;
        } else {
            fprintf(fout, ",\n");
        }
        fprintf(fout, "  {\n");
        print_fields(test::get_fields(), t.get_values());
        fprintf(fout, "    \"samples_ns\": [ %s ],\n", join(t.samples_ns, ", ").c_str());
        fprintf(fout, "    \"samples_ts\": [ %s ]\n", join(t.get_ts(), ", ").c_str());
        fprintf(fout, "  }");
        fflush(fout);
    }

    void print_footer() override { fprintf(fout, "\n]\n"); }
};

struct jsonl_printer : public printer {
    void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
        assert(fields.size() == values.size());
        for (size_t i = 0; i < fields.size(); i++) {
            fprintf(fout, "\"%s\": %s, ", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
        }
    }

    void print_test(const test & t) override {
        fprintf(fout, "{");
        print_fields(test::get_fields(), t.get_values());
        fprintf(fout, "\"samples_ns\": [ %s ],", join(t.samples_ns, ", ").c_str());
        fprintf(fout, "\"samples_ts\": [ %s ]", join(t.get_ts(), ", ").c_str());
        fprintf(fout, "}\n");
        fflush(fout);
    }
};

struct markdown_printer : public printer {
    std::vector<std::string> fields;

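    // column widths for the markdown table; a negative width means the column
    // is left-aligned (printf's %*s with a negative field width)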
1662 static int get_field_width(const std::string & field) {
1663 if (field == "model") {
1664 return -30;
1665 }
1666 if (field == "t/s") {
1667 return 20;
1668 }
1669 if (field == "size" || field == "params") {
1670 return 10;
1671 }
1672 if (field == "n_gpu_layers") {
1673 return 3;
1674 }
1675 if (field == "n_threads") {
1676 return 7;
1677 }
1678 if (field == "n_batch") {
1679 return 7;
1680 }
1681 if (field == "n_ubatch") {
1682 return 8;
1683 }
1684 if (field == "type_k" || field == "type_v") {
1685 return 6;
1686 }
1687 if (field == "split_mode") {
1688 return 5;
1689 }
1690 if (field == "flash_attn") {
1691 return 2;
1692 }
1693 if (field == "devices") {
1694 return -12;
1695 }
1696 if (field == "use_mmap") {
1697 return 4;
1698 }
1699 if (field == "use_direct_io") {
1700 return 3;
1701 }
1702 if (field == "test") {
1703 return 15;
1704 }
1705 if (field == "no_op_offload") {
1706 return 4;
1707 }
1708 if (field == "no_host") {
1709 return 4;
1710 }
1711
1712 int width = std::max((int) field.length(), 10);
1713
1714 if (test::get_field_type(field) == test::STRING) {
1715 return -width;
1716 }
1717 return width;
1718 }
1719
1720 static std::string get_field_display_name(const std::string & field) {
1721 if (field == "n_gpu_layers") {
1722 return "ngl";
1723 }
1724 if (field == "split_mode") {
1725 return "sm";
1726 }
1727 if (field == "n_threads") {
1728 return "threads";
1729 }
1730 if (field == "no_kv_offload") {
1731 return "nkvo";
1732 }
1733 if (field == "flash_attn") {
1734 return "fa";
1735 }
1736 if (field == "use_mmap") {
1737 return "mmap";
1738 }
1739 if (field == "use_direct_io") {
1740 return "dio";
1741 }
1742 if (field == "embeddings") {
1743 return "embd";
1744 }
1745 if (field == "no_op_offload") {
1746 return "nopo";
1747 }
1748 if (field == "no_host") {
1749 return "noh";
1750 }
1751 if (field == "devices") {
1752 return "dev";
1753 }
1754 if (field == "tensor_split") {
1755 return "ts";
1756 }
1757 if (field == "tensor_buft_overrides") {
1758 return "ot";
1759 }
1760 return field;
1761 }
1762
1763 void print_header(const cmd_params & params) override {
1764 // select fields to print
1765 fields.emplace_back("model");
1766 fields.emplace_back("size");
1767 fields.emplace_back("params");
1768 fields.emplace_back("backend");
1769 bool is_cpu_backend = test::get_backend().find("CPU") != std::string::npos ||
1770 test::get_backend().find("BLAS") != std::string::npos ||
1771 test::get_backend().find("ZenDNN") != std::string::npos;
1772 if (!is_cpu_backend) {
1773 fields.emplace_back("n_gpu_layers");
1774 }
1775 if (params.n_cpu_moe.size() > 1) {
1776 fields.emplace_back("n_cpu_moe");
1777 }
        if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
            fields.emplace_back("n_threads");
        }
        if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) {
            fields.emplace_back("cpu_mask");
        }
        if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) {
            fields.emplace_back("cpu_strict");
        }
        if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) {
            fields.emplace_back("poll");
        }
        if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
            fields.emplace_back("n_batch");
        }
        if (params.n_ubatch.size() > 1 || params.n_ubatch != cmd_params_defaults.n_ubatch) {
            fields.emplace_back("n_ubatch");
        }
        if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) {
            fields.emplace_back("type_k");
        }
        if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
            fields.emplace_back("type_v");
        }
        if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
            fields.emplace_back("main_gpu");
        }
        if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) {
            fields.emplace_back("split_mode");
        }
        if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
            fields.emplace_back("no_kv_offload");
        }
        if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) {
            fields.emplace_back("flash_attn");
        }
        if (params.devices.size() > 1 || params.devices != cmd_params_defaults.devices) {
            fields.emplace_back("devices");
        }
        if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
            fields.emplace_back("tensor_split");
        }
        if (params.tensor_buft_overrides.size() > 1 || !vec_vec_tensor_buft_override_equal(params.tensor_buft_overrides, cmd_params_defaults.tensor_buft_overrides)) {
            fields.emplace_back("tensor_buft_overrides");
        }
        if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
            fields.emplace_back("use_mmap");
        }
        if (params.use_direct_io.size() > 1 || params.use_direct_io != cmd_params_defaults.use_direct_io) {
            fields.emplace_back("use_direct_io");
        }
        if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
            fields.emplace_back("embeddings");
        }
        if (params.no_op_offload.size() > 1 || params.no_op_offload != cmd_params_defaults.no_op_offload) {
            fields.emplace_back("no_op_offload");
        }
        if (params.no_host.size() > 1 || params.no_host != cmd_params_defaults.no_host) {
            fields.emplace_back("no_host");
        }
        fields.emplace_back("test");
        fields.emplace_back("t/s");

        fprintf(fout, "|");
        for (const auto & field : fields) {
            fprintf(fout, " %*s |", get_field_width(field), get_field_display_name(field).c_str());
        }
        fprintf(fout, "\n");
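        // markdown alignment row: a trailing ":" right-aligns the column, a plain
        // dash leaves it left-aligned, matching the sign convention of get_field_width()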
        fprintf(fout, "|");
        for (const auto & field : fields) {
            int width = get_field_width(field);
            fprintf(fout, " %s%s |", std::string(std::abs(width) - 1, '-').c_str(), width > 0 ? ":" : "-");
        }
        fprintf(fout, "\n");
    }
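
    // The rendered header looks roughly like this (illustrative; the exact set of
    // columns depends on which parameters differ from the defaults):
    //
    // | model | size | params | backend | ngl | test | t/s |
    // | ----- | ---: | -----: | ------- | --: | ---: | --: |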

    void print_test(const test & t) override {
        std::map<std::string, std::string> vmap = t.get_map();

        fprintf(fout, "|");
        for (const auto & field : fields) {
            std::string value;
            char buf[128];
            if (field == "model") {
                value = t.model_type;
            } else if (field == "size") {
                if (t.model_size < 1024 * 1024 * 1024) {
                    snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0);
                } else {
                    snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0);
                }
                value = buf;
            } else if (field == "params") {
                if (t.model_n_params < 1000 * 1000 * 1000) {
                    snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6);
                } else {
                    snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9);
                }
                value = buf;
            } else if (field == "backend") {
                value = test::get_backend();
            } else if (field == "test") {
                if (t.n_prompt > 0 && t.n_gen == 0) {
                    snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
                } else if (t.n_gen > 0 && t.n_prompt == 0) {
                    snprintf(buf, sizeof(buf), "tg%d", t.n_gen);
                } else {
                    snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
                }
                if (t.n_depth > 0) {
                    int len = strlen(buf);
                    snprintf(buf + len, sizeof(buf) - len, " @ d%d", t.n_depth);
                }
                value = buf;
            } else if (field == "t/s") {
                snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
                value = buf;
            } else if (vmap.find(field) != vmap.end()) {
                value = vmap.at(field);
            } else {
                assert(false);
                exit(1);
            }

            int width = get_field_width(field);
            if (field == "t/s") {
                // HACK: "±" is two bytes in UTF-8 but only one display column wide;
                // printf field widths count bytes, so widen by one to compensate
                width += 1;
            }
            fprintf(fout, " %*s |", width, value.c_str());
        }
        fprintf(fout, "\n");
    }

    void print_footer() override {
        fprintf(fout, "\nbuild: %s (%d)\n", test::build_commit.c_str(), test::build_number);
    }
};

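// Emits the results as plain SQL statements, so the output can be piped straight
// into sqlite3 to build a results database, e.g. (illustrative invocation):
//
//   llama-bench -o sql | sqlite3 llama-bench.db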
struct sql_printer : public printer {
    static std::string get_sql_field_type(const std::string & field) {
        switch (test::get_field_type(field)) {
            case test::STRING:
                return "TEXT";
            case test::BOOL:
            case test::INT:
                return "INTEGER";
            case test::FLOAT:
                return "REAL";
            default:
                assert(false);
                exit(1);
        }
    }

    void print_header(const cmd_params & params) override {
        std::vector<std::string> fields = test::get_fields();
        fprintf(fout, "CREATE TABLE IF NOT EXISTS llama_bench (\n");
        for (size_t i = 0; i < fields.size(); i++) {
            fprintf(fout, " %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(),
                    i < fields.size() - 1 ? "," : "");
        }
        fprintf(fout, ");\n");
        fprintf(fout, "\n");
        (void) params;
    }

    void print_test(const test & t) override {
        fprintf(fout, "INSERT INTO llama_bench (%s) ", join(test::get_fields(), ", ").c_str());
        fprintf(fout, "VALUES (");
        std::vector<std::string> values = t.get_values();
        for (size_t i = 0; i < values.size(); i++) {
            fprintf(fout, "'%s'%s", values.at(i).c_str(), i < values.size() - 1 ? ", " : "");
        }
        fprintf(fout, ");\n");
    }
};

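// Snapshot of the llama_context state after processing a prompt prefix of a given
// depth; reused across repetitions so the prefix only has to be computed once per depth.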
struct ctx_state {
    int depth = 0; // in tokens

    std::vector<uint8_t> buf; // the llama_context state buffer
};

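// Decode a synthetic prompt of n_prompt tokens in chunks of up to n_batch tokens.
// The token content is irrelevant for throughput measurement, so random ids are
// used (plus BOS at the start when the vocab requires one).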
static bool test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) {
    llama_set_n_threads(ctx, n_threads, n_threads);

    const llama_model * model = llama_get_model(ctx);
    const llama_vocab * vocab = llama_model_get_vocab(model);
    const int32_t n_vocab = llama_vocab_n_tokens(vocab);

    std::vector<llama_token> tokens(n_batch);

    int n_processed = 0;

    while (n_processed < n_prompt) {
        int n_tokens = std::min(n_prompt - n_processed, n_batch);
        tokens[0] = n_processed == 0 && llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab;
        for (int i = 1; i < n_tokens; i++) {
            tokens[i] = std::rand() % n_vocab;
        }
        int res = llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens));
        if (res != 0) {
            fprintf(stderr, "%s: failed to decode prompt batch, res = %d\n", __func__, res);
            return false;
        }
        n_processed += n_tokens;
    }

    llama_synchronize(ctx);
    return true;
}

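// Generate n_gen tokens one at a time, synchronizing after every decode so the
// measured time reflects per-token generation latency rather than queued
// asynchronous work.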
static bool test_gen(llama_context * ctx, int n_gen, int n_threads) {
    llama_set_n_threads(ctx, n_threads, n_threads);

    const llama_model * model = llama_get_model(ctx);
    const llama_vocab * vocab = llama_model_get_vocab(model);
    const int32_t n_vocab = llama_vocab_n_tokens(vocab);

    llama_token token = llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab;

    for (int i = 0; i < n_gen; i++) {
        int res = llama_decode(ctx, llama_batch_get_one(&token, 1));
        if (res != 0) {
            fprintf(stderr, "%s: failed to decode generation batch, res = %d\n", __func__, res);
            return false;
        }
        llama_synchronize(ctx);
        token = std::rand() % n_vocab;
    }
    return true;
}

static void llama_null_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
    (void) level;
    (void) text;
    (void) user_data;
}

static std::unique_ptr<printer> create_printer(output_formats format) {
    switch (format) {
        case NONE:
            return nullptr;
        case CSV:
            return std::unique_ptr<printer>(new csv_printer());
        case JSON:
            return std::unique_ptr<printer>(new json_printer());
        case JSONL:
            return std::unique_ptr<printer>(new jsonl_printer());
        case MARKDOWN:
            return std::unique_ptr<printer>(new markdown_printer());
        case SQL:
            return std::unique_ptr<printer>(new sql_printer());
    }
    GGML_ABORT("fatal error");
}

int main(int argc, char ** argv) {
    // try to set locale for unicode characters in markdown
    setlocale(LC_CTYPE, ".UTF-8");

#if !defined(NDEBUG)
    fprintf(stderr, "warning: asserts enabled, performance may be affected\n");
#endif

#if (defined(_MSC_VER) && defined(_DEBUG)) || (!defined(_MSC_VER) && !defined(__OPTIMIZE__))
    fprintf(stderr, "warning: debug build, performance may be affected\n");
#endif

#if defined(__SANITIZE_ADDRESS__) || defined(__SANITIZE_THREAD__)
    fprintf(stderr, "warning: sanitizer enabled, performance may be affected\n");
#endif

    // initialize backends
    ggml_backend_load_all();

    cmd_params params = parse_cmd_params(argc, argv);

    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (!cpu_dev) {
        fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);
        return 1;
    }
    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
    auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new");
    auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free");

    // initialize llama.cpp
    if (!params.verbose) {
        llama_log_set(llama_null_log_callback, NULL);
    }
    llama_backend_init();
    llama_numa_init(params.numa);

    if (!set_process_priority(params.prio)) {
        fprintf(stderr, "%s: error: failed to set process priority\n", __func__);
        return 1;
    }

    // initialize printer
    std::unique_ptr<printer> p = create_printer(params.output_format);
    std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);

    if (p) {
        p->fout = stdout;
        p->print_header(params);
    }

    if (p_err) {
        p_err->fout = stderr;
        p_err->print_header(params);
    }

    std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);

    llama_model * lmodel = nullptr;
    const cmd_params_instance * prev_inst = nullptr;

    // store the llama_context state at the previous depth that we performed a test
    // ref: https://github.com/ggml-org/llama.cpp/pull/16944#issuecomment-3478151721
    ctx_state cstate;

    int params_idx = 0;
    auto params_count = params_instances.size();
    for (const auto & inst : params_instances) {
        params_idx++;
        if (params.progress) {
            fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count);
        }
        // keep the same model between tests when possible
        if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
            if (lmodel) {
                llama_model_free(lmodel);
            }

            lmodel = llama_model_load_from_file(inst.model.c_str(), inst.to_llama_mparams());
            if (lmodel == NULL) {
                fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
                return 1;
            }
            prev_inst = &inst;
        }

        llama_context * ctx = llama_init_from_model(lmodel, inst.to_llama_cparams());
        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
            llama_model_free(lmodel);
            return 1;
        }

        test t(inst, lmodel, ctx);

        llama_memory_clear(llama_get_memory(ctx), false);

        // cool off before the test
        if (params.delay) {
            std::this_thread::sleep_for(std::chrono::seconds(params.delay));
        }

        struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
        if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
            fprintf(stderr, "%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
            llama_free(ctx);
            llama_model_free(lmodel);
            exit(1);
        }
        tpp.strict_cpu = t.cpu_strict;
        tpp.poll = t.poll;
        tpp.prio = params.prio;

        struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
        if (!threadpool) {
            fprintf(stderr, "%s: failed to create threadpool (n_threads = %d)\n", __func__, tpp.n_threads);
            llama_free(ctx);
            llama_model_free(lmodel);
            exit(1);
        }

        llama_attach_threadpool(ctx, threadpool, NULL);

        // warmup run
        if (!params.no_warmup) {
            if (t.n_prompt > 0) {
                if (params.progress) {
                    fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count);
                }
                bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
                if (!res) {
                    fprintf(stderr, "%s: error: failed to run prompt warmup\n", __func__);
                    llama_free(ctx);
                    llama_model_free(lmodel);
                    exit(1);
                }
            }
            if (t.n_gen > 0) {
                if (params.progress) {
                    fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count);
                }
                bool res = test_gen(ctx, 1, t.n_threads);
                if (!res) {
                    fprintf(stderr, "%s: error: failed to run gen warmup\n", __func__);
                    llama_free(ctx);
                    llama_model_free(lmodel);
                    exit(1);
                }
            }
        }

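        // measurement loop: every repetition clears the KV cache first so runs do
        // not contaminate each other; the depth prefix, if any, is then restored
        // from the cached state or recomputed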
        for (int i = 0; i < params.reps; i++) {
            llama_memory_clear(llama_get_memory(ctx), false);

            if (t.n_depth > 0) {
                bool is_cached = t.n_depth == cstate.depth;

                if (is_cached) {
                    // if we previously computed at this depth, just restore the saved state
                    const size_t ret = llama_state_seq_set_data(ctx, cstate.buf.data(), cstate.buf.size(), 0);
                    if (ret == 0) {
                        // the saved state is incompatible with the current context - reprocess from scratch
                        is_cached = false;
                    }
                }

                if (!is_cached) {
                    if (params.progress) {
                        fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d\n", params_idx, params_count,
                                i + 1, params.reps);
                    }
                    bool res = test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads);
                    if (!res) {
                        fprintf(stderr, "%s: error: failed to run depth\n", __func__);
                        llama_free(ctx);
                        llama_model_free(lmodel);
                        exit(1);
                    }

                    // store the context state for reuse in later runs
                    cstate.depth = t.n_depth;
                    cstate.buf.resize(llama_state_seq_get_size(ctx, 0));
                    llama_state_seq_get_data(ctx, cstate.buf.data(), cstate.buf.size(), 0);
                } else {
                    if (params.progress) {
                        fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d (cached)\n", params_idx, params_count,
                                i + 1, params.reps);
                    }
                }
            }

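            // only the timed section below contributes to the sample; any depth
            // prefill above is deliberately excluded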
            uint64_t t_start = get_time_ns();

            if (t.n_prompt > 0) {
                if (params.progress) {
                    fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count,
                            i + 1, params.reps);
                }
                bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
                if (!res) {
                    fprintf(stderr, "%s: error: failed to run prompt\n", __func__);
                    llama_free(ctx);
                    llama_model_free(lmodel);
                    exit(1);
                }
            }
            if (t.n_gen > 0) {
                if (params.progress) {
                    fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count,
                            i + 1, params.reps);
                }
                bool res = test_gen(ctx, t.n_gen, t.n_threads);
                if (!res) {
                    fprintf(stderr, "%s: error: failed to run gen\n", __func__);
                    llama_free(ctx);
                    llama_model_free(lmodel);
                    exit(1);
                }
            }

            uint64_t t_ns = get_time_ns() - t_start;
            t.samples_ns.push_back(t_ns);
        }

        if (p) {
            p->print_test(t);
            fflush(p->fout);
        }

        if (p_err) {
            p_err->print_test(t);
            fflush(p_err->fout);
        }

        llama_perf_context_print(ctx);

        llama_free(ctx);

        ggml_threadpool_free_fn(threadpool);
    }

    llama_model_free(lmodel);

    if (p) {
        p->print_footer();
    }

    if (p_err) {
        p_err->print_footer();
    }

    llama_backend_free();

    return 0;
}