llmnpc - llama.cpp/tests/test-quantize-perf.cpp

Path: llmnpc / llama.cpp / tests / test-quantize-perf.cpp (raw)
  1// Benchmark quantization specific functions on synthetic data
  2
#include "ggml.h"
#include "ggml-cpu.h"

#undef NDEBUG
#include <algorithm>
#include <assert.h>
#include <exception>
#include <functional>
#include <math.h>
#include <memory>
#include <stdio.h>
#include <string>
#include <vector>
 15
 16#if defined(_MSC_VER)
 17#pragma warning(disable: 4244 4267) // possible loss of data
 18#endif
 19
 20#define MAX_ALIGNMENT 64
 21#define QK 32
 22#define WARMUP 5
 23#define ITERATIONS 10
 24#define MAX_ITERATIONS 100000000
 25
 26#define L1_SIZE      32*128
 27#define L2_SIZE     32*2048
 28#define L3_SIZE    32*20480
 29#define MEM_SIZE 32*2048000
 30
 31struct quantize_perf_params {
 32    std::vector<std::string> include_types;
 33    std::vector<size_t> test_sizes;
 34    size_t alignment_offset = 0;
 35    bool op_quantize_row_q_reference = false;
 36    bool op_quantize_row_q = false;
 37    bool op_dequantize_row_q = false;
 38    bool op_quantize_row_q_dot = false;
 39    bool op_vec_dot_q = false;
 40    int64_t iterations = ITERATIONS;
 41};
 42
 43#if defined(__x86_64__) || defined(__i386__)
 44
 45#include <x86intrin.h>
 46inline int64_t cpu_cycles() {
 47// Rough way to detect new-ish CPUs
 48#ifdef __POPCNT__
 49    unsigned int dummy;
 50    return __rdtscp(&dummy);
 51#else
 52    return __rdtsc();
 53#endif
 54}
 55
 56#else
 57
 58#define cpu_cycles() 0
 59
 60#endif
 61
 62
 63// Generate synthetic data
 64static void generate_data(float offset, size_t n, float * dst) {
 65    for (size_t i = 0; i < n; i++) {
 66        dst[i] = 0.1 + 2*cosf(i + offset);
 67    }
 68}
 69
 70static float gigabytes_per_second(size_t bytes, int64_t usecs) {
 71    return bytes / (float) usecs * 1000000 / (1024*1024*1024);
 72}
 73
 74static void * align_with_offset(void * ptr, int offset) {
 75    size_t dummy_size = MAX_ALIGNMENT * 4;
 76    return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset;
 77}
 78
 79static void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<float(void)> & func) {
 80    int64_t min_time_us = INT64_MAX;
 81    int64_t total_time_us = 0;
 82    int64_t min_time_cycles = INT64_MAX;
 83    int64_t total_time_cycles = 0;
 84
 85    for (int i = 0; i < WARMUP; i++) {
 86        func();
 87    }
 88
 89    for (int i = 0; i < iterations; i++) {
 90        const int64_t start_time = ggml_time_us();
 91        const int64_t start_cycles = cpu_cycles();
 92
 93        func();
 94
 95        const int64_t end_cycles = cpu_cycles();
 96        const int64_t end_time = ggml_time_us();
 97
 98        total_time_cycles += end_cycles - start_cycles;
 99        min_time_cycles = std::min(min_time_cycles, end_cycles - start_cycles);
100        total_time_us += end_time - start_time;
101        min_time_us = std::min(min_time_us, end_time - start_time);
102    }
103
104    printf("      min cycles/%d vals   : %9.2f\n",  QK, QK * min_time_cycles / (float) size);
105    printf("      avg cycles/%d vals   : %9.2f\n",  QK, QK * total_time_cycles / (float) (size * iterations));
106    printf("      float32 throughput   : %9.2f GB/s\n",  gigabytes_per_second(4 * size * iterations, total_time_us));
107    printf("      quantized throughput : %9.2f GB/s\n",  gigabytes_per_second(q_size * iterations, total_time_us));
108}
109
110static void usage(char * argv[]) {
111    printf("Benchmark quantization specific functions on synthetic data\n");
112    printf("\n");
113    printf("usage: %s [options]\n", argv[0]);
114    printf("\n");
115    printf("options: (default)\n");
116    printf("  -h, --help            show this help message and exit\n");
117    printf("  --size SIZE           set test size, divisible by 32 (L1_SIZE:%d)\n", L1_SIZE);
118    printf("  -3                    use size as L1, L2, L3 sizes (L1:%d L2:%d L3:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE);
119    printf("  -4                    use size as L1, L2, L3, MEM sizes (L1:%d L2:%d L3:%d MEM:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE, MEM_SIZE);
120    printf("  --op OP               set test operation as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n");
121    printf("                        quantize_row_q_dot, vec_dot_q (all)\n");
122    printf("  --type TYPE           set test type as");
123    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
124        ggml_type type = (ggml_type) i;
125        const auto * qfns     = ggml_get_type_traits(type);
126        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
127        if (ggml_type_name(type) != NULL) {
128            if (qfns_cpu->from_float && qfns->to_float) {
129                printf(" %s", ggml_type_name(type));
130            }
131        }
132    }
133    printf(" (all)\n");
134    printf("  --alignment-offset OFFSET\n");
135    printf("                        set alignment offset as OFFSET (0)\n");
136    printf("  -i NUM, --iterations NUM\n");
137    printf("                        set test iteration number (%d)\n", ITERATIONS);
138}
139
140int main(int argc, char * argv[]) {
141    quantize_perf_params params {};
142
143    // read command line
144
145    bool invalid_param = false;
146    std::string arg;
147    for (int i = 1; i < argc; i++) {
148        arg = argv[i];
149
150        if (arg == "--size") {
151            if (++i >= argc) {
152                invalid_param = true;
153                break;
154            }
155            size_t size = std::stoi(argv[i]);
156            if (size % 32 != 0) {
157                fprintf(stderr, "error: size %zu not divisible by 32\n", size);
158                invalid_param = true;
159                break;
160            }
161            params.test_sizes.push_back(size);
162        } else if (arg == "-3") {
163            // quick select sizes that probably fit in CPU caches
164            params.test_sizes.push_back(L1_SIZE);
165            params.test_sizes.push_back(L2_SIZE);
166            params.test_sizes.push_back(L3_SIZE);
167        } else if (arg == "-4") {
168            // quick select cache sizes + memory
169            params.test_sizes.push_back(L1_SIZE);
170            params.test_sizes.push_back(L2_SIZE);
171            params.test_sizes.push_back(L3_SIZE);
172            params.test_sizes.push_back(MEM_SIZE);
173        } else if (arg == "--op") {
174            if (++i >= argc) {
175                invalid_param = true;
176                break;
177            }
178            std::string op {argv[i]};
179            if (op == "quantize_row_q_reference") {
180                params.op_quantize_row_q_reference = true;
181            } else if (op == "quantize_row_q") {
182                params.op_quantize_row_q = true;
183            } else if (op == "dequantize_row_q") {
184                params.op_dequantize_row_q = true;
185            } else if (op == "quantize_row_q_dot") {
186                params.op_quantize_row_q_dot = true;
187            } else if (op == "vec_dot_q") {
188                params.op_vec_dot_q = true;
189            } else {
190                invalid_param = true;
191                break;
192            }
193        } else if (arg == "--type") {
194            if (++i >= argc) {
195                invalid_param = true;
196                break;
197            }
198            params.include_types.push_back(argv[i]);
199        } else if (arg == "--alignment-offset") {
200            if (++i >= argc) {
201                invalid_param = true;
202                break;
203            }
204            int alignment = std::stoi(argv[i]);
205            if (alignment < 0 || alignment > MAX_ALIGNMENT) {
206            fprintf(stderr, "error: alignment-offset must be less than %d\n", MAX_ALIGNMENT);
207                invalid_param = true;
208                break;
209            }
210            params.alignment_offset = alignment;
211        } else if ((arg == "-i") || (arg == "--iterations")) {
212            if (++i >= argc) {
213                invalid_param = true;
214                break;
215            }
216            int number = std::stoi(argv[i]);
217            if (number < 0 || number > MAX_ITERATIONS) {
218            fprintf(stderr, "error: iterations must be less than %d\n", MAX_ITERATIONS);
219                invalid_param = true;
220                break;
221            }
222            params.iterations = number;
223        } else if ((arg == "-h") || (arg == "--help")) {
224            usage(argv);
225            return 1;
226        } else {
227            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
228            return 1;
229        }
230    }
231    if (invalid_param) {
232        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
233        return 1;
234    }
235
236    if (params.test_sizes.empty()) {
237        params.test_sizes.push_back(L1_SIZE);
238    }
239    if (!(params.op_quantize_row_q_reference || params.op_quantize_row_q || params.op_dequantize_row_q || params.op_quantize_row_q_dot || params.op_vec_dot_q)) {
240        params.op_quantize_row_q_reference = params.op_quantize_row_q = params.op_dequantize_row_q = params.op_quantize_row_q_dot = params.op_vec_dot_q = true;
241    }
242
243    std::sort(params.test_sizes.begin(), params.test_sizes.end());
244    size_t largest = params.test_sizes.back();
245
246    std::vector<uint8_t> test_data1_v(largest*4 + MAX_ALIGNMENT*2);
247    std::vector<uint8_t> test_data2_v(largest*4 + MAX_ALIGNMENT*2);
248    std::vector<uint8_t> test_q1_v   (largest*4 + MAX_ALIGNMENT*2);
249    std::vector<uint8_t> test_q2_v   (largest*4 + MAX_ALIGNMENT*2);
250    std::vector<uint8_t> test_out_v  (largest*4 + MAX_ALIGNMENT*2);
251
252    float * test_data1 = (float *) align_with_offset(test_data1_v.data(), params.alignment_offset);
253    float * test_data2 = (float *) align_with_offset(test_data2_v.data(), params.alignment_offset);
254    float * test_q1    = (float *) align_with_offset(test_q1_v.data(),    params.alignment_offset);
255    float * test_q2    = (float *) align_with_offset(test_q2_v.data(),    params.alignment_offset);
256    float * test_out   = (float *) align_with_offset(test_out_v.data(),   params.alignment_offset);
257
258    generate_data(0, largest, test_data1);
259    generate_data(1, largest, test_data2);
260
261    int64_t iterations = params.iterations;
262
263    ggml_cpu_init();
264
265    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
266        ggml_type type = (ggml_type) i;
267        const auto * qfns = ggml_get_type_traits(type);
268        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
269        if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
270            continue;
271        }
272
273        if (qfns_cpu->from_float && qfns->to_float) {
274            printf("%s\n", ggml_type_name(type));
275
276            ggml_quantize_init(type);
277
278            if (params.op_quantize_row_q_reference) {
279                printf("  quantize_row_q_reference\n");
280                for (size_t size : params.test_sizes) {
281                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
282                    auto quantize_fn = [&](void) -> float {
283                        qfns->from_float_ref(test_data1, test_q1, size);
284                        return test_q1[0];
285                    };
286                    size_t quantized_size = ggml_row_size(type, size);
287                    benchmark_function(size, quantized_size, iterations, quantize_fn);
288                }
289                printf("\n");
290            }
291
292            if (params.op_quantize_row_q) {
293                printf("  quantize_row_q\n");
294                for (size_t size : params.test_sizes) {
295                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
296                    auto quantize_fn = [&](void) -> float {
297                        qfns_cpu->from_float(test_data1, test_q1, size);
298                        return test_q1[0];
299                    };
300                    size_t quantized_size = ggml_row_size(type, size);
301                    benchmark_function(size, quantized_size, iterations, quantize_fn);
302                }
303                printf("\n");
304            }
305
306            if (params.op_dequantize_row_q) {
307                printf("  dequantize_row_q\n");
308                qfns_cpu->from_float(test_data1, test_q1, largest);
309                for (size_t size : params.test_sizes) {
310                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
311                    auto quantize_fn = [&](void) -> float {
312                        qfns->to_float(test_q1, test_out, size);
313                        return test_out[0];
314                    };
315                    size_t quantized_size = ggml_row_size(type, size);
316                    benchmark_function(size, quantized_size, iterations, quantize_fn);
317                }
318                printf("\n");
319            }
320
321            if (params.op_quantize_row_q_dot) {
322                printf("  quantize_row_q_dot\n");
323                for (size_t size : params.test_sizes) {
324                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
325                    auto quantize_fn = [&](void) -> float {
326                        const auto * vdot = ggml_get_type_traits_cpu(qfns_cpu->vec_dot_type);
327                        vdot->from_float(test_data1, test_q1, size);
328                        return test_q1[0];
329                    };
330                    size_t quantized_size = ggml_row_size(type, size);
331                    benchmark_function(size, quantized_size, iterations, quantize_fn);
332                }
333                printf("\n");
334            }
335
336            if (params.op_vec_dot_q) {
337                printf("  vec_dot_q\n");
338                qfns_cpu->from_float(test_data1, test_q1, largest);
339                qfns_cpu->from_float(test_data2, test_q2, largest);
340                for (size_t size : params.test_sizes) {
341                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
342                    auto quantize_fn = [&](void) -> float {
343                        float result;
344                        qfns_cpu->vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
345                        return result;
346                    };
347                    size_t quantized_size = ggml_row_size(type, size);
348                    benchmark_function(size, quantized_size, iterations, quantize_fn);
349                }
350                printf("\n");
351            }
352        }
353    }
354
355    return 0;
356}