#include "ggml.h"
#include "ggml-cpu.h"

#include <algorithm>
#include <cassert>
#include <cerrno>
#include <chrono>
#include <climits>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <thread>
#include <vector>
 11
 12#define MAX_NARGS 2
 13
 14static void test_barrier(int n_threads, int n_rounds) {
 15    struct ggml_init_params params = {
 16        /* .mem_size   = */ 1024*1024*1024,
 17        /* .mem_buffer = */ NULL,
 18        /* .no_alloc   = */ false,
 19    };
 20
 21    struct ggml_context * ctx = ggml_init(params);
 22
 23    // Create graph
 24    struct ggml_cgraph * gf = ggml_new_graph(ctx);
 25
 26    // Lots of small, parallel ops where barriers in between will dominate
 27    struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,  64);
 28    for (int i = 0; i < 1000; i++) {
 29        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
 30        out = ggml_mul_mat(ctx, a, out);
 31
 32        struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
 33        out = ggml_mul_mat(ctx, d, out);
 34    }
 35
 36    ggml_build_forward_expand(gf, out);
 37    int n_nodes = ggml_graph_n_nodes(gf);
 38
 39    // Create threadpool
 40    struct ggml_threadpool_params tpp  = ggml_threadpool_params_default(n_threads);
 41    struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
 42    if (!threadpool) {
 43        fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
 44        exit(1);
 45    }
 46
 47    // The test runs with constant number of threads
 48    struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, threadpool);
 49
 50    std::vector<uint8_t> work_data(cplan.work_size);
 51    cplan.work_data = work_data.data();
 52
 53    std::cerr << "graph-compute with"
 54              << "\n n_threads: " << n_threads
 55              << "\n   n_nodes: " << n_nodes
 56              << "\n  n_rounds: " << n_rounds
 57              << "\n";
 58    // ggml_graph_print(gf);
 59
 60    // Warmup
 61    ggml_graph_compute(gf, &cplan);
 62
 63    auto t0 = std::chrono::high_resolution_clock::now();
 64
 65    for (int i=0; i < n_rounds; i++) {
 66        ggml_graph_compute(gf, &cplan);
 67    }
 68
 69    auto t1 = std::chrono::high_resolution_clock::now();
 70
 71    auto usec = std::chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
 72    auto nsec = std::chrono::duration_cast<std::chrono::nanoseconds>(t1-t0).count();
 73    std::cerr << "graph-compute took " << usec << " usec "
 74              << "\n " << (float) usec / n_rounds << " usec per-iter"
 75              << "\n " << (float) nsec / (n_rounds * n_nodes) << " nsec per-node"
 76              << "\n";
 77
 78    ggml_threadpool_free(threadpool);
 79    ggml_free(ctx);
 80}
 81
 82static void test_active(int n_threads, int n_rounds) {
 83    struct ggml_init_params params = {
 84        /* .mem_size   = */ 1024*1024*1024,
 85        /* .mem_buffer = */ NULL,
 86        /* .no_alloc   = */ false,
 87    };
 88
 89    struct ggml_context * ctx = ggml_init(params);
 90
 91    // Create graph
 92    struct ggml_cgraph * gf = ggml_new_graph(ctx);
 93
 94    // Small graph with, parallel ops with barriers
 95    struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,  64);
 96    for (int i = 0; i < 2; i++) {
 97        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
 98        out = ggml_mul_mat(ctx, a, out);
 99
100        struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
101        out = ggml_mul_mat(ctx, d, out);
102    }
103
104    ggml_build_forward_expand(gf, out);
105    int n_nodes = ggml_graph_n_nodes(gf);
106
107    // Create threadpool
108    struct ggml_threadpool_params tpp  = ggml_threadpool_params_default(n_threads);
109    struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
110    if (!threadpool) {
111        fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
112        exit(1);
113    }
114
115    std::cerr << "graph-compute with"
116              << "\n n_threads: " << n_threads
117              << "\n   n_nodes: " << n_nodes
118              << "\n  n_rounds: " << n_rounds
119              << "\n";
120    // ggml_graph_print(gf);
121
122    // In this test we keep changing the number of threads every 4th iteration
123    // to test for race conditions in that path
124
125    for (int i=0; i < n_rounds; i++) {
126        struct ggml_cplan cplan = ggml_graph_plan(gf, (i % 4) == 0 ? 1 : n_threads, threadpool);
127
128        std::vector<uint8_t> work_data(cplan.work_size);
129        cplan.work_data = work_data.data();
130
131        ggml_graph_compute(gf, &cplan);
132    }
133
134    ggml_threadpool_free(threadpool);
135    ggml_free(ctx);
136}
137
138static void test_multi_graph(int n_threads, int n_rounds) {
139    struct ggml_init_params params = {
140        /* .mem_size   = */ 1024*1024*1024,
141        /* .mem_buffer = */ NULL,
142        /* .no_alloc   = */ false,
143    };
144
145    struct ggml_context * ctx = ggml_init(params);
146
147    // Create graphs
148    struct ggml_cgraph * gf0 = ggml_new_graph(ctx);
149    {
150        // Small graph with parallel ops with barriers
151        struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,  64);
152        for (int i = 0; i < 2; i++) {
153            struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
154            out = ggml_mul_mat(ctx, a, out);
155
156            struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
157            out = ggml_mul_mat(ctx, d, out);
158        }
159
160        ggml_build_forward_expand(gf0, out);
161    }
162
163    struct ggml_cgraph * gf1 = ggml_new_graph(ctx);
164    {
165        // Small graph with parallel ops with barriers
166        // Use larger tensors to make sure work_data size is larger than gf0
167        struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,  256);
168        for (int i = 0; i < 4; i++) {
169            struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 256, 128);
170            out = ggml_mul_mat(ctx, a, out);
171
172            struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 256);
173            out = ggml_mul_mat(ctx, d, out);
174        }
175
176        ggml_build_forward_expand(gf1, out);
177    }
178
179
180    // Create threadpool
181    struct ggml_threadpool_params tpp  = ggml_threadpool_params_default(n_threads);
182    struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
183    if (!threadpool) {
184        fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
185        exit(1);
186    }
187
188    std::cerr << "graph-compute with"
189              << "\n gf0 n_nodes: " << ggml_graph_n_nodes(gf0)
190              << "\n gf1 n_nodes: " << ggml_graph_n_nodes(gf1)
191              << "\n   n_threads: " << n_threads
192              << "\n    n_rounds: " << n_rounds
193              << "\n";
194
195    // In this test we keep changing the number of threads every 4th iteration
196    // and we compute two graphs back to back to test graph frequent graph switching
197
198    for (int i=0; i < n_rounds; i++) {
199        struct ggml_cplan cplan0 = ggml_graph_plan(gf0, (i % 4) == 0 ? 1 : n_threads, threadpool);
200        std::vector<uint8_t> work_data0(cplan0.work_size);
201        cplan0.work_data = work_data0.data();
202
203        struct ggml_cplan cplan1 = ggml_graph_plan(gf1, (i % 4) == 0 ? 1 : n_threads, threadpool);
204        std::vector<uint8_t> work_data1(cplan1.work_size);
205        cplan1.work_data = work_data1.data();
206
207        ggml_graph_compute(gf0, &cplan0);
208        ggml_graph_compute(gf1, &cplan1);
209    }
210
211    ggml_threadpool_free(threadpool);
212    ggml_free(ctx);
213}
214
215
216int main(int argc, char *argv[]) {
217
218    int n_threads = std::max(1, std::min(4, (int) std::thread::hardware_concurrency()));
219    int n_rounds  = 100;
220
221    if (argc > 1) {
222        n_threads = std::atoi(argv[1]);
223    }
224
225    if (argc > 2) {
226        n_rounds  = std::atoi(argv[2]);
227    }
228
229    test_barrier(n_threads, n_rounds);
230
231    test_active(n_threads,  n_rounds * 100);
232
233    test_multi_graph(n_threads,  n_rounds * 10);
234
235    return 0;
236}