#include "ggml.h"
#include "ggml-cpu.h"

#include <algorithm>
#include <cassert>
#include <cerrno>
#include <chrono>
#include <climits>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <thread>
#include <vector>
11
12#define MAX_NARGS 2
13
14static void test_barrier(int n_threads, int n_rounds) {
15 struct ggml_init_params params = {
16 /* .mem_size = */ 1024*1024*1024,
17 /* .mem_buffer = */ NULL,
18 /* .no_alloc = */ false,
19 };
20
21 struct ggml_context * ctx = ggml_init(params);
22
23 // Create graph
24 struct ggml_cgraph * gf = ggml_new_graph(ctx);
25
26 // Lots of small, parallel ops where barriers in between will dominate
27 struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
28 for (int i = 0; i < 1000; i++) {
29 struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
30 out = ggml_mul_mat(ctx, a, out);
31
32 struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
33 out = ggml_mul_mat(ctx, d, out);
34 }
35
36 ggml_build_forward_expand(gf, out);
37 int n_nodes = ggml_graph_n_nodes(gf);
38
39 // Create threadpool
40 struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
41 struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
42 if (!threadpool) {
43 fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
44 exit(1);
45 }
46
47 // The test runs with constant number of threads
48 struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, threadpool);
49
50 std::vector<uint8_t> work_data(cplan.work_size);
51 cplan.work_data = work_data.data();
52
53 std::cerr << "graph-compute with"
54 << "\n n_threads: " << n_threads
55 << "\n n_nodes: " << n_nodes
56 << "\n n_rounds: " << n_rounds
57 << "\n";
58 // ggml_graph_print(gf);
59
60 // Warmup
61 ggml_graph_compute(gf, &cplan);
62
63 auto t0 = std::chrono::high_resolution_clock::now();
64
65 for (int i=0; i < n_rounds; i++) {
66 ggml_graph_compute(gf, &cplan);
67 }
68
69 auto t1 = std::chrono::high_resolution_clock::now();
70
71 auto usec = std::chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
72 auto nsec = std::chrono::duration_cast<std::chrono::nanoseconds>(t1-t0).count();
73 std::cerr << "graph-compute took " << usec << " usec "
74 << "\n " << (float) usec / n_rounds << " usec per-iter"
75 << "\n " << (float) nsec / (n_rounds * n_nodes) << " nsec per-node"
76 << "\n";
77
78 ggml_threadpool_free(threadpool);
79 ggml_free(ctx);
80}
81
82static void test_active(int n_threads, int n_rounds) {
83 struct ggml_init_params params = {
84 /* .mem_size = */ 1024*1024*1024,
85 /* .mem_buffer = */ NULL,
86 /* .no_alloc = */ false,
87 };
88
89 struct ggml_context * ctx = ggml_init(params);
90
91 // Create graph
92 struct ggml_cgraph * gf = ggml_new_graph(ctx);
93
94 // Small graph with, parallel ops with barriers
95 struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
96 for (int i = 0; i < 2; i++) {
97 struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
98 out = ggml_mul_mat(ctx, a, out);
99
100 struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
101 out = ggml_mul_mat(ctx, d, out);
102 }
103
104 ggml_build_forward_expand(gf, out);
105 int n_nodes = ggml_graph_n_nodes(gf);
106
107 // Create threadpool
108 struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
109 struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
110 if (!threadpool) {
111 fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
112 exit(1);
113 }
114
115 std::cerr << "graph-compute with"
116 << "\n n_threads: " << n_threads
117 << "\n n_nodes: " << n_nodes
118 << "\n n_rounds: " << n_rounds
119 << "\n";
120 // ggml_graph_print(gf);
121
122 // In this test we keep changing the number of threads every 4th iteration
123 // to test for race conditions in that path
124
125 for (int i=0; i < n_rounds; i++) {
126 struct ggml_cplan cplan = ggml_graph_plan(gf, (i % 4) == 0 ? 1 : n_threads, threadpool);
127
128 std::vector<uint8_t> work_data(cplan.work_size);
129 cplan.work_data = work_data.data();
130
131 ggml_graph_compute(gf, &cplan);
132 }
133
134 ggml_threadpool_free(threadpool);
135 ggml_free(ctx);
136}
137
138static void test_multi_graph(int n_threads, int n_rounds) {
139 struct ggml_init_params params = {
140 /* .mem_size = */ 1024*1024*1024,
141 /* .mem_buffer = */ NULL,
142 /* .no_alloc = */ false,
143 };
144
145 struct ggml_context * ctx = ggml_init(params);
146
147 // Create graphs
148 struct ggml_cgraph * gf0 = ggml_new_graph(ctx);
149 {
150 // Small graph with parallel ops with barriers
151 struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
152 for (int i = 0; i < 2; i++) {
153 struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
154 out = ggml_mul_mat(ctx, a, out);
155
156 struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
157 out = ggml_mul_mat(ctx, d, out);
158 }
159
160 ggml_build_forward_expand(gf0, out);
161 }
162
163 struct ggml_cgraph * gf1 = ggml_new_graph(ctx);
164 {
165 // Small graph with parallel ops with barriers
166 // Use larger tensors to make sure work_data size is larger than gf0
167 struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 256);
168 for (int i = 0; i < 4; i++) {
169 struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 256, 128);
170 out = ggml_mul_mat(ctx, a, out);
171
172 struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 256);
173 out = ggml_mul_mat(ctx, d, out);
174 }
175
176 ggml_build_forward_expand(gf1, out);
177 }
178
179
180 // Create threadpool
181 struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
182 struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
183 if (!threadpool) {
184 fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
185 exit(1);
186 }
187
188 std::cerr << "graph-compute with"
189 << "\n gf0 n_nodes: " << ggml_graph_n_nodes(gf0)
190 << "\n gf1 n_nodes: " << ggml_graph_n_nodes(gf1)
191 << "\n n_threads: " << n_threads
192 << "\n n_rounds: " << n_rounds
193 << "\n";
194
195 // In this test we keep changing the number of threads every 4th iteration
196 // and we compute two graphs back to back to test graph frequent graph switching
197
198 for (int i=0; i < n_rounds; i++) {
199 struct ggml_cplan cplan0 = ggml_graph_plan(gf0, (i % 4) == 0 ? 1 : n_threads, threadpool);
200 std::vector<uint8_t> work_data0(cplan0.work_size);
201 cplan0.work_data = work_data0.data();
202
203 struct ggml_cplan cplan1 = ggml_graph_plan(gf1, (i % 4) == 0 ? 1 : n_threads, threadpool);
204 std::vector<uint8_t> work_data1(cplan1.work_size);
205 cplan1.work_data = work_data1.data();
206
207 ggml_graph_compute(gf0, &cplan0);
208 ggml_graph_compute(gf1, &cplan1);
209 }
210
211 ggml_threadpool_free(threadpool);
212 ggml_free(ctx);
213}
214
215
216int main(int argc, char *argv[]) {
217
218 int n_threads = std::max(1, std::min(4, (int) std::thread::hardware_concurrency()));
219 int n_rounds = 100;
220
221 if (argc > 1) {
222 n_threads = std::atoi(argv[1]);
223 }
224
225 if (argc > 2) {
226 n_rounds = std::atoi(argv[2]);
227 }
228
229 test_barrier(n_threads, n_rounds);
230
231 test_active(n_threads, n_rounds * 100);
232
233 test_multi_graph(n_threads, n_rounds * 10);
234
235 return 0;
236}