#define CL_TARGET_OPENCL_VERSION GGML_OPENCL_TARGET_VERSION
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS

// suppress warnings in CL headers for GCC and Clang
#pragma GCC diagnostic ignored "-Woverlength-strings"
#ifdef __clang__
#pragma GCC diagnostic ignored "-Wgnu-anonymous-struct"
#endif

#include "ggml-opencl.h"
#include "ggml-backend.h"
#include "ggml-impl.h"
#include "ggml-backend-impl.h"
#include "ggml.h"

#include <CL/cl.h>

#include <inttypes.h>
#include <string.h>

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <limits>
#include <string_view>
#include <vector>
#include <string>
#include <cmath>
#include <map>
#include <memory>
#include <charconv>
#include <mutex>

#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))

#define UNUSED(x) (void)(x)

#define CL_CHECK(err)                                                   \
    do {                                                                \
        cl_int err_ = (err);                                            \
        if (err_ != CL_SUCCESS) {                                       \
            GGML_LOG_ERROR("ggml_opencl: %s error %d at %s:%d\n",       \
                #err, err_, __FILE__, __LINE__);                        \
            GGML_ASSERT(0);                                             \
        }                                                               \
    } while (0)
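
// Usage: CL_CHECK(clFinish(queue));
// On any status other than CL_SUCCESS this logs the failing expression with
// file/line and aborts via GGML_ASSERT.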

//------------------------------------------------------------------------------
// OpenCL
//------------------------------------------------------------------------------

bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor);

// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
// Precompute mp (m' in the paper) and L such that division
// can be computed using a multiply (high 32b of 64b result)
// and a shift:
//
// n/d = (mulhi(n, mp) + n) >> L;
struct fastdiv_vals {
    uint32_t mp;
    uint32_t L;
    uint32_t d;
    uint32_t pad;
};
static_assert(sizeof(fastdiv_vals) == 16, "fastdiv_vals size incorrect");

static fastdiv_vals init_fastdiv_values(uint64_t d_64) {
    GGML_ASSERT(d_64 != 0);
    GGML_ASSERT(d_64 <= std::numeric_limits<uint32_t>::max());

    uint32_t d = (uint32_t)d_64;

    // compute L = ceil(log2(d));
    uint32_t L = 0;
    while (L < 32 && (uint32_t{ 1 } << L) < d) {
        L++;
    }

    uint32_t mp = (uint32_t) ((uint64_t{ 1 } << 32) * ((uint64_t{ 1 } << L) - d) / d + 1);
    // pack divisor as well to reduce error surface
    return { mp, L, d, 0 };
}
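
// Host-side sketch of how a kernel would consume fastdiv_vals (illustrative
// only; fastdiv_apply is a hypothetical helper, not used by the backend).
// The add and shift are done in 64 bits so hi + n cannot wrap.
static inline uint32_t fastdiv_apply(uint32_t n, const fastdiv_vals & v) {
    uint64_t hi = ((uint64_t) n * v.mp) >> 32; // mulhi(n, mp)
    return (uint32_t) ((hi + n) >> v.L);       // == n / v.d
}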

enum GPU_FAMILY {
    ADRENO,
    INTEL,
    UNKNOWN,
};

enum ADRENO_GPU_GEN {
    ADRENO_UNKNOWN,
    A7X,
    A8X,
    X1E,
};

enum ADRENO_CL_COMPILER_TYPE {
    E031,
    DX,
};

struct ggml_cl_version {
    cl_uint major = 0;
    cl_uint minor = 0;
};


struct ggml_cl_compiler_version {
    ADRENO_CL_COMPILER_TYPE type;
    int major = -1;
    int minor = -1;
    int patch = -1;

    bool same(ADRENO_CL_COMPILER_TYPE t, int x, int y, int z) const {
        return major == x && minor == y && patch == z && type == t;
    }
    bool newer_than(ADRENO_CL_COMPILER_TYPE t, int x, int y, int z) const {
        return major*10000 + minor*100 + patch > x*10000 + y*100 + z && type == t;
    }
    bool newer_than_or_same(ADRENO_CL_COMPILER_TYPE t, int x, int y, int z) const {
        return same(t, x, y, z) || newer_than(t, x, y, z);
    }
};

static size_t align_to(size_t value, size_t to_alignment) {
    GGML_ASSERT(to_alignment && "Invalid alignment (must be non-zero)");
    GGML_ASSERT((to_alignment & (to_alignment - 1)) == 0 && "to_alignment must be power-of-two");

    return ((value + to_alignment - 1) / to_alignment) * to_alignment;
}
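// (Example: align_to(100, 64) == 128 and align_to(128, 64) == 128.)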


// Parses a version string of the form "XX.YY ". On error, returns a ggml_cl_version with all zeroes.
static ggml_cl_version parse_cl_version(std::string_view str) {
    size_t major_str_begin = 0;
    size_t major_str_end   = str.find(".", major_str_begin);
    if (major_str_end == std::string::npos) {
        return {};
    }

    size_t minor_str_begin = major_str_end + 1;
    size_t minor_str_end   = str.find(" ", minor_str_begin);
    if (minor_str_end == std::string::npos) {
        return {};
    }

    cl_uint version_major;
    if (std::from_chars(str.data() + major_str_begin, str.data() + major_str_end, version_major).ec != std::errc{}) {
        return {};
    }

    cl_uint version_minor;
    if (std::from_chars(str.data() + minor_str_begin, str.data() + minor_str_end, version_minor).ec != std::errc{}) {
        return {};
    }
    return { version_major, version_minor };
}
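// (Example: parse_cl_version("3.0 ") yields {3, 0}; input missing the '.' or
// the trailing ' ' separator yields the all-zero value.)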

// Returns the OpenCL platform's version. On error, returns a ggml_cl_version with all zeroes.
static ggml_cl_version get_opencl_platform_version(cl_platform_id platform) {
    size_t param_size;
    CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, nullptr, &param_size));
    std::unique_ptr<char[]> param_storage(new char[param_size]);
    CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, param_size, param_storage.get(), nullptr));

    auto param_value = std::string_view(param_storage.get(), param_size);
    const std::string version_prefix = "OpenCL "; // Suffix: "XX.YY <platform-specific-info>"
    if (param_value.find(version_prefix) != 0) {
        return {};
    }
    param_value.remove_prefix(version_prefix.length());
    return parse_cl_version(param_value);
}

// Returns a version to use in OpenCL C compilation. On error, returns a ggml_cl_version with all zeroes.
static ggml_cl_version get_opencl_c_version(ggml_cl_version platform_version, cl_device_id device) {
    size_t param_size;

#if CL_TARGET_OPENCL_VERSION >= 300
    if (platform_version.major >= 3) {
        CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, 0, nullptr, &param_size));
        if (!param_size) {
            return {};
        }

        std::unique_ptr<cl_name_version[]> versions(new cl_name_version[param_size]);
        CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, param_size, versions.get(), nullptr));
        unsigned versions_count = param_size / sizeof(cl_name_version);

        cl_version version_max = 0;
        for (unsigned i = 0; i < versions_count; i++) {
            version_max = std::max<cl_version>(versions[i].version, version_max);
        }

        return { CL_VERSION_MAJOR(version_max), CL_VERSION_MINOR(version_max) };
    }
#else
    GGML_UNUSED(platform_version);
#endif // CL_TARGET_OPENCL_VERSION >= 300

    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, 0, nullptr, &param_size));
    if (!param_size) {
        return {};
    }

    std::unique_ptr<char[]> param_storage(new char[param_size]);
    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, param_size, param_storage.get(), nullptr));
    auto param_value = std::string_view(param_storage.get(), param_size);

    const std::string version_prefix = "OpenCL C "; // Suffix: "XX.YY <platform-specific-info>"
    if (param_value.find(version_prefix) != 0) {
        return {};
    }
    param_value.remove_prefix(version_prefix.length());

    return parse_cl_version(param_value);
}

static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) {
    if (strstr(device_name, "730") ||
        strstr(device_name, "740") ||
        strstr(device_name, "750")) {
        return ADRENO_GPU_GEN::A7X;
    }

    if (strstr(device_name, "830") ||
        strstr(device_name, "840")) {
        return ADRENO_GPU_GEN::A8X;
    }

    if (strstr(device_name, "X1")) {
        return ADRENO_GPU_GEN::X1E;
    }

    return ADRENO_GPU_GEN::ADRENO_UNKNOWN;
}

static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *driver_version) {
    std::string driver_ver_str(driver_version);
    ADRENO_CL_COMPILER_TYPE type = ADRENO_CL_COMPILER_TYPE::E031;
    size_t compiler_ver_pos = driver_ver_str.find("E031");
    size_t compiler_ver_len = 13;
    size_t compiler_major_offset = 5;
    size_t compiler_minor_offset = 8;
    size_t compiler_patch_offset = 11;

    if (compiler_ver_pos == std::string::npos) {
        compiler_ver_pos = driver_ver_str.find("DX");
        if (compiler_ver_pos == std::string::npos) {
            return {};
        }
        type = ADRENO_CL_COMPILER_TYPE::DX;
        compiler_ver_len = 11;
        compiler_major_offset = 3;
    }

    std::string compiler_ver_str = driver_ver_str.substr(compiler_ver_pos, compiler_ver_len);
    int major = std::atoi(compiler_ver_str.substr(compiler_major_offset, 2).c_str());
    int minor = std::atoi(compiler_ver_str.substr(compiler_minor_offset, 2).c_str());
    int patch = std::atoi(compiler_ver_str.substr(compiler_patch_offset, 2).c_str());
    return { type, major, minor, patch };
}
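// For example, a driver string containing "E031.38.01.00" (an illustrative
// value, not a real driver string) parses to { E031, 38, 1, 0 }.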

// cl buffer wrapper
struct ggml_cl_buffer {
    cl_mem buffer;
    size_t size;

    ggml_cl_buffer()
        : buffer(nullptr), size(0) {}

    ~ggml_cl_buffer() {
        if (buffer) {
            CL_CHECK(clReleaseMemObject(buffer));
        }
    }

    // Grow-only: reallocates only when new_size exceeds the current size.
    void allocate(cl_context context, size_t new_size) {
        if (new_size > size) {
            size = new_size;
            if (buffer) {
                CL_CHECK(clReleaseMemObject(buffer));
            }
            cl_int err;
            CL_CHECK((buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err));
        }
    }
};

// Profiling
struct ProfilingInfo {
    std::string op_name;
    std::string kernel_name;

    cl_kernel kernel;
    cl_event evt;

    cl_ulong cmd_queued;
    cl_ulong cmd_submit;
    cl_ulong cmd_start;
    cl_ulong cmd_end;
    cl_ulong overhead_start;
    cl_ulong overhead_end;
    // For the times below, see the spec for clGetEventProfilingInfo.
    // The time the kernel spent in the command queue - SUBMIT - QUEUED
    cl_ulong cmd_queued_duration_ns;
    // The time the kernel spent for submission - START - SUBMIT
    cl_ulong cmd_submit_duration_ns;
    // Kernel execution time in nanoseconds - END - START
    cl_ulong cmd_duration_ns;
    // The time for the kernel to complete - COMPLETE - END
    cl_ulong cmd_complete_duration_ns;
    // Total time to finish the kernel - COMPLETE - QUEUED
    cl_ulong cmd_total_duration_ns;
    // Global and local work sizes.
    size_t global_size[3];
    size_t local_size[3];
    // Op output size.
    size_t output_size[4];
};
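
// Per the OpenCL spec, the event timeline satisfies
// QUEUED <= SUBMIT <= START <= END <= COMPLETE, so each *_duration_ns field
// above is the difference of two adjacent timestamps.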

static void populateProfilingInfo(
        ProfilingInfo& info, cl_event evt, cl_kernel kernel, cl_uint work_dim,
        size_t global_size[3], size_t local_size[3],
        const ggml_tensor * tensor) {
    info.op_name = tensor->name;
    info.kernel  = kernel;
    info.evt     = evt;

    // 0 means not specified, e.g., 2D workgroup, or NULL for driver to choose
    info.local_size[0] = 0;
    info.local_size[1] = 0;
    info.local_size[2] = 0;

    info.global_size[0] = 0;
    info.global_size[1] = 0;
    info.global_size[2] = 0;

    if (local_size) {
        for (cl_uint i = 0; i < work_dim; ++i) {
            info.local_size[i] = local_size[i];
        }
    }

    for (cl_uint i = 0; i < work_dim; ++i) {
        info.global_size[i] = global_size[i];
    }

    info.output_size[0] = tensor->ne[0];
    info.output_size[1] = tensor->ne[1];
    info.output_size[2] = tensor->ne[2];
    info.output_size[3] = tensor->ne[3];
}

struct ggml_backend_opencl_context;

// backend device context
struct ggml_backend_opencl_device_context {
    cl_platform_id platform;
    std::string platform_name;

    cl_device_id device;
    std::string device_name;
    cl_device_type device_type;
    std::string device_version;

    // Initialized by ggml_cl2_init().
    ggml_backend_opencl_context * backend_ctx = nullptr;

    // Initialized by ggml_backend_opencl_device_get_buffer_type()
    ggml_backend_buffer_type buffer_type;

    cl_context context = nullptr;
};

// backend context
struct ggml_backend_opencl_context {
    int ref_count;

    cl_device_id device;
    std::string device_name;

    std::string driver_version;

    GPU_FAMILY gpu_family;
    ADRENO_GPU_GEN adreno_gen;

    cl_int alignment;
    size_t max_alloc_size;
    size_t max_workgroup_size;
    bool fp16_support;
    bool has_vector_subgroup_broadcast;
    bool disable_fusion;
    ggml_cl_compiler_version adreno_cl_compiler_version;

    int adreno_wave_size;

    cl_bool non_uniform_workgroups;
    size_t image_max_buffer_size;

    cl_context context;
    cl_command_queue queue;

    // prealloc buffers for transposing weights and activations
    ggml_cl_buffer prealloc_quant_trans;
    ggml_cl_buffer prealloc_scales_trans;
    ggml_cl_buffer prealloc_act_trans;

    // prealloc buffers for src0 and src1
    ggml_cl_buffer prealloc_src0;
    ggml_cl_buffer prealloc_src1;

    cl_program program_add;
    cl_program program_add_id;
    cl_program program_clamp;
    cl_program program_cpy;
    cl_program program_cvt;
    cl_program program_diag_mask_inf;
    cl_program program_gelu;
    cl_program program_gemv_noshuffle_general;
    cl_program program_gemv_noshuffle;
    cl_program program_get_rows;
    cl_program program_set_rows;
    cl_program program_glu;
    cl_program program_im2col_f16;
    cl_program program_im2col_f32;
    cl_program program_mul_mat_Ab_Bi_8x4;
    cl_program program_mul_mv_q4_0_f32;
    cl_program program_mul_mv_q4_0_f32_v;
    cl_program program_mul_mv_q4_0_f32_8x_flat;
    cl_program program_mul_mv_q4_0_f32_1d_8x_flat;
    cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
    cl_program program_mul_mv_q6_K;
    cl_program program_mul_mv_q8_0_f32, program_mul_mv_q8_0_f32_flat;
    cl_program program_mul_mv_mxfp4_f32;
    cl_program program_mul_mv_mxfp4_f32_flat;
    cl_program program_mul_mv_f16_f16;
    cl_program program_mul_mv_f16_f32_1row;
    cl_program program_mul_mv_f16_f32_l4;
    cl_program program_mul_mv_f16_f32;
    cl_program program_mul_mv_f32_f32;
    cl_program program_mul;
    cl_program program_mul_mat_f16_f32_tiled;
    cl_program program_mul_mm_f16_f32_kqv;
    cl_program program_mul_mm_f16_f32_kq;
    cl_program program_div;
    cl_program program_sub;
    cl_program program_norm;
    cl_program program_relu;
    cl_program program_rms_norm;
    cl_program program_group_norm;
    cl_program program_rope;
    cl_program program_silu;
    cl_program program_sigmoid;
    cl_program program_softmax_f32;
    cl_program program_softmax_f16;
    cl_program program_softmax_4_f32;
    cl_program program_softmax_4_f16;
    cl_program program_argsort_f32_i32;
    cl_program program_sum_rows_f32;
    cl_program program_pad;
    cl_program program_upscale;
    cl_program program_conv_2d_f16;
    cl_program program_conv_2d_f32;
    cl_program program_conv_2d_f16_f32;
    cl_program program_tsembd;
    cl_program program_gemv_moe_mxfp4_f32, program_gemm_moe_mxfp4_f32;
    cl_program program_mul_mv_id_q4_0_f32_8x_flat;
    cl_program program_mul_mv_id_q8_0_f32, program_mul_mv_id_q8_0_f32_flat;
    cl_program program_mul_mv_id_mxfp4_f32;
    cl_program program_mul_mv_id_mxfp4_f32_flat;
    cl_program program_mul_mm_f32_f32_l4_lm;
    cl_program program_mul_mm_f16_f32_l4_lm;
    cl_program program_mul_mm_q8_0_f32_l4_lm;
    cl_kernel kernel_add, kernel_add_row, kernel_add_f16, kernel_add_row_f16;
    cl_kernel kernel_mul, kernel_mul_row, kernel_mul_f16, kernel_mul_row_f16;
    cl_kernel kernel_div, kernel_div_row, kernel_div_f16, kernel_div_row_f16;
    cl_kernel kernel_sub, kernel_sub_row, kernel_sub_f16, kernel_sub_row_f16;
    cl_kernel kernel_add_id;
    cl_kernel kernel_scale_f32, kernel_scale_f32_4;
    cl_kernel kernel_sqr_cont_f32, kernel_sqr_cont_f32_4, kernel_sqr_cont_f16, kernel_sqr_cont_f16_4;
    cl_kernel kernel_sqrt_cont_f32, kernel_sqrt_cont_f32_4, kernel_sqrt_cont_f16, kernel_sqrt_cont_f16_4;
    cl_kernel kernel_mean_f32;
    cl_kernel kernel_silu, kernel_silu_4;
    cl_kernel kernel_gelu, kernel_gelu_4;
    cl_kernel kernel_gelu_erf, kernel_gelu_erf_4;
    cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
    cl_kernel kernel_relu;
    cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16;
    cl_kernel kernel_tri;
    cl_kernel kernel_fill;
    cl_kernel kernel_clamp;
    cl_kernel kernel_geglu, kernel_reglu, kernel_swiglu, kernel_swiglu_oai, kernel_geglu_erf, kernel_geglu_quick,
        kernel_geglu_f16, kernel_reglu_f16, kernel_swiglu_f16, kernel_geglu_erf_f16, kernel_geglu_quick_f16;
    cl_kernel kernel_norm, kernel_norm_mul_add;
    cl_kernel kernel_rms_norm, kernel_rms_norm_mul;
    cl_kernel kernel_group_norm, kernel_group_norm_mul_add;
    cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
    cl_kernel kernel_soft_max, kernel_soft_max_4;
    cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
    std::map<std::pair<int, int>, cl_kernel> kernels_flash_attn_f16;
    std::map<std::pair<int, int>, cl_kernel> kernels_flash_attn_f16_q1;
    std::map<std::pair<int, int>, cl_kernel> kernels_flash_attn_f32;
    std::map<std::pair<int, int>, cl_kernel> kernels_flash_attn_f32_q1;
    std::map<std::pair<int, int>, cl_kernel> kernels_flash_attn_f32_f16;
    std::map<std::pair<int, int>, cl_kernel> kernels_flash_attn_f32_f16_q1;
    std::map<std::pair<int, int>, int> kernels_flash_attn_bm;
    std::map<std::pair<int, int>, int> kernels_flash_attn_bn;
    cl_kernel kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0;
    cl_kernel kernel_set_rows_f32_i64, kernel_set_rows_f32_i32, kernel_set_rows_f16_i64, kernel_set_rows_f16_i32;
    cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
    cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
    cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32;
    cl_kernel kernel_mul_mat_f32_f32;
    cl_kernel kernel_mul_mat_f16_f16;
    cl_kernel kernel_mul_mat_f16_f32_1row;
    cl_kernel kernel_mul_mat_f16_f32;
    cl_kernel kernel_mul_mat_f16_f32_l4;
    cl_kernel kernel_mul_mat_f16_f32_tiled;
    cl_kernel kernel_mul_mm_f16_f32_kqv;
    cl_kernel kernel_mul_mm_f16_f32_kq;
    cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
    cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
    cl_kernel kernel_convert_block_mxfp4, kernel_convert_block_mxfp4_trans, kernel_restore_block_mxfp4, kernel_restore_block_mxfp4_trans;
    cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0, kernel_restore_block_q8_0_trans;
    cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
    cl_kernel kernel_convert_block_q4_0_noshuffle;
    cl_kernel kernel_restore_block_q4_0_noshuffle;
    cl_kernel kernel_convert_block_q6_K, kernel_restore_block_q6_K;
    cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
    cl_kernel kernel_mul_mv_q4_K_f32;
    cl_kernel kernel_mul_mv_q6_K_f32;
    cl_kernel kernel_mul_mv_q6_K_f32_flat;
    cl_kernel kernel_mul_mv_mxfp4_f32, kernel_mul_mv_mxfp4_f32_flat;
    cl_kernel kernel_mul_mv_q8_0_f32, kernel_mul_mv_q8_0_f32_flat;
    cl_kernel kernel_solve_tri_f32;
    cl_kernel kernel_im2col_f32, kernel_im2col_f16;
    cl_kernel kernel_argsort_f32_i32;
    cl_kernel kernel_sum_rows_f32;
    cl_kernel kernel_repeat_f32;
    cl_kernel kernel_pad;
    cl_kernel kernel_tanh_f32, kernel_tanh_f32_4, kernel_tanh_f32_nc;
    cl_kernel kernel_tanh_f16, kernel_tanh_f16_4, kernel_tanh_f16_nc;
    cl_kernel kernel_expm1_f32_nd;
    cl_kernel kernel_expm1_f16_nd;
    cl_kernel kernel_softplus_f32_nd;
    cl_kernel kernel_softplus_f16_nd;
    cl_kernel kernel_upscale;
    cl_kernel kernel_upscale_bilinear;
    cl_kernel kernel_concat_f32;
    cl_kernel kernel_conv_2d_f16;
    cl_kernel kernel_conv_2d_f32;
    cl_kernel kernel_conv_2d_f16_f32;
    cl_kernel kernel_ssm_conv_f32_f32, kernel_ssm_conv_f32_f32_4;
    cl_kernel kernel_timestep_embedding;
    cl_kernel kernel_gemv_moe_mxfp4_f32, kernel_gemm_moe_mxfp4_f32;
    cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
    cl_kernel kernel_mul_mv_id_q8_0_f32, kernel_mul_mv_id_q8_0_f32_flat;
    cl_kernel kernel_mul_mv_id_mxfp4_f32;
    cl_kernel kernel_mul_mv_id_mxfp4_f32_flat;
    cl_kernel kernel_mul_mm_f32_f32_l4_lm;
    cl_kernel kernel_mul_mm_f16_f32_l4_lm;
    cl_kernel kernel_mul_mm_q8_0_f32_l4_lm;
    cl_kernel kernel_mul_mm_q6_k_f32_l4_lm;

    std::vector<ProfilingInfo> profiling_info;

    void write_profiling_info() {
        FILE * fperf = fopen("cl_profiling.csv", "w");
        if (!fperf) {
            GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
            return;
        }

        // Populate profiling info
        for (ProfilingInfo & info : profiling_info) {
            cl_ulong cmd_queued;
            cl_ulong cmd_submit;
            cl_ulong cmd_start;
            cl_ulong cmd_end;
            cl_ulong cmd_complete;

            CL_CHECK(clWaitForEvents(1, &info.evt));
            CL_CHECK(clGetEventProfilingInfo(
                info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
            CL_CHECK(clGetEventProfilingInfo(
                info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
            CL_CHECK(clGetEventProfilingInfo(
                info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
            CL_CHECK(clGetEventProfilingInfo(
                info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
            CL_CHECK(clGetEventProfilingInfo(
                info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
            CL_CHECK(clReleaseEvent(info.evt));

            char kernel_name[512];
            CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
                sizeof(kernel_name), kernel_name, NULL));
            info.kernel_name = kernel_name;

            info.cmd_queued = cmd_queued;
            info.cmd_submit = cmd_submit;
            info.cmd_start  = cmd_start;
            info.cmd_end    = cmd_end;

            info.cmd_queued_duration_ns   = cmd_submit   - cmd_queued;
            info.cmd_submit_duration_ns   = cmd_start    - cmd_submit;
            info.cmd_duration_ns          = cmd_end      - cmd_start;
            info.cmd_complete_duration_ns = cmd_complete - cmd_end;
            info.cmd_total_duration_ns    = cmd_complete - cmd_queued;
        }

        // Dump a csv
        fprintf(fperf, "op name, kernel name, exec duration (ms), global size, local size, output size\n");
        for (const ProfilingInfo & info : profiling_info) {
            fprintf(fperf, "%s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
                info.op_name.c_str(), info.kernel_name.c_str(),
                info.cmd_duration_ns/1.e6f,
                info.global_size[0], info.global_size[1], info.global_size[2],
                info.local_size[0], info.local_size[1], info.local_size[2],
                info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
        }
        fclose(fperf);

        // Dump a simple chrome trace
        FILE* ftrace = fopen("cl_trace.json", "w");
        if (!ftrace) {
            GGML_LOG_ERROR("Failed to open cl_trace.json\n");
            return;
        }

        fprintf(ftrace, "[\n");
        for (const ProfilingInfo & info : profiling_info) {
            fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Host\"},\n",
                info.kernel_name.c_str(), info.cmd_queued/1000);
            fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Host\"},\n",
                info.kernel_name.c_str(), info.cmd_submit/1000);

            fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Device\"},\n",
                info.kernel_name.c_str(), info.cmd_start/1000);
            fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Device\"},\n",
                info.kernel_name.c_str(), info.cmd_end/1000);
        }
        fclose(ftrace);
    }

    size_t get_kernel_workgroup_size(cl_kernel kernel) const {
        size_t workgroup_size = 0;
        size_t ret_size = 0;
        CL_CHECK(
            clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
                sizeof(size_t), &workgroup_size, &ret_size));
        GGML_ASSERT(sizeof(size_t) == ret_size);
        return workgroup_size;
    }

    void enqueue_ndrange_kernel(cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) {
#ifdef GGML_OPENCL_PROFILING
        cl_event evt;
        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, &evt));

        profiling_info.emplace_back();
        populateProfilingInfo(profiling_info.back(), evt, kernel, work_dim, global_work_size, local_work_size, tensor);
#else
        GGML_UNUSED(tensor);
        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL));
#endif
    }

#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
    // Transpose kernels
    cl_program program_transpose;

    cl_kernel kernel_transpose_32;
    cl_kernel kernel_transpose_32_16;
    cl_kernel kernel_transpose_16;
    cl_kernel kernel_transpose_16_buf;
    cl_kernel kernel_transpose_16_4x1;

    // Gemm and Gemv related programs, kernels, etc
    cl_program program_CL_gemm;
    cl_program program_CL_gemv_general;
    cl_program program_CL_gemv_4096_1_11008;
    cl_program program_CL_gemv_4096_1_4096;
    cl_program program_CL_gemv_11008_1_4096;
    cl_program program_CL_gemv_32000_1_4096;
    cl_kernel CL_mul_mat_Ab_Bi_8x4;
    cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general;
    cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008;
    cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096;
    cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096;
    cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096;
    cl_kernel kernel_mul_mm_q8_0_f32_8x4;
    cl_kernel CL_mul_mat_vec_q8_0_f32;
#endif // GGML_OPENCL_USE_ADRENO_KERNELS

    void free() {
        ref_count--;
        if (ref_count == 0) {
#ifdef GGML_OPENCL_PROFILING
            write_profiling_info();
            profiling_info.clear();
#endif
        }
    }
};
711
712// All registered devices with a default device in the front.
713static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;
714
715inline std::string read_file(const std::string &path) {
716 std::ifstream ifs(path);
717 if (!ifs) {
718 return "";
719 }
720 std::string text;
721 ifs.seekg(0, std::ios::end);
722 text.resize(ifs.tellg());
723 ifs.seekg(0, std::ios::beg);
724 ifs.read(&text[0], text.size());
725 return text;
726}
727
728static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer, const std::string &compile_opts) {
729 cl_program p;
730 char *program_log;
731 size_t program_size;
732 size_t log_size;
733 int err;
734
735 program_size = strlen(program_buffer);
736
737 p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err);
738 if(err < 0) {
739 GGML_LOG_ERROR("OpenCL error creating program");
740 exit(1);
741 }
742
743 err = clBuildProgram(p, 0, NULL, compile_opts.c_str(), NULL, NULL);
744 if(err < 0) {
745 clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
746 program_log = (char*) malloc(log_size + 1);
747 program_log[log_size] = '\0';
748 clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL);
749 GGML_LOG_ERROR("ggml_opencl: kernel compile error:\n\n%s\n", program_log);
750 free(program_log);
751 exit(1);
752 }
753
754 return p;
755}
756
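// Each block in load_cl_kernels() below follows the same pattern: the kernel
// source is either embedded at build time (GGML_OPENCL_EMBED_KERNELS) or read
// from a .cl file at runtime, compiled with build_program_from_source(), and
// the individual kernels are then created with clCreateKernel(). The
// comma-operator form CL_CHECK((k = clCreateKernel(...), err)) performs the
// assignment first and then hands the resulting error code to CL_CHECK.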
static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_version opencl_c_version) {
    cl_int err;

    // compiler options for general kernels
    auto opencl_c_std =
        std::string("CL") + std::to_string(opencl_c_version.major) + "." + std::to_string(opencl_c_version.minor);
    std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
                               " -cl-mad-enable -cl-unsafe-math-optimizations"
                               " -cl-finite-math-only -cl-fast-relaxed-math";

    GGML_LOG_INFO("ggml_opencl: loading OpenCL kernels");

    // add
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "add.cl.h"
        };
#else
        const std::string kernel_src = read_file("add.cl");
#endif
        backend_ctx->program_add =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_add         = clCreateKernel(backend_ctx->program_add, "kernel_add", &err), err));
        CL_CHECK((backend_ctx->kernel_add_row     = clCreateKernel(backend_ctx->program_add, "kernel_add_row", &err), err));
        CL_CHECK((backend_ctx->kernel_add_f16     = clCreateKernel(backend_ctx->program_add, "kernel_add_f16", &err), err));
        CL_CHECK((backend_ctx->kernel_add_row_f16 = clCreateKernel(backend_ctx->program_add, "kernel_add_row_f16", &err), err));
        GGML_LOG_CONT(".");
    }

    // add_id
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "add_id.cl.h"
        };
#else
        const std::string kernel_src = read_file("add_id.cl");
#endif
        backend_ctx->program_add_id =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_add_id = clCreateKernel(backend_ctx->program_add_id, "kernel_add_id", &err), err));
        GGML_LOG_CONT(".");
    }

    // tri
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "tri.cl.h"
        };
#else
        const std::string kernel_src = read_file("tri.cl");
#endif
        cl_program prog =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_tri = clCreateKernel(prog, "kernel_tri_f32", &err), err));
        GGML_LOG_CONT(".");

        CL_CHECK(clReleaseProgram(prog));
    }

    // fill
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "fill.cl.h"
        };
#else
        const std::string kernel_src = read_file("fill.cl");
#endif
        cl_program prog =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_fill = clCreateKernel(prog, "kernel_fill_f32", &err), err));
        GGML_LOG_CONT(".");

        CL_CHECK(clReleaseProgram(prog));
    }

    // clamp
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "clamp.cl.h"
        };
#else
        const std::string kernel_src = read_file("clamp.cl");
#endif
        backend_ctx->program_clamp =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_clamp = clCreateKernel(backend_ctx->program_clamp, "kernel_clamp", &err), err));
        GGML_LOG_CONT(".");
    }

    // cpy
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "cpy.cl.h"
        };
#else
        const std::string kernel_src = read_file("cpy.cl");
#endif
        backend_ctx->program_cpy =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_cpy_f16_f16 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f16_f16", &err), err));
        CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f16_f32", &err), err));
        CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f32_f16", &err), err));
        CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f32_f32", &err), err));
        GGML_LOG_CONT(".");
    }

    // cvt
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "cvt.cl.h"
        };
#else
        const std::string kernel_src = read_file("cvt.cl");
#endif
        backend_ctx->program_cvt =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_noshuffle", &err), err));
        CL_CHECK((backend_ctx->kernel_restore_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0_noshuffle", &err), err));
        CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
        CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
        CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err));
        CL_CHECK((backend_ctx->kernel_convert_block_mxfp4_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4_trans", &err), err));
        CL_CHECK((backend_ctx->kernel_restore_block_mxfp4_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_mxfp4_trans", &err), err));
        CL_CHECK((backend_ctx->kernel_restore_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_mxfp4", &err), err));
        CL_CHECK((backend_ctx->kernel_convert_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q8_0", &err), err));
        CL_CHECK((backend_ctx->kernel_restore_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0", &err), err));
        CL_CHECK((backend_ctx->kernel_restore_block_q8_0_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0_trans", &err), err));
        CL_CHECK((backend_ctx->kernel_convert_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K", &err), err));
        CL_CHECK((backend_ctx->kernel_restore_block_q6_K = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q6_K", &err), err));
        GGML_LOG_CONT(".");
    }

    // diag_mask_inf
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "diag_mask_inf.cl.h"
        };
#else
        const std::string kernel_src = read_file("diag_mask_inf.cl");
#endif
        backend_ctx->program_diag_mask_inf =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_diag_mask_inf_8 = clCreateKernel(backend_ctx->program_diag_mask_inf, "kernel_diag_mask_inf_8", &err), err));
        CL_CHECK((backend_ctx->kernel_diag_mask_inf = clCreateKernel(backend_ctx->program_diag_mask_inf, "kernel_diag_mask_inf", &err), err));
        GGML_LOG_CONT(".");
    }

    // gelu
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "gelu.cl.h"
        };
#else
        const std::string kernel_src = read_file("gelu.cl");
#endif
        backend_ctx->program_gelu =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_gelu         = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu", &err), err));
        CL_CHECK((backend_ctx->kernel_gelu_4       = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_4", &err), err));
        CL_CHECK((backend_ctx->kernel_gelu_erf     = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_erf", &err), err));
        CL_CHECK((backend_ctx->kernel_gelu_erf_4   = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_erf_4", &err), err));
        CL_CHECK((backend_ctx->kernel_gelu_quick   = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_quick", &err), err));
        CL_CHECK((backend_ctx->kernel_gelu_quick_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_quick_4", &err), err));
        GGML_LOG_CONT(".");
    }

    // glu
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "glu.cl.h"
        };
#else
        const std::string kernel_src = read_file("glu.cl");
#endif
        backend_ctx->program_glu =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_geglu           = clCreateKernel(backend_ctx->program_glu, "kernel_geglu", &err), err));
        CL_CHECK((backend_ctx->kernel_reglu           = clCreateKernel(backend_ctx->program_glu, "kernel_reglu", &err), err));
        CL_CHECK((backend_ctx->kernel_swiglu          = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu", &err), err));
        CL_CHECK((backend_ctx->kernel_swiglu_oai      = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu_oai", &err), err));
        CL_CHECK((backend_ctx->kernel_geglu_erf       = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_erf", &err), err));
        CL_CHECK((backend_ctx->kernel_geglu_quick     = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_quick", &err), err));
        CL_CHECK((backend_ctx->kernel_geglu_f16       = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_f16", &err), err));
        CL_CHECK((backend_ctx->kernel_reglu_f16       = clCreateKernel(backend_ctx->program_glu, "kernel_reglu_f16", &err), err));
        CL_CHECK((backend_ctx->kernel_swiglu_f16      = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu_f16", &err), err));
        CL_CHECK((backend_ctx->kernel_geglu_erf_f16   = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_erf_f16", &err), err));
        CL_CHECK((backend_ctx->kernel_geglu_quick_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_quick_f16", &err), err));
        GGML_LOG_CONT(".");
    }

    // get_rows
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "get_rows.cl.h"
        };
#else
        const std::string kernel_src = read_file("get_rows.cl");
#endif
        backend_ctx->program_get_rows =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_get_rows_f32  = clCreateKernel(backend_ctx->program_get_rows, "kernel_get_rows_f32", &err), err));
        CL_CHECK((backend_ctx->kernel_get_rows_f16  = clCreateKernel(backend_ctx->program_get_rows, "kernel_get_rows_f16", &err), err));
        CL_CHECK((backend_ctx->kernel_get_rows_q4_0 = clCreateKernel(backend_ctx->program_get_rows, "kernel_get_rows_q4_0", &err), err));
        GGML_LOG_CONT(".");
    }

    // solve_tri_f32
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "solve_tri.cl.h"
        };
#else
        const std::string kernel_src = read_file("solve_tri.cl");
#endif
        cl_program prog =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_solve_tri_f32 = clCreateKernel(prog, "kernel_solve_tri_f32", &err), err));
        GGML_LOG_CONT(".");
        CL_CHECK(clReleaseProgram(prog));
    }

    // im2col_f32
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "im2col_f32.cl.h"
        };
#else
        const std::string kernel_src = read_file("im2col_f32.cl");
#endif
        backend_ctx->program_im2col_f32 =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_im2col_f32 = clCreateKernel(backend_ctx->program_im2col_f32, "kernel_im2col_f32", &err), err));
        GGML_LOG_CONT(".");
    }

    // im2col_f16
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "im2col_f16.cl.h"
        };
#else
        const std::string kernel_src = read_file("im2col_f16.cl");
#endif
        backend_ctx->program_im2col_f16 =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_im2col_f16 = clCreateKernel(backend_ctx->program_im2col_f16, "kernel_im2col_f16", &err), err));
        GGML_LOG_CONT(".");
    }

    // mul_mv_q4_0_f32
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "mul_mv_q4_0_f32.cl.h"
        };
#else
        const std::string kernel_src = read_file("mul_mv_q4_0_f32.cl");
#endif
        backend_ctx->program_mul_mv_q4_0_f32 =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32 = clCreateKernel(backend_ctx->program_mul_mv_q4_0_f32, "kernel_mul_mat_q4_0_f32", &err), err));
        GGML_LOG_CONT(".");
    }

    // mul_mv_q4_0_f32_v
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "mul_mv_q4_0_f32_v.cl.h"
        };
#else
        const std::string kernel_src = read_file("mul_mv_q4_0_f32_v.cl");
#endif
        backend_ctx->program_mul_mv_q4_0_f32_v =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_v = clCreateKernel(backend_ctx->program_mul_mv_q4_0_f32_v, "kernel_mul_mat_q4_0_f32_v", &err), err));
        GGML_LOG_CONT(".");
    }

    // mul_mv_q4_0_f32_8x_flat
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "mul_mv_q4_0_f32_8x_flat.cl.h"
        };
#else
        const std::string kernel_src = read_file("mul_mv_q4_0_f32_8x_flat.cl");
#endif
        backend_ctx->program_mul_mv_q4_0_f32_8x_flat =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_q4_0_f32_8x_flat, "kernel_mul_mat_q4_0_f32_8x_flat", &err), err));
        GGML_LOG_CONT(".");
    }

    // mul_mv_q4_0_f32_1d_8x_flat
    // This kernel does not compile on Adreno cl compiler 38.01. Skip it for
    // those compiler versions since it is anyway not used for Adreno.
    if (backend_ctx->gpu_family != ADRENO ||
        backend_ctx->adreno_cl_compiler_version.newer_than_or_same(E031, 38, 11, 0) ||
        backend_ctx->adreno_cl_compiler_version.type == DX) {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "mul_mv_q4_0_f32_1d_8x_flat.cl.h"
        };
#else
        const std::string kernel_src = read_file("mul_mv_q4_0_f32_1d_8x_flat.cl");
#endif
        backend_ctx->program_mul_mv_q4_0_f32_1d_8x_flat =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_q4_0_f32_1d_8x_flat, "kernel_mul_mat_q4_0_f32_1d_8x_flat", &err), err));
        GGML_LOG_CONT(".");
    }

    // mul_mv_q4_0_f32_1d_16x_flat
    // This kernel does not compile on Adreno cl compiler 38.01. Skip it for
    // those compiler versions since it is anyway not used for Adreno.
    if (backend_ctx->gpu_family != ADRENO ||
        backend_ctx->adreno_cl_compiler_version.newer_than_or_same(E031, 38, 11, 0) ||
        backend_ctx->adreno_cl_compiler_version.type == DX) {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "mul_mv_q4_0_f32_1d_16x_flat.cl.h"
        };
#else
        const std::string kernel_src = read_file("mul_mv_q4_0_f32_1d_16x_flat.cl");
#endif
        backend_ctx->program_mul_mv_q4_0_f32_1d_16x_flat =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat = clCreateKernel(backend_ctx->program_mul_mv_q4_0_f32_1d_16x_flat, "kernel_mul_mat_q4_0_f32_1d_16x_flat", &err), err));
        GGML_LOG_CONT(".");
    }
1121
1122 // mul_mv_q4_k_f32
1123 {
1124#ifdef GGML_OPENCL_EMBED_KERNELS
1125 const std::string kernel_src {
1126 #include "mul_mv_q4_k_f32.cl.h"
1127 };
1128#else
1129 const std::string kernel_src = read_file("mul_mv_q4_k_f32.cl");
1130#endif
1131 cl_program prog =
1132 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1133
1134 CL_CHECK((backend_ctx->kernel_mul_mv_q4_K_f32 = clCreateKernel(prog, "kernel_mul_mv_q4_K_f32", &err), err));
1135 CL_CHECK(clReleaseProgram(prog));
1136 GGML_LOG_CONT(".");
1137 }
1138
1139 // mul_mv_q6_k_f32
1140 {
1141#ifdef GGML_OPENCL_EMBED_KERNELS
1142 const std::string kernel_src {
1143 #include "mul_mv_q6_k_f32.cl.h"
1144 };
1145#else
1146 const std::string kernel_src = read_file("mul_mv_q6_k_f32.cl");
1147#endif
1148 backend_ctx->program_mul_mv_q6_K =
1149 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1150
1151 CL_CHECK((backend_ctx->kernel_mul_mv_q6_K_f32 = clCreateKernel(backend_ctx->program_mul_mv_q6_K, "kernel_mul_mv_q6_K_f32", &err), err));
1152 GGML_LOG_CONT(".");
1153 }
1154
1155 // mul_mv_q6_k_f32_flat
1156 {
1157#ifdef GGML_OPENCL_EMBED_KERNELS
1158 const std::string kernel_src {
1159 #include "mul_mv_q6_k_f32_flat.cl.h"
1160 };
1161#else
1162 const std::string kernel_src = read_file("mul_mv_q6_k_f32_flat.cl");
1163#endif
1164 cl_program prog =
1165 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1166
1167 CL_CHECK((backend_ctx->kernel_mul_mv_q6_K_f32_flat = clCreateKernel(prog, "kernel_mul_mv_q6_K_f32_flat", &err), err));
1168 CL_CHECK(clReleaseProgram(prog));
1169 GGML_LOG_CONT(".");
1170 }
1171
1172 // mul_mv_q8_0_f32
1173 {
1174#ifdef GGML_OPENCL_EMBED_KERNELS
1175 const std::string kernel_src {
1176 #include "mul_mv_q8_0_f32.cl.h"
1177 };
1178#else
1179 const std::string kernel_src = read_file("mul_mv_q8_0_f32.cl");
1180#endif
1181 backend_ctx->program_mul_mv_q8_0_f32 =
1182 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1183
1184 CL_CHECK((backend_ctx->kernel_mul_mv_q8_0_f32 = clCreateKernel(backend_ctx->program_mul_mv_q8_0_f32, "kernel_mul_mv_q8_0_f32", &err), err));
1185 GGML_LOG_CONT(".");
1186 }
1187
1188 // mul_mv_q8_0_f32_flat
1189 {
1190#ifdef GGML_OPENCL_EMBED_KERNELS
1191 const std::string kernel_src {
1192 #include "mul_mv_q8_0_f32_flat.cl.h"
1193 };
1194#else
1195 const std::string kernel_src = read_file("mul_mv_q8_0_f32_flat.cl");
1196#endif
1197 backend_ctx->program_mul_mv_q8_0_f32_flat =
1198 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1199
1200 CL_CHECK((backend_ctx->kernel_mul_mv_q8_0_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_q8_0_f32_flat, "kernel_mul_mv_q8_0_f32_flat", &err), err));
1201 GGML_LOG_CONT(".");
1202 }
1203
1204 // mul_mv_mxfp4_f32
1205 {
1206#ifdef GGML_OPENCL_EMBED_KERNELS
1207 const std::string kernel_src {
1208 #include "mul_mv_mxfp4_f32.cl.h"
1209 };
1210#else
1211 const std::string kernel_src = read_file("mul_mv_mxfp4_f32.cl");
1212#endif
1213 backend_ctx->program_mul_mv_mxfp4_f32 =
1214 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1215
1216 CL_CHECK((backend_ctx->kernel_mul_mv_mxfp4_f32 = clCreateKernel(backend_ctx->program_mul_mv_mxfp4_f32, "kernel_mul_mv_mxfp4_f32", &err), err));
1217 GGML_LOG_CONT(".");
1218 }
1219
1220 // mul_mv_mxfp4_f32_flat
1221 {
1222#ifdef GGML_OPENCL_EMBED_KERNELS
1223 const std::string kernel_src {
1224 #include "mul_mv_mxfp4_f32_flat.cl.h"
1225 };
1226#else
1227 const std::string kernel_src = read_file("mul_mv_mxfp4_f32_flat.cl");
1228#endif
1229 backend_ctx->program_mul_mv_mxfp4_f32_flat =
1230 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1231
1232 CL_CHECK((backend_ctx->kernel_mul_mv_mxfp4_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_mxfp4_f32_flat, "kernel_mul_mv_mxfp4_f32_flat", &err), err));
1233 GGML_LOG_CONT(".");
1234 }
1235
1236 // mul_mv_f16_f16
1237 {
1238#ifdef GGML_OPENCL_EMBED_KERNELS
1239 const std::string kernel_src {
1240 #include "mul_mv_f16_f16.cl.h"
1241 };
1242#else
1243 const std::string kernel_src = read_file("mul_mv_f16_f16.cl");
1244#endif
1245 backend_ctx->program_mul_mv_f16_f16 =
1246 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1247
1248 CL_CHECK((backend_ctx->kernel_mul_mat_f16_f16 = clCreateKernel(backend_ctx->program_mul_mv_f16_f16, "kernel_mul_mat_f16_f16", &err), err));
1249 GGML_LOG_CONT(".");
1250 }
1251
1252 // mul_mv_f16_f32_1row
1253 {
1254#ifdef GGML_OPENCL_EMBED_KERNELS
1255 const std::string kernel_src {
1256 #include "mul_mv_f16_f32_1row.cl.h"
1257 };
1258#else
1259 const std::string kernel_src = read_file("mul_mv_f16_f32_1row.cl");
1260#endif
1261 backend_ctx->program_mul_mv_f16_f32_1row =
1262 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1263
1264 CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_1row = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_1row, "kernel_mul_mat_f16_f32_1row", &err), err));
1265 GGML_LOG_CONT(".");
1266 }
1267
1268 // mul_mv_f16_f32_l4
1269 {
1270#ifdef GGML_OPENCL_EMBED_KERNELS
1271 const std::string kernel_src {
1272 #include "mul_mv_f16_f32_l4.cl.h"
1273 };
1274#else
1275 const std::string kernel_src = read_file("mul_mv_f16_f32_l4.cl");
1276#endif
1277 backend_ctx->program_mul_mv_f16_f32_l4 =
1278 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1279
1280 CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4 = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4", &err), err));
1281 GGML_LOG_CONT(".");
1282 }
1283
1284 // mul_mv_f16_f32
1285 {
1286#ifdef GGML_OPENCL_EMBED_KERNELS
1287 const std::string kernel_src {
1288 #include "mul_mv_f16_f32.cl.h"
1289 };
1290#else
1291 const std::string kernel_src = read_file("mul_mv_f16_f32.cl");
1292#endif
1293 backend_ctx->program_mul_mv_f16_f32 =
1294 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1295
1296 CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32 = clCreateKernel(backend_ctx->program_mul_mv_f16_f32, "kernel_mul_mat_f16_f32", &err), err));
1297 GGML_LOG_CONT(".");
1298 }
1299
1300 // mul_mv_f32_f32
1301 {
1302#ifdef GGML_OPENCL_EMBED_KERNELS
1303 const std::string kernel_src {
1304 #include "mul_mv_f32_f32.cl.h"
1305 };
1306#else
1307 const std::string kernel_src = read_file("mul_mv_f32_f32.cl");
1308#endif
1309 backend_ctx->program_mul_mv_f32_f32 =
1310 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1311
1312 CL_CHECK((backend_ctx->kernel_mul_mat_f32_f32 = clCreateKernel(backend_ctx->program_mul_mv_f32_f32, "kernel_mul_mat_f32_f32", &err), err));
1313 GGML_LOG_CONT(".");
1314 }
1315
1316 // mul_mat_f16_f32_tiled
1317 {
1318#ifdef GGML_OPENCL_EMBED_KERNELS
1319 const std::string kernel_src {
1320 #include "mul_mat_f16_f32.cl.h"
1321 };
1322#else
1323 const std::string kernel_src = read_file("mul_mat_f16_f32.cl");
1324#endif
1325 backend_ctx->program_mul_mat_f16_f32_tiled =
1326 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1327
1328 CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_tiled = clCreateKernel(backend_ctx->program_mul_mat_f16_f32_tiled, "mul_mat_f16_f32", &err), err));
1329 GGML_LOG_CONT(".");
1330 }
1331
1332 // mul_mm_f32_f32_l4_lm
1333 {
1334#ifdef GGML_OPENCL_EMBED_KERNELS
1335 const std::string kernel_src {
1336 #include "mul_mm_f32_f32_l4_lm.cl.h"
1337 };
1338#else
1339 const std::string kernel_src = read_file("mul_mm_f32_f32_l4_lm.cl");
1340#endif
1341 backend_ctx->program_mul_mm_f32_f32_l4_lm =
1342 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1343
1344 CL_CHECK((backend_ctx->kernel_mul_mm_f32_f32_l4_lm = clCreateKernel(backend_ctx->program_mul_mm_f32_f32_l4_lm, "kernel_mul_mm_f32_f32_l4_lm", &err), err));
1345 GGML_LOG_CONT(".");
1346 }
1347
1348 // mul_mm_f16_f32_l4_lm
1349 {
1350#ifdef GGML_OPENCL_EMBED_KERNELS
1351 const std::string kernel_src {
1352 #include "mul_mm_f16_f32_l4_lm.cl.h"
1353 };
1354#else
1355 const std::string kernel_src = read_file("mul_mm_f16_f32_l4_lm.cl");
1356#endif
1357 backend_ctx->program_mul_mm_f16_f32_l4_lm =
1358 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1359
1360 CL_CHECK((backend_ctx->kernel_mul_mm_f16_f32_l4_lm = clCreateKernel(backend_ctx->program_mul_mm_f16_f32_l4_lm, "kernel_mul_mm_f16_f32_l4_lm", &err), err));
1361 GGML_LOG_CONT(".");
1362 }
1363
1364 // mul_mm_q8_0_f32_l4_lm
1365 {
1366#ifdef GGML_OPENCL_EMBED_KERNELS
1367 const std::string kernel_src {
1368 #include "mul_mm_q8_0_f32_l4_lm.cl.h"
1369 };
1370#else
1371 const std::string kernel_src = read_file("mul_mm_q8_0_f32_l4_lm.cl");
1372#endif
1373 backend_ctx->program_mul_mm_q8_0_f32_l4_lm =
1374 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1375
1376 CL_CHECK((backend_ctx->kernel_mul_mm_q8_0_f32_l4_lm = clCreateKernel(backend_ctx->program_mul_mm_q8_0_f32_l4_lm, "kernel_mul_mm_q8_0_f32_l4_lm", &err), err));
1377 GGML_LOG_CONT(".");
1378 }
1379
1380 // mul_mm_q6_k_f32_l4_lm
1381 {
1382#ifdef GGML_OPENCL_EMBED_KERNELS
1383 const std::string kernel_src {
1384 #include "mul_mm_q6_k_f32_l4_lm.cl.h"
1385 };
1386#else
1387 const std::string kernel_src = read_file("mul_mm_q6_k_f32_l4_lm.cl");
1388#endif
1389 cl_program prog =
1390 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1391
1392 CL_CHECK((backend_ctx->kernel_mul_mm_q6_k_f32_l4_lm = clCreateKernel(prog, "kernel_mul_mm_q6_k_f32_l4_lm", &err), err));
1393 CL_CHECK(clReleaseProgram(prog));
1394 GGML_LOG_CONT(".");
1395 }
1396
1397 // mul_mm_f16_f32_kq_kqv
1398 {
1399#ifdef GGML_OPENCL_EMBED_KERNELS
1400 const std::string kernel_src {
1401 #include "mul_mm_f16_f32_kq_kqv.cl.h"
1402 };
1403#else
1404 const std::string kernel_src = read_file("mul_mm_f16_f32_kq_kqv.cl");
1405#endif
1406 backend_ctx->program_mul_mm_f16_f32_kqv =
1407 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts+" -DKQV ");
1408 backend_ctx->program_mul_mm_f16_f32_kq =
1409 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1410
1411 CL_CHECK((backend_ctx->kernel_mul_mm_f16_f32_kqv = clCreateKernel(backend_ctx->program_mul_mm_f16_f32_kqv, "mul_mm_f16_f32_kqv", &err), err));
1412 CL_CHECK((backend_ctx->kernel_mul_mm_f16_f32_kq = clCreateKernel(backend_ctx->program_mul_mm_f16_f32_kq, "mul_mm_f16_f32_kq", &err), err));
1413 GGML_LOG_CONT(".");
1414 }
1415
1416 // mul
1417 {
1418#ifdef GGML_OPENCL_EMBED_KERNELS
1419 const std::string kernel_src {
1420 #include "mul.cl.h"
1421 };
1422#else
1423 const std::string kernel_src = read_file("mul.cl");
1424#endif
1425 backend_ctx->program_mul =
1426 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1427
1428 CL_CHECK((backend_ctx->kernel_mul = clCreateKernel(backend_ctx->program_mul, "kernel_mul", &err), err));
1429 CL_CHECK((backend_ctx->kernel_mul_row = clCreateKernel(backend_ctx->program_mul, "kernel_mul_row", &err), err));
1430 CL_CHECK((backend_ctx->kernel_mul_f16 = clCreateKernel(backend_ctx->program_mul, "kernel_mul_f16", &err), err));
1431 CL_CHECK((backend_ctx->kernel_mul_row_f16 = clCreateKernel(backend_ctx->program_mul, "kernel_mul_row_f16", &err), err));
1432 GGML_LOG_CONT(".");
1433 }
1434
1435 // norm
1436 {
1437#ifdef GGML_OPENCL_EMBED_KERNELS
1438 const std::string kernel_src {
1439 #include "norm.cl.h"
1440 };
1441#else
1442 const std::string kernel_src = read_file("norm.cl");
1443#endif
1444 backend_ctx->program_norm =
1445 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1446
1447 CL_CHECK((backend_ctx->kernel_norm = clCreateKernel(backend_ctx->program_norm, "kernel_norm", &err), err));
1448 CL_CHECK((backend_ctx->kernel_norm_mul_add = clCreateKernel(backend_ctx->program_norm, "kernel_norm_mul_add", &err), err));
1449 GGML_LOG_CONT(".");
1450 }
1451
1452 // relu
1453 {
1454#ifdef GGML_OPENCL_EMBED_KERNELS
1455 const std::string kernel_src {
1456 #include "relu.cl.h"
1457 };
1458#else
1459 const std::string kernel_src = read_file("relu.cl");
1460#endif
1461 backend_ctx->program_relu =
1462 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1463
1464 CL_CHECK((backend_ctx->kernel_relu = clCreateKernel(backend_ctx->program_relu, "kernel_relu", &err), err));
1465 GGML_LOG_CONT(".");
1466 }
1467
1468 // rms_norm
1469 {
1470#ifdef GGML_OPENCL_EMBED_KERNELS
1471 const std::string kernel_src {
1472 #include "rms_norm.cl.h"
1473 };
1474#else
1475 const std::string kernel_src = read_file("rms_norm.cl");
1476#endif
1477 backend_ctx->program_rms_norm =
1478 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1479
1480 CL_CHECK((backend_ctx->kernel_rms_norm = clCreateKernel(backend_ctx->program_rms_norm, "kernel_rms_norm", &err), err));
1481 CL_CHECK((backend_ctx->kernel_rms_norm_mul = clCreateKernel(backend_ctx->program_rms_norm, "kernel_rms_norm_mul", &err), err));
1482 GGML_LOG_CONT(".");
1483 }
1484
1485 // rope
1486 {
1487#ifdef GGML_OPENCL_EMBED_KERNELS
1488 const std::string kernel_src {
1489 #include "rope.cl.h"
1490 };
1491#else
1492 const std::string kernel_src = read_file("rope.cl");
1493#endif
1494 backend_ctx->program_rope =
1495 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1496
1497 CL_CHECK((backend_ctx->kernel_rope_norm_f32 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_norm_f32", &err), err));
1498 CL_CHECK((backend_ctx->kernel_rope_norm_f16 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_norm_f16", &err), err));
1499 CL_CHECK((backend_ctx->kernel_rope_neox_f32 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_neox_f32", &err), err));
1500 CL_CHECK((backend_ctx->kernel_rope_neox_f16 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_neox_f16", &err), err));
1501 CL_CHECK((backend_ctx->kernel_rope_multi_f32 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_multi_f32", &err), err));
1502 CL_CHECK((backend_ctx->kernel_rope_multi_f16 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_multi_f16", &err), err));
1503 CL_CHECK((backend_ctx->kernel_rope_vision_f32 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_vision_f32", &err), err));
1504 CL_CHECK((backend_ctx->kernel_rope_vision_f16 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_vision_f16", &err), err));
1505 GGML_LOG_CONT(".");
1506 }
1507
1508 // scale
1509 {
1510#ifdef GGML_OPENCL_EMBED_KERNELS
1511 const std::string kernel_src {
1512 #include "scale.cl.h"
1513 };
1514#else
1515 const std::string kernel_src = read_file("scale.cl");
1516#endif
1517 cl_program prog =
1518 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1519
1520 CL_CHECK((backend_ctx->kernel_scale_f32 = clCreateKernel(prog, "kernel_scale_f32", &err), err));
1521 CL_CHECK((backend_ctx->kernel_scale_f32_4 = clCreateKernel(prog, "kernel_scale_f32_4", &err), err));
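        // the kernels retain an implicit reference to the program, so it can be released now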
1522 CL_CHECK(clReleaseProgram(prog));
1523 GGML_LOG_CONT(".");
1524 }
1525
1526 // silu
1527 {
1528#ifdef GGML_OPENCL_EMBED_KERNELS
1529 const std::string kernel_src {
1530 #include "silu.cl.h"
1531 };
1532#else
1533 const std::string kernel_src = read_file("silu.cl");
1534#endif
1535 backend_ctx->program_silu =
1536 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1537
1538 CL_CHECK((backend_ctx->kernel_silu = clCreateKernel(backend_ctx->program_silu, "kernel_silu", &err), err));
1539 CL_CHECK((backend_ctx->kernel_silu_4 = clCreateKernel(backend_ctx->program_silu, "kernel_silu_4", &err), err));
1540 GGML_LOG_CONT(".");
1541 }
1542
1543 // softmax_f32
1544 {
1545#ifdef GGML_OPENCL_EMBED_KERNELS
1546 const std::string kernel_src {
1547 #include "softmax_f32.cl.h"
1548 };
1549#else
1550 const std::string kernel_src = read_file("softmax_f32.cl");
1551#endif
1552 backend_ctx->program_softmax_f32 =
1553 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1554
1555 CL_CHECK((backend_ctx->kernel_soft_max = clCreateKernel(backend_ctx->program_softmax_f32, "kernel_soft_max", &err), err));
1556 GGML_LOG_CONT(".");
1557 }
1558
1559 // softmax_f16
1560 {
1561#ifdef GGML_OPENCL_EMBED_KERNELS
1562 const std::string kernel_src {
1563 #include "softmax_f16.cl.h"
1564 };
1565#else
1566 const std::string kernel_src = read_file("softmax_f16.cl");
1567#endif
1568 backend_ctx->program_softmax_f16 =
1569 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1570
1571 CL_CHECK((backend_ctx->kernel_soft_max_f16 = clCreateKernel(backend_ctx->program_softmax_f16, "kernel_soft_max_f16", &err), err));
1572 GGML_LOG_CONT(".");
1573 }
1574
1575 // softmax_4_f32
1576 {
1577#ifdef GGML_OPENCL_EMBED_KERNELS
1578 const std::string kernel_src {
1579 #include "softmax_4_f32.cl.h"
1580 };
1581#else
1582 const std::string kernel_src = read_file("softmax_4_f32.cl");
1583#endif
1584 backend_ctx->program_softmax_4_f32 =
1585 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1586
1587 CL_CHECK((backend_ctx->kernel_soft_max_4 = clCreateKernel(backend_ctx->program_softmax_4_f32, "kernel_soft_max_4", &err), err));
1588 GGML_LOG_CONT(".");
1589 }
1590
1591 // softmax_4_f16
1592 {
1593#ifdef GGML_OPENCL_EMBED_KERNELS
1594 const std::string kernel_src {
1595 #include "softmax_4_f16.cl.h"
1596 };
1597#else
1598 const std::string kernel_src = read_file("softmax_4_f16.cl");
1599#endif
1600 backend_ctx->program_softmax_4_f16 =
1601 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1602
1603 CL_CHECK((backend_ctx->kernel_soft_max_4_f16 = clCreateKernel(backend_ctx->program_softmax_4_f16, "kernel_soft_max_4_f16", &err), err));
1604 GGML_LOG_CONT(".");
1605 }
1606
1607 // flash_attn
1608 {
1609 #ifdef GGML_OPENCL_EMBED_KERNELS
1610 const std::string kernel_src_f16 {
1611 #include "flash_attn_f16.cl.h"
1612 };
1613 const std::string kernel_src_f32 {
1614 #include "flash_attn_f32.cl.h"
1615 };
1616 const std::string kernel_src_f32_f16 {
1617 #include "flash_attn_f32_f16.cl.h"
1618 };
1619 #else
1620 const std::string kernel_src_f16 = read_file("flash_attn_f16.cl");
1621 const std::string kernel_src_f32 = read_file("flash_attn_f32.cl");
1622 const std::string kernel_src_f32_f16 = read_file("flash_attn_f32_f16.cl");
1623 #endif
1624
1625 if (!kernel_src_f16.empty() && !kernel_src_f32.empty() && !kernel_src_f32_f16.empty()) {
1626 const struct { int dk; int dv; int bm; int bn; } fa_dims[] = {
1627 { 40, 40, 32, 32}, { 64, 64, 64, 64}, { 80, 80, 64, 32}, { 96, 96, 64, 32},
1628 {112, 112, 32, 32}, {128, 128, 32, 32}, {192, 128, 16, 16},
1629 {192, 192, 16, 16}, {256, 256, 16, 16},
1630 };
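            // dk/dv are the K and V head sizes; bm/bn are the BLOCK_M/BLOCK_N tile sizes
            // baked into each specialized program via the -D options below.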
1631
1632 for (size_t i = 0; i < sizeof(fa_dims)/sizeof(fa_dims[0]); ++i) {
1633 const int dk = fa_dims[i].dk;
1634 const int dv = fa_dims[i].dv;
1635 const int bm = fa_dims[i].bm;
1636 const int bn = fa_dims[i].bn;
1637 std::string OPTS = compile_opts +
1638 " -D DK=" + std::to_string(dk) +
1639 " -D DV=" + std::to_string(dv) +
1640 " -D BLOCK_M=" + std::to_string(bm) +
1641 " -D BLOCK_N=" + std::to_string(bn);
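                // One program per (precision, head-size) combination. By naming convention,
                // the *_q1 kernels appear to specialize for the single-query decode case.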
1642
1643 cl_program prog_f16 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f16.c_str(), OPTS);
1644 cl_kernel k_f16, k_f16_q1;
1645 CL_CHECK((k_f16 = clCreateKernel(prog_f16, "flash_attn_f16", &err), err));
1646 CL_CHECK((k_f16_q1 = clCreateKernel(prog_f16, "flash_attn_f16_q1", &err), err));
1647 backend_ctx->kernels_flash_attn_f16[{dk, dv}] = k_f16;
1648 backend_ctx->kernels_flash_attn_f16_q1[{dk, dv}] = k_f16_q1;
1649 CL_CHECK(clReleaseProgram(prog_f16));
1650
1651 cl_program prog_f32 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f32.c_str(), OPTS);
1652 cl_kernel k_f32, k_f32_q1;
1653 CL_CHECK((k_f32 = clCreateKernel(prog_f32, "flash_attn_f32", &err), err));
1654 CL_CHECK((k_f32_q1 = clCreateKernel(prog_f32, "flash_attn_f32_q1", &err), err));
1655 backend_ctx->kernels_flash_attn_f32[{dk, dv}] = k_f32;
1656 backend_ctx->kernels_flash_attn_f32_q1[{dk, dv}] = k_f32_q1;
1657 CL_CHECK(clReleaseProgram(prog_f32));
1658
1659 cl_program prog_f32_f16 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f32_f16.c_str(), OPTS);
1660 cl_kernel k_f32_f16, k_f32_f16_q1;
1661 CL_CHECK((k_f32_f16 = clCreateKernel(prog_f32_f16, "flash_attn_f32_f16", &err), err));
1662 CL_CHECK((k_f32_f16_q1 = clCreateKernel(prog_f32_f16, "flash_attn_f32_f16_q1", &err), err));
1663 backend_ctx->kernels_flash_attn_f32_f16[{dk, dv}] = k_f32_f16;
1664 backend_ctx->kernels_flash_attn_f32_f16_q1[{dk, dv}] = k_f32_f16_q1;
1665 CL_CHECK(clReleaseProgram(prog_f32_f16));
1666
1667 backend_ctx->kernels_flash_attn_bm[{dk, dv}] = bm;
1668 backend_ctx->kernels_flash_attn_bn[{dk, dv}] = bn;
1669 }
1670 GGML_LOG_CONT(".");
1671 }
1672 }
1673
1674 // argsort
1675 {
1676#ifdef GGML_OPENCL_EMBED_KERNELS
1677 const std::string kernel_src {
1678 #include "argsort.cl.h"
1679 };
1680#else
1681 const std::string kernel_src = read_file("argsort.cl");
1682#endif
1683 backend_ctx->program_argsort_f32_i32 =
1684 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1685
1686 CL_CHECK((backend_ctx->kernel_argsort_f32_i32 = clCreateKernel(backend_ctx->program_argsort_f32_i32, "kernel_argsort_f32_i32", &err), err));
1687 GGML_LOG_CONT(".");
1688 }
1689
1690 // div
1691 {
1692#ifdef GGML_OPENCL_EMBED_KERNELS
1693 const std::string kernel_src {
1694 #include "div.cl.h"
1695 };
1696#else
1697 const std::string kernel_src = read_file("div.cl");
1698#endif
1699 std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
1700 " -cl-mad-enable -cl-finite-math-only ";
1701
1702 backend_ctx->program_div =
1703 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1704
1705 CL_CHECK((backend_ctx->kernel_div = clCreateKernel(backend_ctx->program_div, "kernel_div", &err), err));
1706 CL_CHECK((backend_ctx->kernel_div_row = clCreateKernel(backend_ctx->program_div, "kernel_div_row", &err), err));
1707 CL_CHECK((backend_ctx->kernel_div_f16 = clCreateKernel(backend_ctx->program_div, "kernel_div_f16", &err), err));
1708 CL_CHECK((backend_ctx->kernel_div_row_f16 = clCreateKernel(backend_ctx->program_div, "kernel_div_row_f16", &err), err));
1709 GGML_LOG_CONT(".");
1710 }
1711
1712 // sqr
1713 {
1714#ifdef GGML_OPENCL_EMBED_KERNELS
1715 const std::string kernel_src {
1716 #include "sqr.cl.h"
1717 };
1718#else
1719 const std::string kernel_src = read_file("sqr.cl");
1720#endif
1721 cl_program prog =
1722 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1723
1724 CL_CHECK((backend_ctx->kernel_sqr_cont_f32 = clCreateKernel(prog, "kernel_sqr_cont_f32", &err), err));
1725 CL_CHECK((backend_ctx->kernel_sqr_cont_f32_4 = clCreateKernel(prog, "kernel_sqr_cont_f32_4", &err), err));
1726 CL_CHECK((backend_ctx->kernel_sqr_cont_f16 = clCreateKernel(prog, "kernel_sqr_cont_f16", &err), err));
1727 CL_CHECK((backend_ctx->kernel_sqr_cont_f16_4 = clCreateKernel(prog, "kernel_sqr_cont_f16_4", &err), err));
1728
1729 CL_CHECK(clReleaseProgram(prog));
1730 GGML_LOG_CONT(".");
1731 }
1732
1733 // sqrt
1734 {
1735#ifdef GGML_OPENCL_EMBED_KERNELS
1736 const std::string kernel_src {
1737 #include "sqrt.cl.h"
1738 };
1739#else
1740 const std::string kernel_src = read_file("sqrt.cl");
1741#endif
1742 cl_program prog =
1743 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1744
1745 CL_CHECK((backend_ctx->kernel_sqrt_cont_f32 = clCreateKernel(prog, "kernel_sqrt_cont_f32", &err), err));
1746 CL_CHECK((backend_ctx->kernel_sqrt_cont_f32_4 = clCreateKernel(prog, "kernel_sqrt_cont_f32_4", &err), err));
1747 CL_CHECK((backend_ctx->kernel_sqrt_cont_f16 = clCreateKernel(prog, "kernel_sqrt_cont_f16", &err), err));
1748 CL_CHECK((backend_ctx->kernel_sqrt_cont_f16_4 = clCreateKernel(prog, "kernel_sqrt_cont_f16_4", &err), err));
1749
1750 CL_CHECK(clReleaseProgram(prog));
1751 GGML_LOG_CONT(".");
1752 }
1753
1754 // mean
1755 {
1756#ifdef GGML_OPENCL_EMBED_KERNELS
1757 const std::string kernel_src {
1758 #include "mean.cl.h"
1759 };
1760#else
1761 const std::string kernel_src = read_file("mean.cl");
1762#endif
1763 cl_program prog =
1764 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1765
1766 CL_CHECK((backend_ctx->kernel_mean_f32 = clCreateKernel(prog, "kernel_mean_f32", &err), err));
1767
1768 CL_CHECK(clReleaseProgram(prog));
1769 GGML_LOG_CONT(".");
1770 }
1771
1772 // sub
1773 {
1774#ifdef GGML_OPENCL_EMBED_KERNELS
1775 const std::string kernel_src {
1776 #include "sub.cl.h"
1777 };
1778#else
1779 const std::string kernel_src = read_file("sub.cl");
1780#endif
1781 backend_ctx->program_sub =
1782 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1783
1784 CL_CHECK((backend_ctx->kernel_sub = clCreateKernel(backend_ctx->program_sub, "kernel_sub", &err), err));
1785 CL_CHECK((backend_ctx->kernel_sub_row = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row", &err), err));
1786 CL_CHECK((backend_ctx->kernel_sub_f16 = clCreateKernel(backend_ctx->program_sub, "kernel_sub_f16", &err), err));
1787 CL_CHECK((backend_ctx->kernel_sub_row_f16 = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row_f16", &err), err));
1788 GGML_LOG_CONT(".");
1789 }
1790
1791 // sum_rows
1792 {
1793#ifdef GGML_OPENCL_EMBED_KERNELS
1794 const std::string kernel_src {
1795 #include "sum_rows.cl.h"
1796 };
1797#else
1798 const std::string kernel_src = read_file("sum_rows.cl");
1799#endif
1800 backend_ctx->program_sum_rows_f32 =
1801 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1802
1803 CL_CHECK((backend_ctx->kernel_sum_rows_f32 = clCreateKernel(backend_ctx->program_sum_rows_f32, "kernel_sum_rows_f32", &err), err));
1804 GGML_LOG_CONT(".");
1805 }
1806
1807 // sigmoid
1808 {
1809#ifdef GGML_OPENCL_EMBED_KERNELS
1810 const std::string kernel_src {
1811 #include "sigmoid.cl.h"
1812 };
1813#else
1814 const std::string kernel_src = read_file("sigmoid.cl");
1815#endif
1816 backend_ctx->program_sigmoid =
1817 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1818
1819 CL_CHECK((backend_ctx->kernel_sigmoid_f32 = clCreateKernel(backend_ctx->program_sigmoid, "kernel_sigmoid_f32", &err), err));
1820 CL_CHECK((backend_ctx->kernel_sigmoid_f16 = clCreateKernel(backend_ctx->program_sigmoid, "kernel_sigmoid_f16", &err), err));
1821 GGML_LOG_CONT(".");
1822 }
1823
1824 // group_norm
1825 {
1826#ifdef GGML_OPENCL_EMBED_KERNELS
1827 const std::string kernel_src {
1828 #include "group_norm.cl.h"
1829 };
1830#else
1831 const std::string kernel_src = read_file("group_norm.cl");
1832#endif
1833 backend_ctx->program_group_norm =
1834 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1835
1836 CL_CHECK((backend_ctx->kernel_group_norm = clCreateKernel(backend_ctx->program_group_norm, "kernel_group_norm", &err), err));
1837 CL_CHECK((backend_ctx->kernel_group_norm_mul_add = clCreateKernel(backend_ctx->program_group_norm, "kernel_group_norm_mul_add", &err), err));
1838 GGML_LOG_CONT(".");
1839 }
1840
1841 // repeat
1842 {
1843#ifdef GGML_OPENCL_EMBED_KERNELS
1844 const std::string kernel_src {
1845 #include "repeat.cl.h"
1846 };
1847#else
1848 const std::string kernel_src = read_file("repeat.cl");
1849#endif
1850 cl_program prog =
1851 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1852 CL_CHECK((backend_ctx->kernel_repeat_f32 = clCreateKernel(prog, "kernel_repeat_f32", &err), err));
1853 CL_CHECK(clReleaseProgram(prog));
1854 GGML_LOG_CONT(".");
1855 }
1856
1857 // pad
1858 {
1859#ifdef GGML_OPENCL_EMBED_KERNELS
1860 const std::string kernel_src {
1861 #include "pad.cl.h"
1862 };
1863#else
1864 const std::string kernel_src = read_file("pad.cl");
1865#endif
1866 if (!kernel_src.empty()) {
1867 backend_ctx->program_pad =
1868 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1869 CL_CHECK((backend_ctx->kernel_pad = clCreateKernel(backend_ctx->program_pad, "kernel_pad", &err), err));
1870 GGML_LOG_CONT(".");
1871 } else {
1872 GGML_LOG_WARN("ggml_opencl: pad kernel source not found or empty. Pad operations will not be available.\n");
1873 backend_ctx->program_pad = nullptr;
1874 backend_ctx->kernel_pad = nullptr;
1875 }
1876 }
1877
1878 // tanh
1879 {
1880#ifdef GGML_OPENCL_EMBED_KERNELS
1881 const std::string kernel_src {
1882 #include "tanh.cl.h"
1883 };
1884#else
1885 const std::string kernel_src = read_file("tanh.cl");
1886#endif
1887 cl_program prog =
1888 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1889 CL_CHECK((backend_ctx->kernel_tanh_f32 = clCreateKernel(prog, "kernel_tanh_f32", &err), err));
1890 CL_CHECK((backend_ctx->kernel_tanh_f32_4 = clCreateKernel(prog, "kernel_tanh_f32_4", &err), err));
1891 CL_CHECK((backend_ctx->kernel_tanh_f32_nc = clCreateKernel(prog, "kernel_tanh_f32_nc", &err), err));
1892 CL_CHECK((backend_ctx->kernel_tanh_f16 = clCreateKernel(prog, "kernel_tanh_f16", &err), err));
1893 CL_CHECK((backend_ctx->kernel_tanh_f16_4 = clCreateKernel(prog, "kernel_tanh_f16_4", &err), err));
1894 CL_CHECK((backend_ctx->kernel_tanh_f16_nc = clCreateKernel(prog, "kernel_tanh_f16_nc", &err), err));
1895 CL_CHECK(clReleaseProgram(prog));
1896 GGML_LOG_CONT(".");
1897 }
1898
1899 // expm1
1900 {
1901#ifdef GGML_OPENCL_EMBED_KERNELS
1902 const std::string kernel_src {
1903 #include "expm1.cl.h"
1904 };
1905#else
1906 const std::string kernel_src = read_file("expm1.cl");
1907#endif
        if (!kernel_src.empty()) {
            cl_program prog =
                build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
            CL_CHECK((backend_ctx->kernel_expm1_f32_nd = clCreateKernel(prog, "kernel_expm1_f32_nd", &err), err));
            CL_CHECK((backend_ctx->kernel_expm1_f16_nd = clCreateKernel(prog, "kernel_expm1_f16_nd", &err), err));
            // the kernels keep an implicit reference, so the program can be released here;
            // releasing a null program would trip CL_CHECK, hence this stays inside the branch
            CL_CHECK(clReleaseProgram(prog));
            GGML_LOG_CONT(".");
        } else {
            GGML_LOG_WARN("ggml_opencl: expm1 kernel source not found or empty. Expm1 operation will not be available.\n");
            backend_ctx->kernel_expm1_f32_nd = nullptr;
            backend_ctx->kernel_expm1_f16_nd = nullptr;
        }
1922 }
1923
1924 // softplus
1925 {
1926#ifdef GGML_OPENCL_EMBED_KERNELS
1927 const std::string kernel_src {
1928 #include "softplus.cl.h"
1929 };
1930#else
1931 const std::string kernel_src = read_file("softplus.cl");
1932#endif
        if (!kernel_src.empty()) {
            cl_program prog =
                build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
            CL_CHECK((backend_ctx->kernel_softplus_f32_nd = clCreateKernel(prog, "kernel_softplus_f32_nd", &err), err));
            CL_CHECK((backend_ctx->kernel_softplus_f16_nd = clCreateKernel(prog, "kernel_softplus_f16_nd", &err), err));
            // as with expm1, release only when the program was actually created
            CL_CHECK(clReleaseProgram(prog));
            GGML_LOG_CONT(".");
        } else {
            GGML_LOG_WARN("ggml_opencl: softplus kernel source not found or empty. Softplus operation will not be available.\n");
            backend_ctx->kernel_softplus_f32_nd = nullptr;
            backend_ctx->kernel_softplus_f16_nd = nullptr;
        }
1947 }
1948
1949 // upscale
1950 {
1951#ifdef GGML_OPENCL_EMBED_KERNELS
1952 const std::string kernel_src {
1953 #include "upscale.cl.h"
1954 };
1955#else
1956 const std::string kernel_src = read_file("upscale.cl");
1957#endif
1958 if (!kernel_src.empty()) {
1959 backend_ctx->program_upscale =
1960 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1961 CL_CHECK((backend_ctx->kernel_upscale = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale", &err), err));
            // kernel_upscale_bilinear is optional; if the program builds but the kernel is
            // absent, fall back gracefully instead of aborting. The program is guaranteed
            // non-null here, since the CL_CHECK above would already have aborted otherwise.
            cl_int err_bilinear;
            backend_ctx->kernel_upscale_bilinear = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale_bilinear", &err_bilinear);
            if (err_bilinear != CL_SUCCESS) {
                GGML_LOG_WARN("ggml_opencl: kernel_upscale_bilinear not found in upscale.cl. Bilinear upscale will not be available. Error: %d\n", err_bilinear);
                backend_ctx->kernel_upscale_bilinear = nullptr;
            }
1972 GGML_LOG_CONT(".");
1973 } else {
1974 GGML_LOG_WARN("ggml_opencl: upscale kernel source not found or empty. Upscale operations will not be available.\n");
1975 backend_ctx->program_upscale = nullptr;
1976 backend_ctx->kernel_upscale = nullptr;
1977 backend_ctx->kernel_upscale_bilinear = nullptr;
1978 }
1979 }
1980
1981 // concat
1982 {
1983#ifdef GGML_OPENCL_EMBED_KERNELS
1984 const std::string kernel_src {
1985 #include "concat.cl.h"
1986 };
1987#else
1988 const std::string kernel_src = read_file("concat.cl");
1989#endif
1990 cl_program prog =
1991 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1992 CL_CHECK((backend_ctx->kernel_concat_f32 = clCreateKernel(prog, "kernel_concat_f32", &err), err));
1993 CL_CHECK(clReleaseProgram(prog));
1994 GGML_LOG_CONT(".");
1995 }
1996
1997 // timestep_embedding
1998 {
1999#ifdef GGML_OPENCL_EMBED_KERNELS
2000 const std::string kernel_src {
2001 #include "tsembd.cl.h"
2002 };
2003#else
2005 const std::string kernel_src = read_file("tsembd.cl");
2006#endif
2007 if (!kernel_src.empty()) {
2008 backend_ctx->program_tsembd =
2009 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
2010 CL_CHECK((backend_ctx->kernel_timestep_embedding = clCreateKernel(backend_ctx->program_tsembd, "kernel_timestep_embedding", &err), err));
2011 GGML_LOG_CONT(".");
2012 } else {
2013 GGML_LOG_WARN("ggml_opencl: timestep_embedding kernel source not found or empty. This op will not be available.\n");
2014 backend_ctx->program_tsembd = nullptr;
2015 backend_ctx->kernel_timestep_embedding = nullptr;
2016 }
2017 }
2018
2019 // set_rows
2020 {
2021#ifdef GGML_OPENCL_EMBED_KERNELS
2022 const std::string kernel_src {
2023 #include "set_rows.cl.h"
2024 };
2025#else
2026 const std::string kernel_src = read_file("set_rows.cl");
2027#endif
2028 backend_ctx->program_set_rows =
2029 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
2030
2031 CL_CHECK((backend_ctx->kernel_set_rows_f32_i64 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f32_i64", &err), err));
2032 CL_CHECK((backend_ctx->kernel_set_rows_f32_i32 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f32_i32", &err), err));
2033 CL_CHECK((backend_ctx->kernel_set_rows_f16_i64 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f16_i64", &err), err));
2034 CL_CHECK((backend_ctx->kernel_set_rows_f16_i32 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f16_i32", &err), err));
2035 GGML_LOG_CONT(".");
2036 }
2037
2038 // conv2d
2039 {
2040 #ifdef GGML_OPENCL_EMBED_KERNELS
2041 const std::string kernel_src {
2042 #include "conv2d.cl.h"
2043 };
2044 const std::string kernel_src_f16_f32 {
2045 #include "conv2d_f16_f32.cl.h"
2046 };
2047 #else
2048 const std::string kernel_src = read_file("conv2d.cl");
2049 const std::string kernel_src_f16_f32 = read_file("conv2d_f16_f32.cl");
2050 #endif
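        // The same conv2d source provides both precisions: -DUSE_FP16=1 selects the f16 path.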
2051 if (!kernel_src.empty()) {
2052 backend_ctx->program_conv_2d_f16 =
2053 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), (std::string(compile_opts) + " -DUSE_FP16=1").c_str());
2054 CL_CHECK((backend_ctx->kernel_conv_2d_f16 = clCreateKernel(backend_ctx->program_conv_2d_f16, "kernel_conv_2d", &err), err));
2055 GGML_LOG_CONT(".");
2056 backend_ctx->program_conv_2d_f32 =
2057 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
2058 CL_CHECK((backend_ctx->kernel_conv_2d_f32 = clCreateKernel(backend_ctx->program_conv_2d_f32, "kernel_conv_2d", &err), err));
2059 GGML_LOG_CONT(".");
2060 } else {
2061 GGML_LOG_WARN("ggml_opencl: conv2d kernel source not found or empty. This op will not be available.\n");
2062 backend_ctx->program_conv_2d_f16 = nullptr;
2063 backend_ctx->kernel_conv_2d_f16 = nullptr;
2064 backend_ctx->program_conv_2d_f32 = nullptr;
2065 backend_ctx->kernel_conv_2d_f32 = nullptr;
2066 }
2067 if (!kernel_src_f16_f32.empty()) {
2068 backend_ctx->program_conv_2d_f16_f32 =
2069 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f16_f32.c_str(), compile_opts);
2070 CL_CHECK((backend_ctx->kernel_conv_2d_f16_f32 = clCreateKernel(backend_ctx->program_conv_2d_f16_f32, "kernel_conv_2d", &err), err));
2071 GGML_LOG_CONT(".");
2072 } else {
2073 GGML_LOG_WARN("ggml_opencl: conv2d_f16_f32 kernel source not found or empty. This op will not be available.\n");
2074 backend_ctx->program_conv_2d_f16_f32 = nullptr;
2075 backend_ctx->kernel_conv_2d_f16_f32 = nullptr;
2076 }
2077 }
2078
2079 // ssm_conv
2080 {
2081#ifdef GGML_OPENCL_EMBED_KERNELS
2082 const std::string kernel_src {
2083 #include "ssm_conv.cl.h"
2084 };
2085#else
2086 const std::string kernel_src = read_file("ssm_conv.cl");
2087#endif
2088 cl_program prog =
2089 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
2090
2091 CL_CHECK((backend_ctx->kernel_ssm_conv_f32_f32 = clCreateKernel(prog, "kernel_ssm_conv_f32_f32", &err), err));
2092 CL_CHECK((backend_ctx->kernel_ssm_conv_f32_f32_4 = clCreateKernel(prog, "kernel_ssm_conv_f32_f32_4", &err), err));
2093 CL_CHECK(clReleaseProgram(prog));
2094 GGML_LOG_CONT(".");
2095 }
2096
2097 // mul_mv_id_q4_0_f32_8x_flat
2098 {
2099#ifdef GGML_OPENCL_EMBED_KERNELS
2100 const std::string kernel_src {
2101 #include "mul_mv_id_q4_0_f32_8x_flat.cl.h"
2102 };
2103#else
2104 const std::string kernel_src = read_file("mul_mv_id_q4_0_f32_8x_flat.cl");
2105#endif
2106 backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat =
2107 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
2108
2109 CL_CHECK((backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat, "kernel_mul_mv_id_q4_0_f32_8x_flat", &err), err));
2110 GGML_LOG_CONT(".");
2111 }
2112
2113 // mul_mv_id_q8_0_f32
2114 {
2115#ifdef GGML_OPENCL_EMBED_KERNELS
2116 const std::string kernel_src {
2117 #include "mul_mv_id_q8_0_f32.cl.h"
2118 };
2119#else
2120 const std::string kernel_src = read_file("mul_mv_id_q8_0_f32.cl");
2121#endif
2122 backend_ctx->program_mul_mv_id_q8_0_f32 =
2123 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
2124
2125 CL_CHECK((backend_ctx->kernel_mul_mv_id_q8_0_f32 = clCreateKernel(backend_ctx->program_mul_mv_id_q8_0_f32, "kernel_mul_mv_id_q8_0_f32", &err), err));
2126 GGML_LOG_CONT(".");
2127 }
2128
2129 // mul_mv_id_q8_0_f32_flat
2130 {
2131#ifdef GGML_OPENCL_EMBED_KERNELS
2132 const std::string kernel_src {
2133 #include "mul_mv_id_q8_0_f32_flat.cl.h"
2134 };
2135#else
2136 const std::string kernel_src = read_file("mul_mv_id_q8_0_f32_flat.cl");
2137#endif
2138 backend_ctx->program_mul_mv_id_q8_0_f32_flat =
2139 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
2140
2141 CL_CHECK((backend_ctx->kernel_mul_mv_id_q8_0_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_id_q8_0_f32_flat, "kernel_mul_mv_id_q8_0_f32_flat", &err), err));
2142 GGML_LOG_CONT(".");
2143 }
2144
2145 // mul_mv_id_mxfp4_f32
2146 {
2147#ifdef GGML_OPENCL_EMBED_KERNELS
2148 const std::string kernel_src {
2149 #include "mul_mv_id_mxfp4_f32.cl.h"
2150 };
2151#else
2152 const std::string kernel_src = read_file("mul_mv_id_mxfp4_f32.cl");
2153#endif
2154 backend_ctx->program_mul_mv_id_mxfp4_f32 =
2155 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
2156
2157 CL_CHECK((backend_ctx->kernel_mul_mv_id_mxfp4_f32 = clCreateKernel(backend_ctx->program_mul_mv_id_mxfp4_f32, "kernel_mul_mv_id_mxfp4_f32", &err), err));
2158 GGML_LOG_CONT(".");
2159 }
2160
2161 // mul_mv_id_mxfp4_f32_flat
2162 {
2163#ifdef GGML_OPENCL_EMBED_KERNELS
2164 const std::string kernel_src {
2165 #include "mul_mv_id_mxfp4_f32_flat.cl.h"
2166 };
2167#else
2168 const std::string kernel_src = read_file("mul_mv_id_mxfp4_f32_flat.cl");
2169#endif
2170 backend_ctx->program_mul_mv_id_mxfp4_f32_flat =
2171 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
2172
2173 CL_CHECK((backend_ctx->kernel_mul_mv_id_mxfp4_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_id_mxfp4_f32_flat, "kernel_mul_mv_id_mxfp4_f32_flat", &err), err));
2174 GGML_LOG_CONT(".");
2175 }
2176
2177 // Adreno kernels
2178#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
2179 // transpose
2180 {
2181#ifdef GGML_OPENCL_EMBED_KERNELS
2182 const std::string kernel_src {
2183 #include "transpose.cl.h"
2184 };
2185#else
2186 const std::string kernel_src = read_file("transpose.cl");
2187#endif
2188 backend_ctx->program_transpose =
2189 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
2190
2191 CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32_16", &err), err));
2192 CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32", &err), err));
2193 CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16", &err), err));
2194 CL_CHECK((backend_ctx->kernel_transpose_16_buf = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_buf", &err), err));
2195 CL_CHECK((backend_ctx->kernel_transpose_16_4x1 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_4x1", &err), err));
2196 GGML_LOG_CONT(".");
2197 }
2198
2199 // gemv_noshuffle_general
2200 {
2201 std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
2202 " -cl-mad-enable "
2203 " -DSIMDGROUP_WIDTH=" +
2204 std::to_string(backend_ctx->adreno_wave_size);
2205 if (backend_ctx->has_vector_subgroup_broadcast) {
2206 CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
2207 }
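        // NB: the VECTOR_SUB_GROUP_BROADCAT spelling matches the macro name used in the
        // gemv kernel sources, so it must not be "corrected" here.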
2208
2209#ifdef GGML_OPENCL_EMBED_KERNELS
2210 const std::string kernel_src_CL_gemv_general {
2211 #include "gemv_noshuffle_general.cl.h"
2212 };
2213#else
2214 const std::string kernel_src_CL_gemv_general = read_file("gemv_noshuffle_general.cl");
2215#endif
2216
2217 backend_ctx->program_CL_gemv_general = build_program_from_source(
2218 backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv_general.c_str(), CL_gemv_compile_opts);
2219
2220 CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err));
2221 GGML_LOG_CONT(".");
2222 }
2223
2224 // gemv_noshuffle
2225 {
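        // Four specializations of the same gemv source, each with LINE_STRIDE_A and
        // BLOCK_STRIDE_A baked in at compile time; the program names suggest the common
        // LLaMA-style weight shapes (4096x4096, 4096x11008, 11008x4096, 32000x4096).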
2226 // Gemv 2048, 16384
2227 std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
2228 " -cl-mad-enable "
2229 " -DLINE_STRIDE_A=2048 "
2230 " -DBLOCK_STRIDE_A=16384 "
2231 " -DSIMDGROUP_WIDTH=" +
2232 std::to_string(backend_ctx->adreno_wave_size);
2233 if (backend_ctx->has_vector_subgroup_broadcast) {
2234 CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
2235 }
2236
2237#ifdef GGML_OPENCL_EMBED_KERNELS
2238 const std::string kernel_src_CL_gemv {
2239 #include "gemv_noshuffle.cl.h"
2240 };
2241#else
2242 const std::string kernel_src_CL_gemv = read_file("gemv_noshuffle.cl");
2243#endif
2244
2245 backend_ctx->program_CL_gemv_4096_1_4096 = build_program_from_source(
2246 backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
2247 CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err));
2248 GGML_LOG_CONT(".");
2249
        // Gemv 2048, 16384 -- the A strides match the 4096_1_4096 build above;
        // this program is compiled separately for the 4096_1_11008 shape.
2251 CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
2252 " -cl-mad-enable "
2253 " -DLINE_STRIDE_A=2048 "
2254 " -DBLOCK_STRIDE_A=16384 "
2255 " -DSIMDGROUP_WIDTH=" +
2256 std::to_string(backend_ctx->adreno_wave_size);
2257 if (backend_ctx->has_vector_subgroup_broadcast) {
2258 CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
2259 }
2260
2261 backend_ctx->program_CL_gemv_4096_1_11008 = build_program_from_source(
2262 backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
2263 CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err));
2264 GGML_LOG_CONT(".");
2265
2266 // Gemv 5504, 44032
2267 CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
2268 " -cl-mad-enable "
2269 " -DLINE_STRIDE_A=5504 "
2270 " -DBLOCK_STRIDE_A=44032 "
2271 " -DSIMDGROUP_WIDTH=" +
2272 std::to_string(backend_ctx->adreno_wave_size);
2273 if (backend_ctx->has_vector_subgroup_broadcast) {
2274 CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
2275 }
2276
2277 backend_ctx->program_CL_gemv_11008_1_4096 = build_program_from_source(
2278 backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
2279 CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err));
2280 GGML_LOG_CONT(".");
2281
2282 // Gemv 16000, 128000
2283 CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
2284 " -cl-mad-enable "
2285 " -DLINE_STRIDE_A=16000 "
2286 " -DBLOCK_STRIDE_A=128000 "
2287 " -DSIMDGROUP_WIDTH=" +
2288 std::to_string(backend_ctx->adreno_wave_size);
2289
2290 if (backend_ctx->has_vector_subgroup_broadcast) {
2291 CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
2292 }
2293
2294 backend_ctx->program_CL_gemv_32000_1_4096 = build_program_from_source(
2295 backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
2296 CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_32000_1_4096, "kernel_gemv_noshuffle", &err), err));
2297 GGML_LOG_CONT(".");
2298 }
2299
2300 // mul_mat_Ab_Bi_8x4
2301 {
2302#ifdef GGML_OPENCL_EMBED_KERNELS
2303 const std::string kernel_src_CL_gemm {
2304 #include "mul_mat_Ab_Bi_8x4.cl.h"
2305 };
2306#else
2307 const std::string kernel_src_CL_gemm = read_file("mul_mat_Ab_Bi_8x4.cl");
2308#endif
2309 backend_ctx->program_CL_gemm = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_CL_gemm.c_str(), compile_opts);
2310 CL_CHECK((backend_ctx->CL_mul_mat_Ab_Bi_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mat_Ab_Bi_8x4", &err), err));
2311 GGML_LOG_CONT(".");
2312 }
2313
2314 // mul_mm_q8_0_f32_8x4
2315 {
2316#ifdef GGML_OPENCL_EMBED_KERNELS
2317 const std::string kernel_src_q8_8x4_gemm {
2318 #include "mul_mm_q8_0_f32_8x4.cl.h"
2319 };
2320#else
2321 const std::string kernel_src_q8_8x4_gemm = read_file("mul_mm_q8_0_f32_8x4.cl");
2322#endif
        // Use a local program object here so the handle stored in program_CL_gemm by the
        // mul_mat_Ab_Bi_8x4 block above is not overwritten (the kernel keeps the program alive).
        cl_program prog = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_q8_8x4_gemm.c_str(), compile_opts);
        CL_CHECK((backend_ctx->kernel_mul_mm_q8_0_f32_8x4 = clCreateKernel(prog, "kernel_mul_mm_q8_0_f32_8x4", &err), err));
        CL_CHECK(clReleaseProgram(prog));
2325 GGML_LOG_CONT(".");
2326 }
2327
2328 // gemv_noshuffle_general_q8_0_f32
2329 {
2330 std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
2331 " -cl-mad-enable "
2332 " -DSIMDGROUP_WIDTH=" +
2333 std::to_string(backend_ctx->adreno_wave_size);
2334 if (backend_ctx->has_vector_subgroup_broadcast) {
2335 CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
2336 }
2337
2338#ifdef GGML_OPENCL_EMBED_KERNELS
2339 const std::string kernel_src_CL_gemv_general {
2340 #include "gemv_noshuffle_general_q8_0_f32.cl.h"
2341 };
2342#else
2343 const std::string kernel_src_CL_gemv_general = read_file("gemv_noshuffle_general_q8_0_f32.cl");
2344#endif
2345
2346 cl_program prog = build_program_from_source(
2347 backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv_general.c_str(), CL_gemv_compile_opts);
2348
2349 CL_CHECK((backend_ctx->CL_mul_mat_vec_q8_0_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle", &err), err));
2350 CL_CHECK(clReleaseProgram(prog));
2351 GGML_LOG_CONT(".");
2352 }
2353
2354 std::string CL_moe_compile_opts = std::string("-cl-std=") + opencl_c_std +
2355 " -cl-mad-enable "
2356 " -cl-fast-relaxed-math";
2357
2358 // gemv_moe_mxfp4_f32
2359 {
2360#ifdef GGML_OPENCL_EMBED_KERNELS
2361 const std::string kernel_src {
2362 #include "gemv_moe_mxfp4_f32.cl.h"
2363 };
2364#else
2365 const std::string kernel_src = read_file("gemv_moe_mxfp4_f32.cl");
2366#endif
2367 backend_ctx->program_gemv_moe_mxfp4_f32 =
2368 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
2369
2370 CL_CHECK((backend_ctx->kernel_gemv_moe_mxfp4_f32 = clCreateKernel(backend_ctx->program_gemv_moe_mxfp4_f32, "kernel_gemv_moe_mxfp4_f32", &err), err));
2371 GGML_LOG_CONT(".");
2372 }
2373
2374 // gemm_moe_mxfp4_f32
2375 {
2376#ifdef GGML_OPENCL_EMBED_KERNELS
2377 const std::string kernel_src {
2378 #include "gemm_moe_mxfp4_f32.cl.h"
2379 };
2380#else
2381 const std::string kernel_src = read_file("gemm_moe_mxfp4_f32.cl");
2382#endif
2383 backend_ctx->program_gemm_moe_mxfp4_f32 =
2384 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
2385
2386 CL_CHECK((backend_ctx->kernel_gemm_moe_mxfp4_f32 = clCreateKernel(backend_ctx->program_gemm_moe_mxfp4_f32, "kernel_gemm_moe_mxfp4_f32", &err), err));
2387 GGML_LOG_CONT(".");
2388 }
2389#endif // GGML_OPENCL_USE_ADRENO_KERNELS
2390 GGML_LOG_CONT("\n");
2391}
2392
2397static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev);
2398
2399namespace /* anonymous */ {
2400extern struct ggml_backend_device_i ggml_backend_opencl_device_i;
2401}
2402
2403// Look for available and suitable devices.
2404static std::vector<ggml_backend_device> ggml_opencl_probe_devices(ggml_backend_reg * reg) {
2405 std::vector<ggml_backend_device> found_devices;
2406
2407#ifdef GGML_OPENCL_PROFILING
2408 GGML_LOG_INFO("ggml_opencl: OpenCL profiling enabled\n");
2409#endif
2410
2411 struct cl_device;
2412 struct cl_platform {
2413 cl_platform_id id;
2414 unsigned number;
2415 char name[128];
2416 char vendor[128];
2417 struct cl_device * devices;
2418 unsigned n_devices;
2419 struct cl_device * default_device;
2420 };
2421
2422 struct cl_device {
2423 struct cl_platform * platform;
2424 cl_device_id id;
2425 unsigned number;
2426 cl_device_type type;
2427 char name[128];
2428 char version[128];
2429 };
2430
2431 enum { NPLAT = 16, NDEV = 16 };
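    // Enumeration is capped: at most NPLAT platforms and NDEV devices in total are considered.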
2432
2433 struct cl_platform platforms[NPLAT];
2434 unsigned n_platforms = 0;
2435 struct cl_device devices[NDEV];
2436 unsigned n_devices = 0;
2437 struct cl_device * default_device = NULL;
2438 unsigned default_platform_number = 0;
2439
2440 cl_platform_id platform_ids[NPLAT];
2441 if (clGetPlatformIDs(NPLAT, platform_ids, &n_platforms) != CL_SUCCESS) {
        GGML_LOG_ERROR("ggml_opencl: platform IDs not available.\n");
2443 return found_devices;
2444 }
2445
2446 for (unsigned i = 0; i < n_platforms; i++) {
2447 struct cl_platform * p = &platforms[i];
2448 p->number = i;
2449 p->id = platform_ids[i];
2450 CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_NAME, sizeof(p->name), &p->name, NULL));
2451 CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_VENDOR, sizeof(p->vendor), &p->vendor, NULL));
2452
2453 cl_device_id device_ids[NDEV];
2454 cl_int clGetDeviceIDsError = clGetDeviceIDs(p->id, CL_DEVICE_TYPE_ALL, NDEV, device_ids, &p->n_devices);
2455 if (clGetDeviceIDsError == CL_DEVICE_NOT_FOUND) {
2456 p->n_devices = 0;
2457 } else {
2458 CL_CHECK(clGetDeviceIDsError);
2459 }
2460 p->devices = p->n_devices > 0 ? &devices[n_devices] : NULL;
2461 p->default_device = NULL;
2462
2463 for (unsigned j = 0; j < p->n_devices; j++) {
2464 struct cl_device * d = &devices[n_devices];
2465 d->number = n_devices++;
2466 d->id = device_ids[j];
2467 d->platform = p;
2468 CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL));
2469 CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL));
2470 CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_VERSION, sizeof(d->version), &d->version, NULL));
2471
2472 if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) {
2473 p->default_device = d;
2474 }
2475 }
2476
2477 if (default_device == NULL && p->default_device != NULL) {
2478 default_device = p->default_device;
2479 default_platform_number = i;
2480 }
2481 }
2482
2483 if (n_devices == 0) {
        GGML_LOG_ERROR("ggml_opencl: could not find any OpenCL devices.\n");
2485 return found_devices;
2486 }
2487
2488 char * user_platform_string = getenv("GGML_OPENCL_PLATFORM");
2489 char * user_device_string = getenv("GGML_OPENCL_DEVICE");
2490 int user_platform_number = -1;
2491 int user_device_number = -1;
2492 cl_device * candidate_devices = nullptr;
2493 unsigned n_candidate_devices = 0;
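    // GGML_OPENCL_PLATFORM and GGML_OPENCL_DEVICE accept either a numeric index or a
    // name substring; the numeric form is tried first. Hypothetical invocation:
    //   GGML_OPENCL_PLATFORM=QUALCOMM GGML_OPENCL_DEVICE=0 <application>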
2494
2495 unsigned n;
2496 if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) {
2497 user_platform_number = (int)n;
2498 }
2499 if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < n_devices) {
2500 user_device_number = (int)n;
2501 }
2502 if (user_platform_number != -1 && user_device_number != -1) {
2503 cl_platform* platform = &platforms[user_platform_number];
2504 if ((unsigned)user_device_number >= platform->n_devices) {
2505 GGML_LOG_ERROR("ggml_opencl: invalid device number %d\n", user_device_number);
2506 exit(1);
2507 }
2508 default_device = &platform->devices[user_device_number];
2509 candidate_devices = platform->devices;
2510 n_candidate_devices = platform->n_devices;
2511 } else {
2512 // Choose a platform by matching a substring.
2513 if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) {
2514 for (unsigned i = 0; i < n_platforms; i++) {
2515 struct cl_platform * p = &platforms[i];
2516 if (strstr(p->name, user_platform_string) != NULL ||
2517 strstr(p->vendor, user_platform_string) != NULL) {
2518 user_platform_number = (int)i;
2519 break;
2520 }
2521 }
2522 if (user_platform_number == -1) {
2523 GGML_LOG_ERROR("ggml_opencl: no platform matching '%s' was found.\n", user_platform_string);
2524 exit(1);
2525 }
2526 }
2527
2528 int platform_idx = user_platform_number != -1 ? user_platform_number : default_platform_number;
2529 struct cl_platform * p = &platforms[platform_idx];
2530 candidate_devices = p->devices;
2531 n_candidate_devices = p->n_devices;
2532 default_device = p->default_device;
2533 if (n_candidate_devices == 0) {
2534 GGML_LOG_ERROR("ggml_opencl: selected platform '%s' does not have any devices.\n", p->name);
2535 exit(1);
2536 }
2537
2538 if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) {
2539 for (unsigned i = 0; i < n_candidate_devices; i++) {
2540 struct cl_device * d = &candidate_devices[i];
2541 if (strstr(d->name, user_device_string) != NULL) {
2542 user_device_number = d->number;
2543 break;
2544 }
2545 }
2546 if (user_device_number == -1) {
2547 GGML_LOG_ERROR("ggml_opencl: no device matching '%s' was found.\n", user_device_string);
2548 exit(1);
2549 }
2550 }
2551 if (user_device_number != -1) {
2552 candidate_devices = &devices[user_device_number];
2553 n_candidate_devices = 1;
2554 default_device = &candidate_devices[0];
2555 }
2556
2557 GGML_ASSERT(n_candidate_devices > 0);
2558
2559 if (default_device == NULL) {
2560 default_device = &candidate_devices[0];
2561 }
2562 }
2563
2564 GGML_ASSERT(n_candidate_devices != 0 && candidate_devices);
2565
2566 // Put the default device in front.
2567 for (unsigned i = 1; i < n_candidate_devices; i++) {
2568 if (&candidate_devices[i] == default_device) {
2569 std::swap(candidate_devices[0], candidate_devices[i]);
2570 default_device = &candidate_devices[0];
2571 break;
2572 }
2573 }
2574
2575 GGML_LOG_INFO("ggml_opencl: selected platform: '%s'\n", default_device->platform->name);
2576
2577 std::vector<cl_device_id> device_ids;
2578 for (auto dev = candidate_devices, dev_end = candidate_devices + n_candidate_devices; dev != dev_end; dev++) {
2579 device_ids.push_back(dev->id);
2580 }
2581
2582 cl_int err;
2583 cl_context shared_context;
2584 cl_context_properties properties[] = { (intptr_t) CL_CONTEXT_PLATFORM, (intptr_t) default_device->platform->id, 0 };
2585
2586 CL_CHECK(
2587 (shared_context = clCreateContext(properties, device_ids.size(), device_ids.data(), NULL, NULL, &err), err));
2588
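    // A single context is shared by all candidate devices; per-device backend state is
    // created lazily by ggml_cl2_init() below.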
2589 for (auto dev = candidate_devices, dev_end = candidate_devices + n_candidate_devices; dev != dev_end; dev++) {
2590 GGML_LOG_INFO("\nggml_opencl: device: '%s (%s)'\n", dev->name, dev->version);
2591
2592 auto dev_ctx = std::unique_ptr<ggml_backend_opencl_device_context>(new ggml_backend_opencl_device_context{
2593 /*.platform =*/dev->platform->id,
            /*.platform_name =*/dev->platform->name,
2595 /*.device =*/dev->id,
2596 /*.device_name =*/dev->name,
2597 /*.device_type =*/dev->type,
2598 /*.device_version =*/dev->version,
2599 /*.backend_ctx =*/nullptr,
2600 /*.buffer_type =*/{},
2601 /*.context =*/shared_context,
2602 });
2603
2604 found_devices.push_back(ggml_backend_device{
2605 /* .iface = */ ggml_backend_opencl_device_i,
2606 /* .reg = */ reg,
2607 /* .context = */ dev_ctx.get(),
2608 });
2609
2610 if (!ggml_cl2_init(&found_devices.back())) {
2611 found_devices.pop_back();
2612 GGML_LOG_INFO("ggml_opencl: drop unsupported device.\n");
2613 continue;
2614 }
2615
2616 dev_ctx.release();
2617 }
2618
2619 if (found_devices.size()) {
2620 auto * dev_ctx = static_cast<ggml_backend_opencl_device_context *>(found_devices.front().context);
2621 GGML_LOG_INFO("ggml_opencl: default device: '%s (%s)'\n", dev_ctx->device_name.c_str(),
2622 dev_ctx->device_version.c_str());
2623
2624 if (dev_ctx->device_type != CL_DEVICE_TYPE_GPU) {
2625 GGML_LOG_WARN("ggml_opencl: warning, the default device is not a GPU: '%s'.\n",
2626 dev_ctx->device_name.c_str());
2627 }
2628 }
2629
2630 return found_devices;
2631}
2632
2633// Initialize device if it is supported (returns nullptr if it is not).
2634static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
2635 GGML_ASSERT(dev);
2636 GGML_ASSERT(dev->context);
2637
2638 ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) dev->context;
2639 GGML_ASSERT(dev_ctx->platform);
2640 GGML_ASSERT(dev_ctx->device);
2641
2642 if (dev_ctx->backend_ctx) {
2643 return dev_ctx->backend_ctx;
2644 }
2645
2646 auto backend_ctx = std::make_unique<ggml_backend_opencl_context>();
2647 backend_ctx->device = dev_ctx->device;
2648 backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
2649
2650 // ref_count get increased in ggml_backend_opencl_device_init
2651 // This function is also used to retrieve backend context, so we don't want
2652 // to increase ref_count for each call. We only want to increase ref_count
2653 // when the associated device is initialized
2654 backend_ctx->ref_count = 0;
2655
2656 if (strstr(dev_ctx->device_name.c_str(), "Adreno") ||
2657 strstr(dev_ctx->device_name.c_str(), "Qualcomm") ||
2658 strstr(dev_ctx->device_version.c_str(), "Adreno")) {
2659 backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
2660 // Usually device version contains the detailed device name
2661 backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_version.c_str());
2662 if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::ADRENO_UNKNOWN) {
2663 backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_name.c_str());
2664 }
2665
2666 // Use wave size of 64 for all Adreno GPUs.
2667 backend_ctx->adreno_wave_size = 64;
2668 } else if (strstr(dev_ctx->device_name.c_str(), "Intel")) {
2669 backend_ctx->gpu_family = GPU_FAMILY::INTEL;
2670 } else {
2671 GGML_LOG_ERROR("Unsupported GPU: %s\n", dev_ctx->device_name.c_str());
2672 backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
2673 return nullptr;
2674 }
2675
2676#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
2677 if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) {
2678 GGML_LOG_ERROR("ggml_opencl: Adreno-specific kernels should not be enabled for non-Adreno GPUs; "
2679 "run on an Adreno GPU or recompile with CMake option `-DGGML_OPENCL_USE_ADRENO_KERNELS=OFF`\n");
2680 return nullptr;
2681 }
2682#endif
2683
2684 // Populate backend device name
2685 backend_ctx->device_name = dev_ctx->device_name;
2686
2687 // A local ref of cl_device_id for convenience
2688 cl_device_id device = backend_ctx->device;
2689
2690 ggml_cl_version platform_version = get_opencl_platform_version(dev_ctx->platform);
2691
2692 // Check device OpenCL version, OpenCL 2.0 or above is required
2693 ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device);
2694 if (opencl_c_version.major < 2) {
2695 GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n");
2696 return nullptr;
2697 }
2698
2699 // Check driver version
2700 size_t driver_version_str_size;
2701 clGetDeviceInfo(device, CL_DRIVER_VERSION, 0, NULL, &driver_version_str_size);
2702 char *driver_version = (char *)alloca(driver_version_str_size + 1);
2703 clGetDeviceInfo(device, CL_DRIVER_VERSION, driver_version_str_size, driver_version, NULL);
2704 driver_version[driver_version_str_size] = '\0';
2705 GGML_LOG_INFO("ggml_opencl: OpenCL driver: %s\n", driver_version);
2706 backend_ctx->driver_version = driver_version;
2707
2708 backend_ctx->adreno_cl_compiler_version = get_adreno_cl_compiler_version(driver_version);
2709 backend_ctx->has_vector_subgroup_broadcast =
2710 (backend_ctx->adreno_cl_compiler_version.type == E031 && backend_ctx->adreno_cl_compiler_version.major >= 47) ||
2711 (backend_ctx->adreno_cl_compiler_version.type == DX && backend_ctx->adreno_cl_compiler_version.major >= 17);
2712 GGML_LOG_INFO("ggml_opencl: vector subgroup broadcast support: %s\n",
2713 backend_ctx->has_vector_subgroup_broadcast ? "true" : "false");
2714
2715 size_t ext_str_size;
2716 clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
2717 char *ext_buffer = (char *)alloca(ext_str_size + 1);
2718 clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
2719 ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
2720 // Check if ext_buffer contains cl_khr_fp16
2721 backend_ctx->fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
2722 GGML_LOG_INFO("ggml_opencl: device FP16 support: %s\n", backend_ctx->fp16_support ? "true" : "false");
2723
2724 // fp16 is required
2725 if (!backend_ctx->fp16_support) {
2726 GGML_LOG_ERROR("ggml_opencl: device does not support FP16\n");
2727 return nullptr;
2728 }
2729
2730 // If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
    // optional in OpenCL 3.0 (cl_khr_subgroups is mandatory in OpenCL 2.x)
2732 if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
2733 strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
2734 GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
            "(note that subgroup support is an optional feature in OpenCL 3.0)\n");
2736 return nullptr;
2737 }
2738
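    // CL_DEVICE_MEM_BASE_ADDR_ALIGN is reported in bits; convert it to bytes.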
2739 cl_uint base_align_in_bits;
2740 CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &base_align_in_bits, NULL));
2741 GGML_ASSERT(base_align_in_bits % 8u == 0);
2742 backend_ctx->alignment = base_align_in_bits / 8u;
2743 GGML_LOG_INFO("ggml_opencl: mem base addr align: %u\n", backend_ctx->alignment);
2744
2745 clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
2746 GGML_LOG_INFO("ggml_opencl: max mem alloc size: %zu MB\n", backend_ctx->max_alloc_size/1024/1024);
2747
2748 clGetDeviceInfo(device, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, sizeof(size_t), &backend_ctx->image_max_buffer_size, NULL);
    GGML_LOG_INFO("ggml_opencl: device max image buffer size (pixels): %zu\n", backend_ctx->image_max_buffer_size);
2750
2751 clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &backend_ctx->max_workgroup_size, NULL);
    GGML_LOG_INFO("ggml_opencl: device max workgroup size: %zu\n", backend_ctx->max_workgroup_size);
2753
2754 // Check SVM.
2755 cl_device_svm_capabilities svm_caps;
2756 CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_SVM_CAPABILITIES, sizeof(cl_device_svm_capabilities), &svm_caps, 0));
2757 GGML_LOG_INFO("ggml_opencl: SVM coarse grain buffer support: %s\n",
2758 svm_caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER ? "true" : "false");
2759 GGML_LOG_INFO("ggml_opencl: SVM fine grain buffer support: %s\n",
2760 svm_caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER ? "true" : "false");
2761 GGML_LOG_INFO("ggml_opencl: SVM fine grain system support: %s\n",
2762 svm_caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM ? "true" : "false");
2763 GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n",
2764 svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false");
2765
2766 if (opencl_c_version.major >= 3) {
2767 // Assume it is not available for 3.0, since it is optional in 3.0.
2768 // If compiling against 3.0, then we can query.
2769 backend_ctx->non_uniform_workgroups = false;
2770#if CL_TARGET_OPENCL_VERSION >= 300
2771 CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool),
2772 &backend_ctx->non_uniform_workgroups, 0));
2773#endif
2774 } else {
2775 GGML_ASSERT(opencl_c_version.major == 2);
2776 // Non-uniform workgroup sizes is mandatory feature in v2.x.
2777 backend_ctx->non_uniform_workgroups = true;
2778 }
2779
2780 // Print out configurations
2781#ifdef GGML_OPENCL_SOA_Q
2782 GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n");
2783#endif // GGML_OPENCL_SOA_Q
2784
2785#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
2786 GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n");
2787#endif // GGML_OPENCL_USE_ADRENO_KERNELS
2788
2789 cl_int err;
2790
2791 // A local ref of cl_context for convenience
2792 cl_context context = backend_ctx->context = dev_ctx->context;
2793
    // An out-of-order queue (CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) was considered
    // here; a plain in-order queue is used instead, with profiling enabled on demand.
2798 cl_command_queue_properties command_queue_props = 0;
2799#ifdef GGML_OPENCL_PROFILING
2800 command_queue_props |= CL_QUEUE_PROFILING_ENABLE;
2801#endif
2802 CL_CHECK((backend_ctx->queue = clCreateCommandQueue(context, device, command_queue_props, &err), err));
2803
2804 // Load kernels
2805 load_cl_kernels(backend_ctx.get(), opencl_c_version);
2806
2807#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
2808 // Allocate intermediate buffers and images
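    // Fixed preallocation sizes for the transposed weights (A_q), scales (A_s) and
    // activations (B) staging buffers used by the Adreno GEMM path. The exact byte
    // counts appear tuned for large LLaMA-style weight matrices; they are clamped to
    // the device's max allocation size below.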
2809 size_t required_A_q_d_bytes = 311164928;
2810 size_t required_A_s_d_bytes = 38895616;
2811 size_t required_B_d_bytes = 45088768;
2812
2813 // Ensure buffer sizes do not exceed the maximum allocation size
2814 size_t max_A_q_d_bytes = MIN(required_A_q_d_bytes, backend_ctx->max_alloc_size);
2815 size_t max_A_s_d_bytes = MIN(required_A_s_d_bytes, backend_ctx->max_alloc_size);
2816 size_t max_B_d_bytes = MIN(required_B_d_bytes, backend_ctx->max_alloc_size);
2817 if (required_A_q_d_bytes > backend_ctx->max_alloc_size) {
2818 GGML_LOG_WARN("ggml_opencl: A_q_d buffer size reduced from %zu to %zu due to device limitations.\n",
2819 required_A_q_d_bytes, max_A_q_d_bytes);
2820 }
2821 if (required_A_s_d_bytes > backend_ctx->max_alloc_size) {
2822 GGML_LOG_WARN("ggml_opencl: A_s_d buffer size reduced from %zu to %zu due to device limitations.\n",
2823 required_A_s_d_bytes, max_A_s_d_bytes);
2824 }
2825 if (required_B_d_bytes > backend_ctx->max_alloc_size) {
2826 GGML_LOG_WARN("ggml_opencl: B_d buffer size reduced from %zu to %zu due to device limitations.\n",
2827 required_B_d_bytes, max_B_d_bytes);
2828 }
2829
2830 backend_ctx->prealloc_quant_trans.allocate(context, max_A_q_d_bytes);
2831 backend_ctx->prealloc_scales_trans.allocate(context, max_A_s_d_bytes);
2832 backend_ctx->prealloc_act_trans.allocate(context, max_B_d_bytes);
2833#endif // GGML_OPENCL_USE_ADRENO_KERNELS
2834
2835 backend_ctx->disable_fusion = getenv("GGML_OPENCL_DISABLE_FUSION") != nullptr;
2836
2837 dev_ctx->backend_ctx = backend_ctx.release();
2838 return dev_ctx->backend_ctx;
2839}
2840
2841static void ggml_cl2_free(ggml_backend_t backend) {
2842 ggml_backend_opencl_context * ctx = (ggml_backend_opencl_context *) backend->context;
2843 ctx->free();
2844
2845 // The CL context is shared by all backends, release it if all backends have been released
2846 bool should_release_opencl = true;
2847 for (auto device : g_ggml_backend_opencl_devices) {
2848 ggml_backend_opencl_device_context * ctx_dev = (ggml_backend_opencl_device_context *) device.context;
2849 if (ctx_dev->backend_ctx->ref_count > 0) {
2850 should_release_opencl = false;
2851 }
2852 }
2853
2854 if (should_release_opencl) {
2855 CL_CHECK(clReleaseContext(ctx->context));
2856 }
2857}
2858
2859//------------------------------------------------------------------------------
2860// Tensor extra management
2861//------------------------------------------------------------------------------
2862struct ggml_tensor_extra_cl {
2863 // The buffer object that holds the data.
2864 cl_mem data_device;
    // The offset into the buffer object. This is primarily for the scratch
    // buffer and view operations.
    // NB: this offset no longer includes the view offset (view_offs). Whenever
    // this offset is used, view_offs must be taken into account.
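    // The effective byte offset of the tensor payload inside data_device is
    // therefore offset + tensor->view_offs (see how subbuffer origins are
    // computed in ggml_backend_opencl_buffer_set_tensor below).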
2869 cl_ulong offset;
2870 // The actual size of the cl_mem object. This is needed when returning the
2871 // block to the pool.
2872 size_t actual_size;
2873
2874 void reset() {
2875 data_device = nullptr;
2876 offset = 0;
2877 actual_size = 0;
2878 }
2879};
2880
// Additional tensor extra structs for quantized tensors.
// These tensors are loaded from files and should not be allocated in scratch --
// they should always be allocated from the pool. Hence, they do not have an
// `offset`, which would indicate their location in the scratch buffer.
2885struct ggml_tensor_extra_cl_q4_0 {
2886 // Quantized values.
2887 cl_mem q = nullptr;
2888 // Quantized values in image1d_buffer_t.
2889 cl_mem q_img = nullptr;
2890 // Scales.
2891 cl_mem d = nullptr;
2892 // Scales in image1d_buffer_t.
2893 cl_mem d_img = nullptr;
2894 // Size of quantized values.
2895 size_t size_q = 0;
2896 // Size of scales.
2897 size_t size_d = 0;
2898
2899 ~ggml_tensor_extra_cl_q4_0() {
2900 reset();
2901 }
2902
2903 void reset() {
        // q and d are subbuffers into the bigger buffer allocated in ggml_backend_buffer.
        // They must be released so that the original buffer can be freed
        // without leaking memory.
2907 if (q != nullptr) {
2908 CL_CHECK(clReleaseMemObject(q));
2909 q = nullptr;
2910 }
2911 if (d != nullptr) {
2912 CL_CHECK(clReleaseMemObject(d));
2913 d = nullptr;
2914 }
        // Currently, q_img and d_img are only initialized when SMALL_ALLOC is
        // enabled. They point to the images in ggml_backend_opencl_buffer_context.
        // So, there is no need to release them here.
        // TODO: initialize them for the non-SMALL_ALLOC path, or remove them.
2919 q_img = nullptr;
2920 d_img = nullptr;
2921 size_q = 0;
2922 size_d = 0;
2923 }
2924};
2925
2926struct ggml_tensor_extra_cl_mxfp4 {
2927 // Quantized values.
2928 cl_mem q = nullptr;
2929 // Quantized values in image1d_buffer_t.
2930 cl_mem q_img = nullptr;
2931 // Scales in E8M0.
2932 cl_mem e = nullptr;
2933 // Scales in image1d_buffer_t.
2934 cl_mem e_img = nullptr;
2935 // Size of quantized values.
2936 size_t size_q = 0;
2937 // Size of scales.
2938 size_t size_e = 0;
2939
2940 ~ggml_tensor_extra_cl_mxfp4() {
2941 reset();
2942 }
2943
2944 void reset() {
        // q and e are subbuffers into the bigger buffer allocated in ggml_backend_buffer.
        // They must be released so that the original buffer can be freed
        // without leaking memory.
2948 if (q != nullptr) {
2949 CL_CHECK(clReleaseMemObject(q));
2950 q = nullptr;
2951 }
2952 if (e != nullptr) {
2953 CL_CHECK(clReleaseMemObject(e));
2954 e = nullptr;
2955 }
        // q_img is an image1d_buffer_t created over q in
        // ggml_backend_opencl_buffer_set_tensor; release it as well.
        if (q_img != nullptr) {
            CL_CHECK(clReleaseMemObject(q_img));
        }
        // e_img is currently unused; it could similarly wrap e to utilize the
        // image access path.
        q_img = nullptr;
        e_img = nullptr;
2964 size_q = 0;
2965 size_e = 0;
2966 }
2967};
2968
2969struct ggml_tensor_extra_cl_q8_0 {
2970 cl_mem q = nullptr;
2971 cl_mem q_img = nullptr;
2972
2973 cl_mem d = nullptr;
2974 cl_mem d_img = nullptr;
2975
2976 size_t size_q = 0;
2977 size_t size_d = 0;
2978
2979 ~ggml_tensor_extra_cl_q8_0() {
2980 reset();
2981 }
2982
2983 void reset() {
        // q and d are subbuffers into the bigger buffer allocated in ggml_backend_buffer.
        // They must be released so that the original buffer can be freed
        // without leaking memory.
2987 if (q != nullptr) {
2988 CL_CHECK(clReleaseMemObject(q));
2989 q = nullptr;
2990 }
2991 if (d != nullptr) {
2992 CL_CHECK(clReleaseMemObject(d));
2993 d = nullptr;
2994 }
2995 // Currently, q_img and d_img are not used. They can be image1d_buffer_t
2996 // that wraps around q and d to utilize image access path.
2997 q_img = nullptr;
2998 d_img = nullptr;
2999 size_q = 0;
3000 size_d = 0;
3001 }
3002};
3003
3004struct ggml_tensor_extra_cl_q6_K {
3005 // Lower 4 bits of quantized weights.
3006 cl_mem ql = nullptr;
3007 // Upper 2 bits of quantized weights.
3008 cl_mem qh = nullptr;
3009 // Scales for each block.
3010 cl_mem s = nullptr;
3011 // Scales for each super block.
3012 cl_mem d = nullptr;
3013
3014 size_t size_ql = 0;
3015 size_t size_qh = 0;
3016 size_t size_s = 0;
3017 size_t size_d = 0;
3018
3019 ~ggml_tensor_extra_cl_q6_K() {
3020 reset();
3021 }
3022
3023 void reset() {
3024 if (ql != nullptr) {
3025 CL_CHECK(clReleaseMemObject(ql));
3026 ql = nullptr;
3027 }
3028 if (qh != nullptr) {
3029 CL_CHECK(clReleaseMemObject(qh));
3030 qh = nullptr;
3031 }
3032 if (s != nullptr) {
3033 CL_CHECK(clReleaseMemObject(s));
3034 s = nullptr;
3035 }
3036 if (d != nullptr) {
3037 CL_CHECK(clReleaseMemObject(d));
3038 d = nullptr;
3039 }
3040
3041 size_ql = 0;
3042 size_qh = 0;
3043 size_s = 0;
3044 size_d = 0;
3045 }
3046};
3047
3048//------------------------------------------------------------------------------
3049// Backend API
3050//------------------------------------------------------------------------------
3051
3052//
3053// backend
3054//
3055static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
3056 return "OpenCL";
3057
3058 UNUSED(backend);
3059}
3060
3061static void ggml_backend_opencl_free(ggml_backend_t backend) {
3062 ggml_cl2_free(backend);
3063}
3064
3065static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
3066 GGML_UNUSED(backend);
3067 GGML_UNUSED(tensor);
3068 GGML_UNUSED(data);
3069 GGML_UNUSED(offset);
3070 GGML_UNUSED(size);
3071}
3072
3073static void ggml_backend_opencl_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
3074 GGML_UNUSED(backend);
3075 GGML_UNUSED(tensor);
3076 GGML_UNUSED(data);
3077 GGML_UNUSED(offset);
3078 GGML_UNUSED(size);
3079}
3080
3081static bool ggml_backend_opencl_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
3082 GGML_UNUSED(backend);
3083 GGML_UNUSED(src);
3084 GGML_UNUSED(dst);
3085 return false;
3086}
3087
3088static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
3089 auto * backend_ctx = static_cast<ggml_backend_opencl_context *>(backend->context);
3090
3091 cl_event evt;
3092 CL_CHECK(clEnqueueBarrierWithWaitList(backend_ctx->queue, 0, nullptr, &evt));
3093 CL_CHECK(clWaitForEvents(1, &evt));
3094 CL_CHECK(clReleaseEvent(evt));
3095}
3096
// Synchronizes the 'backend_ctx's device with the others so that commands
// enqueued to it won't start until commands on the other devices have
// completed.
3100static void sync_with_other_backends(ggml_backend_opencl_context * backend_ctx) {
    if (g_ggml_backend_opencl_devices.size() < 2) {
        return; // No other devices to synchronize with.
    }
3103
3104 std::vector<cl_event> events;
3105 events.reserve(g_ggml_backend_opencl_devices.size());
3106
3107 for (ggml_backend_device & backend_dev : g_ggml_backend_opencl_devices) {
3108 auto * other_backend_ctx = ggml_cl2_init(&backend_dev);
3109 if (backend_ctx != other_backend_ctx) {
3110 cl_event ev;
3111 CL_CHECK(clEnqueueMarkerWithWaitList(other_backend_ctx->queue, 0, nullptr, &ev));
3112 CL_CHECK(clFlush(other_backend_ctx->queue));
3113 events.push_back(ev);
3114 }
3115 }
3116
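    // The barrier makes all subsequently enqueued commands on backend_ctx's
    // queue wait for the markers recorded on every other device's queue.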
3117 CL_CHECK(clEnqueueBarrierWithWaitList(backend_ctx->queue, events.size(), events.data(), nullptr));
3118 for (auto ev : events) {
3119 CL_CHECK(clReleaseEvent(ev));
3120 }
3121}
3122
3123static void sync_with_other_backends(ggml_backend_t backend) {
3124 auto * backend_ctx = static_cast<ggml_backend_opencl_context *>(backend->context);
3125 sync_with_other_backends(backend_ctx);
3126}
3127
3128static bool ggml_opencl_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list<enum ggml_op> ops) {
3129 if (!ggml_can_fuse(cgraph, node_idx, ops)) {
3130 return false;
3131 }
3132
3133 if (ops.size() == 2 && ops.begin()[0] == GGML_OP_RMS_NORM && ops.begin()[1] == GGML_OP_MUL) {
3134 const ggml_tensor *rms_norm = cgraph->nodes[node_idx];
3135 const ggml_tensor *mul = cgraph->nodes[node_idx+1];
3136
3137 GGML_ASSERT(rms_norm->src[0]->type == GGML_TYPE_F32);
3138 GGML_ASSERT(rms_norm->type == GGML_TYPE_F32);
3139
3140 // rms_norm only supports f32
3141 if (mul->src[0]->type != GGML_TYPE_F32 ||
3142 mul->src[1]->type != GGML_TYPE_F32 ||
3143 mul->type != GGML_TYPE_F32) {
3144 return false;
3145 }
3146
3147 // if rms_norm is the B operand, then we don't handle broadcast
3148 if (rms_norm == mul->src[1] &&
3149 !ggml_are_same_shape(mul->src[0], rms_norm)) {
3150 return false;
3151 }
3152
3153 // rms_norm assumes contiguous rows
3154 if (!ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1])) {
3155 return false;
3156 }
3157 } else if (ops.size() == 3 && ops.begin()[0] == GGML_OP_NORM && ops.begin()[1] == GGML_OP_MUL && ops.begin()[2] == GGML_OP_ADD) {
3158 const ggml_tensor *norm = cgraph->nodes[node_idx];
3159 const ggml_tensor *mul = cgraph->nodes[node_idx+1];
3160 const ggml_tensor *add = cgraph->nodes[node_idx+2];
3161 const ggml_tensor *w = mul->src[0] == norm ? mul->src[1] : mul->src[0];
3162 const ggml_tensor *b = add->src[0] == mul ? add->src[1] : add->src[0];
3163
3164 // norm fusion only supports F32
3165 if (norm->src[0]->type != GGML_TYPE_F32 || w->type != GGML_TYPE_F32 || b->type != GGML_TYPE_F32) {
3166 return false;
3167 }
3168
3169 if (norm->src[0]->ne[0] % 4 != 0) {
3170 return false;
3171 }
3172
3173 if (!ggml_is_contiguous(norm->src[0]) || !ggml_is_contiguous(w) || !ggml_is_contiguous(b)) {
3174 return false;
3175 }
3176 } else if (ops.size() == 3 && ops.begin()[0] == GGML_OP_GROUP_NORM && ops.begin()[1] == GGML_OP_MUL && ops.begin()[2] == GGML_OP_ADD) {
3177 const ggml_tensor *gn = cgraph->nodes[node_idx];
3178 const ggml_tensor *mul = cgraph->nodes[node_idx+1];
3179 const ggml_tensor *add = cgraph->nodes[node_idx+2];
3180 const ggml_tensor *w = mul->src[0] == gn ? mul->src[1] : mul->src[0];
3181 const ggml_tensor *b = add->src[0] == mul ? add->src[1] : add->src[0];
3182
3183 if (gn->src[0]->type != GGML_TYPE_F32 || w->type != GGML_TYPE_F32 || b->type != GGML_TYPE_F32) {
3184 return false;
3185 }
3186
3187 if (!ggml_is_contiguous(gn->src[0]) || !ggml_is_contiguous(w) || !ggml_is_contiguous(b)) {
3188 return false;
3189 }
3190 }
3191
3192 return true;
3193}
3194
3195static void ggml_opencl_op_rms_norm_fused(ggml_backend_t backend, ggml_tensor * rms_norm_tensor, ggml_tensor * mul_tensor);
3196static void ggml_opencl_op_norm_fused(ggml_backend_t backend, ggml_tensor * norm_tensor, ggml_tensor * mul_tensor, ggml_tensor * add_tensor);
3197static void ggml_opencl_op_group_norm_fused(ggml_backend_t backend, ggml_tensor * gn_tensor, ggml_tensor * mul_tensor, ggml_tensor * add_tensor);
3198
3199static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
3200 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
3201
3202 for (int i = 0; i < cgraph->n_nodes; i++) {
3203 ggml_tensor * node = cgraph->nodes[i];
3204
3205 // NOTE: this may oversynchronize by synchronizing with
3206 // backends/devices which don't compute 'cgraph's
3207 // dependencies.
3208 sync_with_other_backends(backend);
3209
3210 if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
3211 continue;
3212 }
3213
3214 if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
3215 continue;
3216 }
3217
3218 if (!backend_ctx->disable_fusion && ggml_opencl_can_fuse(cgraph, i, { GGML_OP_NORM, GGML_OP_MUL, GGML_OP_ADD })) {
3219 ggml_opencl_op_norm_fused(backend, node, cgraph->nodes[i+1], cgraph->nodes[i+2]);
3220 i += 2;
3221 continue;
3222 }
3223 if (!backend_ctx->disable_fusion && ggml_opencl_can_fuse(cgraph, i, { GGML_OP_GROUP_NORM, GGML_OP_MUL, GGML_OP_ADD })) {
3224 ggml_opencl_op_group_norm_fused(backend, node, cgraph->nodes[i+1], cgraph->nodes[i+2]);
3225 i += 2;
3226 continue;
3227 }
3228 if (!backend_ctx->disable_fusion && ggml_opencl_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
3229 ggml_opencl_op_rms_norm_fused(backend, node, cgraph->nodes[i+1]);
3230 i++;
3231 continue;
3232 }
3233
3234 bool ok = ggml_cl_compute_forward(backend, node);
3235 if (!ok) {
3236 GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
3237 }
3238 GGML_ASSERT(ok);
3239 }
3240
3241 return GGML_STATUS_SUCCESS;
3242}
3243
3244static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
3245 ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *)dev->context;
3246 ggml_backend_opencl_context * backend_ctx = dev_ctx->backend_ctx;
3247
3248 switch (op->op) {
3249 case GGML_OP_NONE:
3250 return true;
3251 case GGML_OP_GET_ROWS:
3252 switch (op->src[0]->type) {
3253 case GGML_TYPE_F32:
3254 case GGML_TYPE_F16:
3255 return true;
3256 case GGML_TYPE_Q4_0:
3257#ifdef GGML_OPENCL_SOA_Q
3258 // We do not support flattened Q4_0 (and possibly other Q's)
3259 return false;
3260#else // GGML_OPENCL_SOA_Q
3261 return true;
3262#endif // GGML_OPENCL_SOA_Q
3263 default:
3264 return false;
3265 }
3266 case GGML_OP_SET_ROWS:
3267 {
3268 // TODO: add support
3269 // ref: https://github.com/ggml-org/llama.cpp/pull/14274
3270#pragma message("TODO: implement BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
3271 if (op->src[0]->type != GGML_TYPE_F32) {
3272 return false;
3273 }
3274 switch (op->type) {
3275 case GGML_TYPE_F16:
3276 case GGML_TYPE_F32:
3277 return (op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32);
3278 default:
3279 return false;
3280 }
3281 }
3282 case GGML_OP_CPY:
3283 case GGML_OP_DUP:
3284 case GGML_OP_CONT:
3285 switch (op->src[0]->type) {
3286 case GGML_TYPE_F32:
3287 switch (op->type) {
3288 case GGML_TYPE_F16:
3289 case GGML_TYPE_F32:
3290 return true;
3291 default:
3292 return false;
3293 }
3294 case GGML_TYPE_F16:
3295 switch (op->type) {
3296 case GGML_TYPE_F16:
3297 case GGML_TYPE_F32:
3298 return true;
3299 default:
3300 return false;
3301 }
3302 default:
3303 return false;
3304 }
3305 case GGML_OP_SCALE:
3306 return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
3307 case GGML_OP_ADD:
3308 if (op->type == GGML_TYPE_F16) {
3309 const bool src0_ok = op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32;
3310 const bool src1_ok = op->src[1]->type == GGML_TYPE_F16 || op->src[1]->type == GGML_TYPE_F32;
3311 if (src0_ok && src1_ok) {
3312 return true;
3313 }
3314 }
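            // fall through to the shared binary-op check below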
3315 case GGML_OP_MUL:
3316 case GGML_OP_DIV:
3317 case GGML_OP_SUB:
3318 return (op->src[0]->type == op->src[1]->type) &&
3319 (op->src[0]->type == op->type) &&
3320 (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16);
3321 case GGML_OP_ADD_ID:
3322 return op->src[0]->type == GGML_TYPE_F32;
3323 case GGML_OP_SQR:
3324 case GGML_OP_SQRT:
3325 return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
3326 ggml_is_contiguous(op->src[0]);
3327 case GGML_OP_UNARY:
3328 switch (ggml_get_unary_op(op)) {
3329 case GGML_UNARY_OP_GELU:
3330 case GGML_UNARY_OP_SILU:
3331 case GGML_UNARY_OP_RELU:
3332 case GGML_UNARY_OP_GELU_ERF:
3333 case GGML_UNARY_OP_GELU_QUICK:
3334 return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
3335 case GGML_UNARY_OP_SIGMOID:
3336 return ggml_is_contiguous(op->src[0]);
3337 case GGML_UNARY_OP_TANH:
3338 return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
3339 case GGML_UNARY_OP_EXPM1:
3340 return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
3341 (op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
3342 case GGML_UNARY_OP_SOFTPLUS:
3343 return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
3344 (op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
3345 default:
3346 return false;
3347 }
3348 case GGML_OP_GLU:
3349 switch (ggml_get_glu_op(op)) {
3350 case GGML_GLU_OP_GEGLU:
3351 case GGML_GLU_OP_REGLU:
3352 case GGML_GLU_OP_SWIGLU:
3353 case GGML_GLU_OP_SWIGLU_OAI:
3354 case GGML_GLU_OP_GEGLU_ERF:
3355 case GGML_GLU_OP_GEGLU_QUICK:
3356 return ggml_is_contiguous_1(op->src[0]) && (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16);
3357 default:
3358 return false;
3359 }
3360 case GGML_OP_TRI:
3361 return op->type == GGML_TYPE_F32 && ggml_is_contiguous(op);
3362 case GGML_OP_FILL:
3363 return op->type == GGML_TYPE_F32 && ggml_is_contiguous(op);
3364 case GGML_OP_CLAMP:
3365 return op->src[0]->type == GGML_TYPE_F32;
3366 case GGML_OP_SOFT_MAX:
3367 case GGML_OP_NORM:
3368 return true;
3369 case GGML_OP_RMS_NORM:
3370 return op->ne[0] % 4 == 0 && ggml_is_contiguous_rows(op->src[0]);
3371 case GGML_OP_REPEAT:
3372 return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; // Assuming F32 for now, can be expanded
3373 case GGML_OP_PAD:
3374 // TODO: add circular padding support for opencl, see https://github.com/ggml-org/llama.cpp/pull/16985
3375 if (ggml_get_op_params_i32(op, 8) != 0) {
3376 return false;
3377 }
3378 return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
3379 case GGML_OP_UPSCALE: {
            ggml_scale_mode mode = (ggml_scale_mode)(ggml_get_op_params_i32(op, 0) & 0xFF);
            const bool antialias = (ggml_get_op_params_i32(op, 0) & GGML_SCALE_FLAG_ANTIALIAS) != 0;
3382 return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 &&
3383 (mode == GGML_SCALE_MODE_NEAREST || mode == GGML_SCALE_MODE_BILINEAR) && !antialias;
3384 }
3385 case GGML_OP_CONV_2D:
3386 return (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16) ||
3387 (op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
3388 (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32);
3389 case GGML_OP_SSM_CONV:
3390 return (op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32);
3391 case GGML_OP_CONCAT:
3392 return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
3393 case GGML_OP_TIMESTEP_EMBEDDING:
3394 return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
3395 case GGML_OP_GROUP_NORM:
3396 return ggml_is_contiguous(op->src[0]);
3397 case GGML_OP_MUL_MAT:
3398 if (op->src[0]->type == GGML_TYPE_F16) {
3399 return true;
3400 } else if (op->src[0]->type == GGML_TYPE_F32) {
3401 return op->src[1]->type == GGML_TYPE_F32;
3402 } else if (op->src[0]->type == GGML_TYPE_Q4_0 || op->src[0]->type == GGML_TYPE_MXFP4 ||
3403 op->src[0]->type == GGML_TYPE_Q4_K ||
3404 op->src[0]->type == GGML_TYPE_Q6_K) {
3405 return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
3406 } else if (op->src[0]->type == GGML_TYPE_Q8_0) {
3407 return op->src[1]->type == GGML_TYPE_F32;
3408 }
3409 return false;
3410 case GGML_OP_MUL_MAT_ID:
3411 if (op->src[0]->type == GGML_TYPE_Q4_0 ||
3412 op->src[0]->type == GGML_TYPE_Q8_0 ||
3413 op->src[0]->type == GGML_TYPE_MXFP4) {
3414 if (op->src[1]->type == GGML_TYPE_F32) {
3415 return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
3416 }
3417 }
3418 return false;
3419 case GGML_OP_RESHAPE:
3420 case GGML_OP_VIEW:
3421 case GGML_OP_PERMUTE:
3422 case GGML_OP_TRANSPOSE:
3423 return true;
3424 case GGML_OP_DIAG_MASK_INF:
3425 return op->ne[3] == 1;
3426 case GGML_OP_ROPE: {
3427 const int mode = ((const int32_t *) op->op_params)[2];
3428 const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
3429 const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
3430 if (is_mrope && !is_vision) {
3431 if (op->src[0]->type == GGML_TYPE_F32 ||
3432 op->src[0]->type == GGML_TYPE_F16) {
3433 return true;
3434 }
3435 return false;
3436 }
3437 if (is_vision) {
3438 if (op->src[0]->type == GGML_TYPE_F32 ||
3439 op->src[0]->type == GGML_TYPE_F16) {
3440 return true;
3441 }
3442 return false;
3443 }
3444 return true;
3445 }
3446 case GGML_OP_SOLVE_TRI:
3447 return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
3448 case GGML_OP_IM2COL:
3449 return true;
3450 case GGML_OP_ARGSORT: {
3451 cl_kernel kernel = backend_ctx->kernel_argsort_f32_i32;
3452 int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
3453
3454 int cols = 1;
3455 while (cols < op->ne[0]) {
3456 cols *= 2;
3457 }
3458
3459 return cols <= max_workgroup_size && op->src[0]->type == GGML_TYPE_F32;
3460 }
3461 case GGML_OP_SUM_ROWS:
3462 case GGML_OP_MEAN:
3463 return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
3464 case GGML_OP_FLASH_ATTN_EXT:
3465 {
3466 const ggml_tensor * q = op->src[0];
3467 const ggml_tensor * k = op->src[1];
3468 const ggml_tensor * v = op->src[2];
3469
3470 const int dk = q->ne[0];
3471 const int dv = v->ne[0];
3472
3473 const struct { int dk; int dv; } supported_dims[] = {
3474 { 40, 40}, { 64, 64}, { 80, 80}, { 96, 96},
3475 {112, 112}, {128, 128}, {192, 128},
3476 {192, 192}, {256, 256},
3477 };
3478
3479 bool dims_supported = false;
3480 for (size_t i = 0; i < sizeof(supported_dims)/sizeof(supported_dims[0]); ++i) {
3481 if (supported_dims[i].dk == dk && supported_dims[i].dv == dv) {
3482 dims_supported = true;
3483 break;
3484 }
3485 }
3486 if (!dims_supported) {
3487 return false;
3488 }
3489
3490 const bool is_f32_f32 = q->type == GGML_TYPE_F32 && k->type == GGML_TYPE_F32 &&
3491 v->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
3492 const bool is_f16_f16 = q->type == GGML_TYPE_F16 && k->type == GGML_TYPE_F16 &&
3493 v->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16;
3494 const bool is_f32_f16 = q->type == GGML_TYPE_F32 && k->type == GGML_TYPE_F16 &&
3495 v->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F32;
3496
3497 return is_f32_f32 || is_f16_f16 || is_f32_f16;
3498 }
3499 default:
3500 return false;
3501 }
3502}
3503
3504// Forward declaration - implementation appears later in the file.
3505static const char * ggml_backend_opencl_buffer_type_get_name(ggml_backend_buffer_type_t buffer_type);
3506
3507static ggml_guid_t ggml_backend_opencl_guid() {
3508 static ggml_guid guid = { 0xde, 0xe0, 0x70, 0xa2, 0x73, 0x4e, 0x4d, 0xbc, 0xb0, 0xc7, 0x4f, 0xd4, 0x6d, 0x4e, 0x90, 0xfe };
3509 return &guid;
3510}
3511
3512static ggml_backend_i ggml_backend_opencl_i = {
3513 /* .get_name = */ ggml_backend_opencl_name,
3514 /* .free = */ ggml_backend_opencl_free,
3515 /* .set_tensor_async = */ NULL, /* ggml_backend_opencl_set_tensor_async */
3516 /* .get_tensor_async = */ NULL, /* ggml_backend_opencl_get_tensor_async */
3517 /* .cpy_tensor_async = */ NULL, /* ggml_backend_opencl_cpy_tensor_async */
3518 /* .synchronize = */ ggml_backend_opencl_synchronize,
3519 /* .graph_plan_create = */ NULL,
3520 /* .graph_plan_free = */ NULL,
3521 /* .graph_plan_update = */ NULL,
3522 /* .graph_plan_compute = */ NULL,
3523 /* .graph_compute = */ ggml_backend_opencl_graph_compute,
3524 /* .event_record = */ NULL,
3525 /* .event_wait = */ NULL,
3526 /* .graph_optimize = */ NULL,
3527};
3528
3529ggml_backend_t ggml_backend_opencl_init(void) {
3530 ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_opencl_reg(), 0);
3531 ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev);
3532
3533 ggml_backend_t backend = new ggml_backend {
3534 /* .guid = */ ggml_backend_opencl_guid(),
3535 /* .iface = */ ggml_backend_opencl_i,
3536 /* .device = */ dev,
3537 /* .context = */ backend_ctx
3538 };
3539
3540 return backend;
3541}
3542
3543bool ggml_backend_is_opencl(ggml_backend_t backend) {
3544 return backend && backend->iface.get_name == ggml_backend_opencl_name;
3545}
3546
3547//
3548// buffer
3549//
3550struct ggml_backend_opencl_buffer_context {
3551 // A buffer context can hold multiple cl_mem objects. This is for flattening
3552 // quantized weights and should be used with GGML_OPENCL_SMALL_ALLOC where
3553 // each tensor is allocated a separate buffer. When flattening is enabled
3554 // with small allocation, each tensor is backed by two cl_mem objects (for
3555 // quants and scales) packed into a backend_opencl_buffer.
3556 ggml_backend_opencl_buffer_context(cl_mem buf)
3557 : name("OpenCL") {
3558 buffer.push_back(buf);
3559 }
3560
3561 ~ggml_backend_opencl_buffer_context() {
3562 for (cl_mem buf : buffer) {
3563 CL_CHECK(clReleaseMemObject(buf));
3564 }
3565 for (cl_mem im : img) {
3566 CL_CHECK(clReleaseMemObject(im));
3567 }
3568
3569 // Delete all extras to trigger their destructors
3570 for (ggml_tensor_extra_cl * e : temp_tensor_extras) {
3571 delete e;
3572 }
3573 for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) {
3574 delete e;
3575 }
3576 for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0) {
3577 delete e;
3578 }
3579 for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0_in_use) {
3580 delete e;
3581 }
3582 for (ggml_tensor_extra_cl_mxfp4 * e : temp_tensor_extras_mxfp4) {
3583 delete e;
3584 }
3585 for (ggml_tensor_extra_cl_mxfp4 * e : temp_tensor_extras_mxfp4_in_use) {
3586 delete e;
3587 }
3588 for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0) {
3589 delete e;
3590 }
3591 for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
3592 delete e;
3593 }
3594 for (ggml_tensor_extra_cl_q6_K * e : temp_tensor_extras_q6_K) {
3595 delete e;
3596 }
3597 for (ggml_tensor_extra_cl_q6_K * e : temp_tensor_extras_q6_K_in_use) {
3598 delete e;
3599 }
3600 }
3601
3602 ggml_tensor_extra_cl * ggml_opencl_alloc_temp_tensor_extra() {
3603 ggml_tensor_extra_cl * extra;
3604 if (temp_tensor_extras.empty()) {
3605 extra = new ggml_tensor_extra_cl();
3606 } else {
3607 extra = temp_tensor_extras.back();
3608 temp_tensor_extras.pop_back();
3609 }
3610
3611 temp_tensor_extras_in_use.push_back(extra);
3612
3613 extra->reset();
3614 return extra;
3615 }
3616
3617 ggml_tensor_extra_cl_q4_0 * ggml_opencl_alloc_temp_tensor_extra_q4_0() {
3618 ggml_tensor_extra_cl_q4_0 * extra;
3619 if (temp_tensor_extras_q4_0.empty()) {
3620 extra = new ggml_tensor_extra_cl_q4_0();
3621 } else {
3622 extra = temp_tensor_extras_q4_0.back();
3623 temp_tensor_extras_q4_0.pop_back();
3624 }
3625
3626 temp_tensor_extras_q4_0_in_use.push_back(extra);
3627
3628 extra->reset();
3629 return extra;
3630 }
3631
3632 ggml_tensor_extra_cl_mxfp4 * ggml_opencl_alloc_temp_tensor_extra_mxfp4() {
3633 ggml_tensor_extra_cl_mxfp4 * extra;
3634 if (temp_tensor_extras_mxfp4.empty()) {
3635 extra = new ggml_tensor_extra_cl_mxfp4();
3636 } else {
3637 extra = temp_tensor_extras_mxfp4.back();
3638 temp_tensor_extras_mxfp4.pop_back();
3639 }
3640
3641 temp_tensor_extras_mxfp4_in_use.push_back(extra);
3642
3643 extra->reset();
3644 return extra;
3645 }
3646
3647 ggml_tensor_extra_cl_q8_0 * ggml_opencl_alloc_temp_tensor_extra_q8_0() {
3648 ggml_tensor_extra_cl_q8_0 * extra;
3649 if (temp_tensor_extras_q8_0.empty()) {
3650 extra = new ggml_tensor_extra_cl_q8_0();
3651 } else {
3652 extra = temp_tensor_extras_q8_0.back();
3653 temp_tensor_extras_q8_0.pop_back();
3654 }
3655
3656 temp_tensor_extras_q8_0_in_use.push_back(extra);
3657
3658 extra->reset();
3659 return extra;
3660 }
3661
3662 ggml_tensor_extra_cl_q6_K * ggml_opencl_alloc_temp_tensor_extra_q6_K() {
3663 ggml_tensor_extra_cl_q6_K * extra;
3664 if (temp_tensor_extras_q6_K.empty()) {
3665 extra = new ggml_tensor_extra_cl_q6_K();
3666 } else {
3667 extra = temp_tensor_extras_q6_K.back();
3668 temp_tensor_extras_q6_K.pop_back();
3669 }
3670
3671 temp_tensor_extras_q6_K_in_use.push_back(extra);
3672
3673 extra->reset();
3674 return extra;
3675 }
3676
3677 void reset() {
3678 for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) {
3679 temp_tensor_extras.push_back(e);
3680 }
3681 temp_tensor_extras_in_use.clear();
3682
3683 for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0_in_use) {
3684 temp_tensor_extras_q4_0.push_back(e);
3685 }
3686 temp_tensor_extras_q4_0_in_use.clear();
3687
3688 for (ggml_tensor_extra_cl_mxfp4 * e : temp_tensor_extras_mxfp4_in_use) {
3689 temp_tensor_extras_mxfp4.push_back(e);
3690 }
3691 temp_tensor_extras_mxfp4_in_use.clear();
3692
3693 for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
3694 temp_tensor_extras_q8_0.push_back(e);
3695 }
3696 temp_tensor_extras_q8_0_in_use.clear();
3697
3698 for (ggml_tensor_extra_cl_q6_K * e : temp_tensor_extras_q6_K_in_use) {
3699 temp_tensor_extras_q6_K.push_back(e);
3700 }
3701 temp_tensor_extras_q6_K_in_use.clear();
3702 }
3703
3704 // Pools for extras. Available extras are in `temp_tensor_extras`. Extras
3705 // being used are in `temp_tensor_extras_in_use`. At the first run, new
3706 // extras get created and put in `in_use`. When the buffer is reset via
3707 // the `reset` callback, all extras in `in_use` get moved to available extras
3708 // for reuse.
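    // The extras themselves are small host-side bookkeeping structs; pooling
    // them avoids a heap allocation per tensor on every graph run.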
3709 std::vector<ggml_tensor_extra_cl *> temp_tensor_extras;
3710 std::vector<ggml_tensor_extra_cl *> temp_tensor_extras_in_use;
3711 std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0;
3712 std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0_in_use;
3713 std::vector<ggml_tensor_extra_cl_mxfp4 *> temp_tensor_extras_mxfp4;
3714 std::vector<ggml_tensor_extra_cl_mxfp4 *> temp_tensor_extras_mxfp4_in_use;
3715 std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0;
3716 std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0_in_use;
3717 std::vector<ggml_tensor_extra_cl_q6_K *> temp_tensor_extras_q6_K;
3718 std::vector<ggml_tensor_extra_cl_q6_K *> temp_tensor_extras_q6_K_in_use;
3719
3720 // The buffer_context is initially created by ggml_backend_buft_alloc_buffer
3721 // before any tensor is initialized (at the beginning of alloc_tensor_range).
    // Hence, there is always a buffer object in this vector. When each tensor is
3723 // being initialized, this original buffer object will be released if both
3724 // flattening and small allocation are enabled, and additional buffer
3725 // objects will be created in init_tensor to represent flattened quantized
3726 // weights.
3727 std::vector<cl_mem> buffer;
3728 // These are image1d_buffer_t objects that wrap around the quants and scales.
3729 // For Q4_0 quantization, there should be two of them - one for quants and
3730 // one for scales. They should be populated only when flattening and small
3731 // allocation are enabled.
3732 std::vector<cl_mem> img;
3733 std::string name;
3734};
3735
3736static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
3737 ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
3738 delete ctx;
3739}
3740
3741static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
3742 ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer->buft->device);
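    // This base is not a dereferenceable host pointer: tensor->data values in
    // this buffer are base + byte offset, so init_tensor can recover the
    // offset into the underlying cl_mem as
    // (char *) tensor->data - (char *) base. Returning the alignment keeps
    // the base non-null and correctly aligned.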
3743 return (void *) (uintptr_t) backend_ctx->alignment;
3744}
3745
3746static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
3747 ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
3748
3749 ggml_cl2_init(buffer->buft->device);
3750
3751 if (tensor->view_src != nullptr) {
3752 GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
3753
3754 ggml_tensor_extra_cl * view_extra = (ggml_tensor_extra_cl *) tensor->view_src->extra;
3755 GGML_ASSERT(view_extra && "view_extra is nullptr?");
3756
3757 // Reuse extra of the parent tensor. The offset of this view tensor
3758 // becomes `extra->offset + view_offs` and needs to be calculated when
        // it is used. This change is needed because of the change to
3760 // ggml_alloc.c in https://github.com/ggml-org/llama.cpp/pull/7640.
3761 // `buffer` passed in here will always be `tensor->buffer`. It is OK
3762 // to allocate extras from the same buffer context for ordinary
3763 // intermediate tensors. But for views into kv cache tensors, doing so
3764 // would mess up the extras used by kv cache.
        // Before #7640, `buffer` was for intermediate tensors, which was always
        // different from that of kv cache tensors.
3767 //
3768 // NB: now extra->offset no longer accounts for view_offs.
3769 // NB: this should not apply to weight tensors (for end-to-end runs, but
3770 // may apply for test-backend-ops).
3771 // FIXME: if any unexpected results are seen, double check the offset -
3772 // there could be other places that need fix.
3773 tensor->extra = view_extra;
3774 } else {
3775 {
3776 size_t offset = (char *) tensor->data - (char *) ggml_backend_opencl_buffer_get_base(buffer);
3777
3778 ggml_tensor_extra_cl * extra = ctx->ggml_opencl_alloc_temp_tensor_extra();
3779 extra->offset = offset;
3780 extra->data_device = ctx->buffer[0];
3781 extra->actual_size = ggml_nbytes(tensor);
3782
3783 tensor->extra = extra;
3784 }
3785 }
3786 return GGML_STATUS_SUCCESS;
3787}
3788
// The optimized gemm and gemv kernels are used for large matrices without batch.
// `tensor` is the quantized weight matrix.
3791inline bool use_adreno_kernels(const ggml_backend_opencl_context *backend_ctx, const ggml_tensor *tensor) {
3792 int64_t threshold_ne0 = 512;
3793 int64_t threshold_ne1 = 512;
3794 if (!backend_ctx->adreno_cl_compiler_version.newer_than_or_same(E031, 38, 11, 0) &&
3795 backend_ctx->adreno_cl_compiler_version.type != DX) {
3796 threshold_ne0 = 128;
3797 threshold_ne1 = 128;
3798 }
3799 return tensor->ne[0] >= threshold_ne0 && tensor->ne[1] >= threshold_ne1 &&
3800 tensor->ne[2] == 1 && tensor->ne[3] == 1;
3801}
3802
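// Gates the MoE-specific Adreno kernels: tensors are matched by name ("ffn"
// or "as" substrings) as a heuristic for expert weight matrices, and ne01
// must be a multiple of 64 (presumably matching the kernels' tiling).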
3803inline bool use_adreno_moe_kernels(const ggml_backend_opencl_context *backend_ctx, const ggml_tensor *tensor) {
3804 GGML_UNUSED(backend_ctx);
3805 int ne01 = tensor->ne[1];
3806 return ((strstr(tensor->name, "ffn") != NULL) || (strstr(tensor->name, "as") != NULL)) && (ne01 % 64 == 0);
3807}
3808
inline bool enable_adreno_trans_weight(const ggml_backend_opencl_context *backend_ctx, const ggml_tensor *tensor) {
    bool adreno_kernel = use_adreno_kernels(backend_ctx, tensor);

    size_t elem_num = tensor->ne[0] * tensor->ne[1] * tensor->ne[2] * tensor->ne[3];

    return adreno_kernel && (elem_num < 128 * 1024 * 1024); // max element count: 2^27
}
3817
3818static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
3819 ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
3820
3821 cl_context context = backend_ctx->context;
3822 cl_command_queue queue = backend_ctx->queue;
3823
3824#ifdef GGML_OPENCL_SOA_Q
3825 // We separate the quantized bits and scale from block_q4_0 by using an
3826 // additional kernel, where each thread handles a block. We first read the
3827 // original weights into a temporary buffer, then create two separate
3828 // buffers for quantized bits and scales, which are then populated by the
3829 // conversion kernel.
3830 if (tensor->type == GGML_TYPE_Q4_0) {
3831 // Tensors should have been preallocated, therefore they should
3832 // already have ggml_tensor_extra_cl as extra.
3833 ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
        GGML_ASSERT(extra_orig && "Tensors in OpenCL backend should have been allocated and initialized");
3835
3836 // Allocate the new extra and create aliases from the original.
3837 ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
3838 ggml_tensor_extra_cl_q4_0 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q4_0();
3839
3840 size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
3841 size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
3842 GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
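        // block_q4_0 packs 32 weights as one fp16 scale (2 bytes -> size_d)
        // plus 32 4-bit quants in 16 bytes (-> size_q).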
3843
3844 cl_int err;
3845 cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
3846 ggml_nbytes(tensor), NULL, &err);
3847 CL_CHECK(err);
3848 CL_CHECK(clEnqueueWriteBuffer(
3849 queue, data_device, CL_TRUE, 0,
3850 ggml_nbytes(tensor), data, 0, NULL, NULL));
3851
        // We always honor the specified offset arg, although for weights the
        // offset arg should be 0 (we do not assert this).
        //GGML_ASSERT(offset == 0);
3855
        // We create subbuffers from the original tensor buffer for scales and
        // quants - i.e., scales and quants are aliases into the buffer object
        // that backs the original tensor. This is a cleaner way to adapt to the
        // new memory management.
        // In the old code, we allocated new buffers for scales and quants
        // respectively, which could still be done but would result in double
        // allocation; properly deallocating the preallocated buffer that backs
        // the tensors is tricky and would leak the backend-specific information
        // into the general backend code.
        // Does this create misaligned subbuffers (alignment is 1024) in certain
        // cases?
3867 cl_buffer_region region;
3868
3869 // The original tensor memory is divided into scales and quants, i.e.,
3870 // we first store scales, then quants.
3871 // Create subbuffer for scales.
3872 region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
3873 region.size = size_d;
3874 extra->d = clCreateSubBuffer(
3875 extra_orig->data_device, CL_MEM_READ_WRITE,
3876 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err);
3877 CL_CHECK(err);
3878 auto previous_origin = region.origin;
3879
3880 // Create subbuffer for quants.
3881 region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
3882 region.size = size_q;
3883 extra->q = clCreateSubBuffer(
3884 extra_orig->data_device, CL_MEM_READ_WRITE,
3885 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err);
3886 CL_CHECK(err);
3887
3889 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
3890 cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0;
3891
3892 // The optimized kernels need weights in natural order, so unshuffle.
3893 if (use_adreno_kernels(backend_ctx, tensor)) {
3894 kernel = backend_ctx->kernel_convert_block_q4_0_noshuffle;
3895 }
3896 #else
3897 cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0;
3898 #endif // GGML_OPENCL_USE_ADRENO_KERNELS
3899 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
3900 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
3901 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
3902
3903 size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
3904 size_t local_work_size[] = {64, 1, 1};
3905
3906 cl_event evt;
3907 CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
3908 CL_CHECK(clWaitForEvents(1, &evt));
3909 CL_CHECK(clReleaseMemObject(data_device));
3910
3911 tensor->extra = extra;
3912
3913 // transpose the weights and scales
3914 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
3915 // Only do transpose for large, non batched matrix
3916 // TODO: use preallocated images instead of sub-buffer then image
3917 if (use_adreno_kernels(backend_ctx, tensor)) {
3918 // <----------------------------------------------------------------------------------> //
3919 // start transpose
3920 // <----------------------------------------------------------------------------------> //
3921 int M = tensor->ne[1]; // ne01
3922 int K = tensor->ne[0]; // ne00
3923
3924 //For matrix-vector multiplication kernel, we assume K is a multiple of 32
3925 GGML_ASSERT(K % 32 == 0);
3926 //For transpose kernels, we assume K is a multiple of 4 (satisfied by prior assert), and M is a multiple of 4
3927 GGML_ASSERT(M % 4 == 0);
3928
3929 // transpose is out of place, so we need to allocate transposed buffers
3930 // <----------------------------------------------------------------------------------> //
3931 // use sub_buffer of max buffer size instead
3932
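            // K*M 4-bit quants occupy K*M/2 bytes, written here as (K*M/8)
            // 4-byte floats.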
3933 size_t q_size_bytes = K * M / 8 * sizeof(float);
3934 backend_ctx->prealloc_quant_trans.allocate(context, q_size_bytes);
3935
3936 cl_buffer_region region;
3937 region.origin = 0;
3938 region.size = q_size_bytes;
3939 cl_mem qT_d = clCreateSubBuffer(
3940 backend_ctx->prealloc_quant_trans.buffer,
3941 0,
3942 CL_BUFFER_CREATE_TYPE_REGION,
3943 ®ion,
3944 &err);
3945 CL_CHECK(err);
3946
            // The vectorized transpose reads the scales as half4 texels, which
            // requires the scale row length (K/32 halfs) to be a multiple of 4;
            // otherwise fall back to the 4x1 variant below.
            bool K_tile_trans = (K / 32) % 4 == 0;
3951
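            // One fp16 scale (2 bytes) per 32-element block: M * (K/32) * 2 bytes.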
3952 size_t d_size_bytes = M * (K / 32) * 2;
3953 backend_ctx->prealloc_scales_trans.allocate(context, d_size_bytes);
3954
3955 region.origin = 0;
3956 region.size = d_size_bytes;
3957 cl_mem dT_d = clCreateSubBuffer(
3958 backend_ctx->prealloc_scales_trans.buffer,
3959 0,
3960 CL_BUFFER_CREATE_TYPE_REGION,
3961 ®ion,
3962 &err);
3963 CL_CHECK(err);
3964
3965 // <----------------------------------------------------------------------------------> //
3966
3967
3968 // create images from the buffers
3969 // <----------------------------------------------------------------------------------> //
3970 cl_mem q_d_image1D;
3971 cl_mem d_d_image1D;
3972 cl_mem qT_d_image1D;
3973 cl_mem dT_d_image1D;
3974
3975 cl_image_format img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
3976 cl_image_desc img_desc_1d;
3977
3978 memset(&img_desc_1d, 0, sizeof(img_desc_1d));
3979 img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
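            // M*K 4-bit quants = M*K/2 bytes; one RGBA half texel is 8 bytes,
            // so the image holds M*K/16 (= M*K/4/4) texels.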
3980 img_desc_1d.image_width = M * K / 4 / 4;
3981 img_desc_1d.buffer = extra->q;
3982 q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
3983 CL_CHECK(err);
3984
3985 img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
3986 memset(&img_desc_1d, 0, sizeof(img_desc_1d));
3987 img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
3988 img_desc_1d.image_width = M * K / 4 / 4;
3989 img_desc_1d.buffer = qT_d;
3990 qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
3991 CL_CHECK(err);
3992
3993 memset(&img_desc_1d, 0, sizeof(img_desc_1d));
3994 if (K_tile_trans) {
3995 img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
3996 img_desc_1d.image_width = M * K / 32 / 4;
3997 } else {
3998 img_fmt_1d = { CL_R, CL_HALF_FLOAT };
3999 img_desc_1d.image_width = M * K / 32;
4000 }
4001 img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
4002 img_desc_1d.buffer = extra->d;
4003 d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
4004 CL_CHECK(err);
4005
4006 img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
4007 memset(&img_desc_1d, 0, sizeof(img_desc_1d));
4008 img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
4009 img_desc_1d.image_width = M * K / 32 / 4;
4010 img_desc_1d.buffer = dT_d;
4011 dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
4012 CL_CHECK(err);
4013 // <----------------------------------------------------------------------------------> //
4014
4015 // set up and call the transpose kernels
4016 // <----------------------------------------------------------------------------------> //
4017 // weights
4018 int height_q = M / 4;
4019 int width_q = K / 4 / 4;
4020 kernel = backend_ctx->kernel_transpose_16;
4021
4022 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D));
4023 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &qT_d_image1D));
4024 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_q));
4025 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_q));
4026
4027 size_t local_size_q[3] = {4, 16, 1};
4028 size_t global_size_q[3] = {static_cast<size_t>(width_q), static_cast<size_t>(height_q), 1};
4029 CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_q, local_size_q, 0, NULL, &evt));
4030 CL_CHECK(clWaitForEvents(1, &evt));
4031
4032 // scales
4033 int height_s = M / 4;
4034 int width_s = K / 32 / 4;
4035
4036 kernel = backend_ctx->kernel_transpose_16;
4037 if (!K_tile_trans) {
4038 kernel = backend_ctx->kernel_transpose_16_4x1;
4039 width_s = K / 32;
4040 }
4041 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
4042 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dT_d_image1D));
4043 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_s));
4044 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_s));
4045
4046 size_t local_size_s[3] = {4, 16, 1};
4047 size_t global_size_s[3] = {static_cast<size_t>(width_s), static_cast<size_t>(height_s), 1};
4048 CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_s, local_size_s, 0, NULL, &evt));
4049 CL_CHECK(clWaitForEvents(1, &evt));
4050 // <----------------------------------------------------------------------------------> //
4051
4052 // copy transposed buffer contents to original buffers
4053 // <----------------------------------------------------------------------------------> //
4054 // weights
4055 CL_CHECK(clEnqueueCopyBuffer(queue, qT_d, extra->q, 0, 0, q_size_bytes, 0, NULL, &evt));
4056 CL_CHECK(clWaitForEvents(1, &evt));
4057
4058 // scales
4059 CL_CHECK(clEnqueueCopyBuffer(queue, dT_d, extra->d, 0, 0, d_size_bytes, 0, NULL, &evt));
4060 CL_CHECK(clWaitForEvents(1, &evt));
4061 // <----------------------------------------------------------------------------------> //
4062
4063 // deallocate transpose buffers
4064 // <----------------------------------------------------------------------------------> //
4065 CL_CHECK(clReleaseMemObject(qT_d));
4066 CL_CHECK(clReleaseMemObject(dT_d));
4067
4068 // deallocate temporary images
4069 CL_CHECK(clReleaseMemObject(q_d_image1D));
4070 CL_CHECK(clReleaseMemObject(d_d_image1D));
4071 CL_CHECK(clReleaseMemObject(qT_d_image1D));
4072 CL_CHECK(clReleaseMemObject(dT_d_image1D));
4073 // <----------------------------------------------------------------------------------> //
4074 // end transpose
4075 // <----------------------------------------------------------------------------------> //
4076 }
4077 #endif // GGML_OPENCL_USE_ADRENO_KERNELS
4078
4079 return;
4080
4081 }
4082 if (tensor->type == GGML_TYPE_MXFP4) {
4083 ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
        GGML_ASSERT(extra_orig && "Tensors in OpenCL backend should have been allocated and initialized");
4085
4086 // Allocate the new extra and create aliases from the original.
4087 ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
4088 ggml_tensor_extra_cl_mxfp4 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_mxfp4();
4089
4090 size_t size_e = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(char);
4091 size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
4092 GGML_ASSERT(size_e + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
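        // block_mxfp4 packs 32 weights as one E8M0 scale byte (-> size_e)
        // plus 32 4-bit quants in 16 bytes (-> size_q).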
4093
4094 cl_int err;
4095 cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
4096 ggml_nbytes(tensor), NULL, &err);
4097 CL_CHECK(err);
4098 CL_CHECK(clEnqueueWriteBuffer(
4099 queue, data_device, CL_TRUE, 0,
4100 ggml_nbytes(tensor), data, 0, NULL, NULL));
4101
4102 // The original tensor memory is divided into scales and quants, i.e.,
4103 // we first store scales, then quants.
4104 cl_buffer_region region;
4105
4106 // Create subbuffer for scales.
4107 region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
4108 region.size = size_e;
4109 extra->e = clCreateSubBuffer(
4110 extra_orig->data_device, CL_MEM_READ_WRITE,
4111 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err);
4112 CL_CHECK(err);
4113 auto previous_origin = region.origin;
4114
4115 // Create subbuffer for quants.
4116 region.origin = align_to(previous_origin + size_e, backend_ctx->alignment);
4117 region.size = size_q;
4118 extra->q = clCreateSubBuffer(
4119 extra_orig->data_device, CL_MEM_READ_WRITE,
4120 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err);
4121 CL_CHECK(err);
4122
4123#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
4124 if (use_adreno_moe_kernels(backend_ctx, tensor)) {
4125 cl_kernel kernel = backend_ctx->kernel_convert_block_mxfp4_trans;
4126
4127 int ne00 = tensor->ne[0];
4128 int ne01 = tensor->ne[1];
4129 int ne02 = tensor->ne[2];
4130 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
4131 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
4132 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->e));
4133 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &ne00));
4134 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne01));
4135
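            // use_adreno_moe_kernels guarantees ne01 % 64 == 0, so the
            // round-up to a multiple of the 64-wide workgroup is a no-op kept
            // for safety.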
4136 size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 32), static_cast<size_t>(ne02)};
4137 size_t local_work_size[3] = {64, 2, 1};
4138
4139 cl_event evt;
4140 CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4141 CL_CHECK(clWaitForEvents(1, &evt));
4142 CL_CHECK(clReleaseMemObject(data_device));
4143 tensor->extra = extra;
4144
4145 return;
4146 }
4147#endif
4148 cl_kernel kernel = backend_ctx->kernel_convert_block_mxfp4;
4149
4150 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
4151 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
4152 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->e));
4153
4154 size_t global_work_size[3] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
4155 size_t local_work_size[3] = {64, 1, 1};
4156
4157 cl_event evt;
4158 CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4159 CL_CHECK(clWaitForEvents(1, &evt));
4160 CL_CHECK(clReleaseMemObject(data_device));
4161
4162 // Create image for Q
4163 cl_image_format img_format_q = {CL_RG, CL_UNSIGNED_INT32};
4164 cl_image_desc img_desc_q = {
4165 CL_MEM_OBJECT_IMAGE1D_BUFFER,
4166 static_cast<size_t>(ggml_nelements(tensor)/32*2),
4167 0, 0, 0, 0, 0, 0, 0,
4168 { extra->q }
4169 };
        extra->q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_q, &img_desc_q, NULL, &err);
        CL_CHECK(err);
4171 tensor->extra = extra;
4172
4173 return;
4174 }
4175 if (tensor->type == GGML_TYPE_Q8_0) {
4176 ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
        GGML_ASSERT(extra_orig && "Tensors in OpenCL backend should have been allocated and initialized");
4178
4179 // Allocate the new extra and create aliases from the original.
4180 ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
4181 ggml_tensor_extra_cl_q8_0 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q8_0();
4182
4183 size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
4184 size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*(ggml_blck_size(tensor->type)*sizeof(char));
4185 GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
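        // block_q8_0 packs 32 weights as one fp16 scale (-> size_d) plus
        // 32 int8 quants (-> size_q).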
4186
4187 cl_int err;
4188 cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
4189 ggml_nbytes(tensor), NULL, &err);
4190 CL_CHECK(err);
4191 CL_CHECK(clEnqueueWriteBuffer(
4192 queue, data_device, CL_TRUE, 0,
4193 ggml_nbytes(tensor), data, 0, NULL, NULL));
4194
4195 // The original tensor memory is divided into scales and quants, i.e.,
4196 // we first store scales, then quants.
4197 cl_buffer_region region;
4198
4199 // Create subbuffer for scales.
4200 region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
4201 region.size = size_d;
4202 extra->d = clCreateSubBuffer(
4203 extra_orig->data_device, CL_MEM_READ_WRITE,
4204 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err);
4205 CL_CHECK(err);
4206 auto previous_origin = region.origin;
4207
4208 // Create subbuffer for quants.
4209 region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
4210 region.size = size_q;
4211 extra->q = clCreateSubBuffer(
4212 extra_orig->data_device, CL_MEM_READ_WRITE,
4213 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err);
4214 CL_CHECK(err);
4215
4216 cl_kernel kernel = backend_ctx->kernel_convert_block_q8_0;
4217
4218 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
4219 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
4220 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
4221
4222 size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
4223 size_t local_work_size[] = {64, 1, 1};
4224
4225 cl_event evt;
4226 CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4227 CL_CHECK(clWaitForEvents(1, &evt));
4228 CL_CHECK(clReleaseMemObject(data_device));
4229
4230 tensor->extra = extra;
4231
4232 // Transpose the weights and scales
4233#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
4234 if (enable_adreno_trans_weight(backend_ctx, tensor)) {
4235
4236 int M = tensor->ne[1]; // ne01
4237 int K = tensor->ne[0]; // ne00
4238
4239 GGML_ASSERT(K % 32 == 0);
4240 GGML_ASSERT(M % 4 == 0);
4241 GGML_ASSERT(tensor->ne[2] == 1);
4242 GGML_ASSERT(tensor->ne[3] == 1);
4243
4244 // Transpose weights
4245 size_t q_size_bytes = K * M / 4 * sizeof(float);
4246 cl_buffer_region region;
4247 region.origin = 0;
4248 region.size = q_size_bytes;
4249 cl_mem qT_d = clCreateSubBuffer(
4250 backend_ctx->prealloc_quant_trans.buffer,
4251 0,
4252 CL_BUFFER_CREATE_TYPE_REGION,
4253 ®ion,
4254 &err);
4255 CL_CHECK(err);
4256
4257 cl_mem q_d_image1D;
4258 cl_mem qT_d_image1D;
4259
4260 cl_image_format img_fmt_1d;
4261 cl_image_desc img_desc_1d;
4262
4263 img_fmt_1d = { CL_RGBA, CL_FLOAT };
4264 memset(&img_desc_1d, 0, sizeof(img_desc_1d));
4265 img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
4266 img_desc_1d.image_width = M * K / 4 / 4;
4267 img_desc_1d.buffer = extra->q;
4268 q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
4269 CL_CHECK(err);
4270
4271 img_fmt_1d = { CL_RGBA, CL_FLOAT };
4272 memset(&img_desc_1d, 0, sizeof(img_desc_1d));
4273 img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
4274 img_desc_1d.image_width = M * K / 4 / 4;
4275 img_desc_1d.buffer = qT_d;
4276 qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
4277 CL_CHECK(err);
4278
4279 int height_q = M / 4;
4280 int width_q = K / 4 / 4;
4281 kernel = backend_ctx->kernel_transpose_32;
4282
4283 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D));
4284 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &qT_d_image1D));
4285 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_q));
4286 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_q));
4287
4288 size_t local_size_q[3] = {4, 16, 1};
4289 size_t global_size_q[3] = {static_cast<size_t>(width_q), static_cast<size_t>(height_q), 1};
4290 CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_q, local_size_q, 0, NULL, &evt));
4291 CL_CHECK(clWaitForEvents(1, &evt));
4292
4293 // Transpose scales
4294 size_t d_size_bytes = M * (K / 32) * 2;
4295 region.origin = 0;
4296 region.size = d_size_bytes;
4297 cl_mem dT_d = clCreateSubBuffer(
4298 backend_ctx->prealloc_scales_trans.buffer,
4299 0,
4300 CL_BUFFER_CREATE_TYPE_REGION,
4301 ®ion,
4302 &err);
4303 CL_CHECK(err);
4304
4305 cl_mem d_d_image1D;
4306 cl_mem dT_d_image1D;
4307
4308 memset(&img_desc_1d, 0, sizeof(img_desc_1d));
4309 img_fmt_1d = { CL_R, CL_HALF_FLOAT };
4310 img_desc_1d.image_width = M * K / 32;
4311 img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
4312 img_desc_1d.buffer = extra->d;
4313 d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
4314 CL_CHECK(err);
4315
4316 img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
4317 memset(&img_desc_1d, 0, sizeof(img_desc_1d));
4318 img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
4319 img_desc_1d.image_width = M * K / 32 / 4;
4320 img_desc_1d.buffer = dT_d;
4321 dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
4322 CL_CHECK(err);
4323
4324 int height_s = M / 4;
4325 int width_s = K / 32;
4326
4327 kernel = backend_ctx->kernel_transpose_16_4x1;
4328
4329 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
4330 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dT_d_image1D));
4331 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_s));
4332 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_s));
4333
4334 size_t local_size_s[3] = {4, 16, 1};
4335 size_t global_size_s[3] = {static_cast<size_t>(width_s), static_cast<size_t>(height_s), 1};
4336 CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_s, local_size_s, 0, NULL, &evt));
4337 CL_CHECK(clWaitForEvents(1, &evt));
4338
4339 // copy transposed buffer contents to original buffers
4340 CL_CHECK(clEnqueueCopyBuffer(queue, qT_d, extra->q, 0, 0, q_size_bytes, 0, NULL, &evt));
4341 CL_CHECK(clWaitForEvents(1, &evt));
4342
4343 CL_CHECK(clEnqueueCopyBuffer(queue, dT_d, extra->d, 0, 0, d_size_bytes, 0, NULL, &evt));
4344 CL_CHECK(clWaitForEvents(1, &evt));
4345
4346 CL_CHECK(clReleaseMemObject(qT_d));
4347 CL_CHECK(clReleaseMemObject(dT_d));
4348
4349 CL_CHECK(clReleaseMemObject(q_d_image1D));
4350 CL_CHECK(clReleaseMemObject(d_d_image1D));
4351 CL_CHECK(clReleaseMemObject(qT_d_image1D));
4352 CL_CHECK(clReleaseMemObject(dT_d_image1D));
4353 } // end transpose
4354#endif // GGML_OPENCL_USE_ADRENO_KERNELS
4355
4356 return;
4357 }
4358 if (tensor->type == GGML_TYPE_Q6_K) {
4359 ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
4360 GGML_ASSERT(extra_orig && "Tensors in OpenCL backend should have been allocated and initialized");
4361
4362 // Allocate the new extra and create aliases from the original.
4363 ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
4364 ggml_tensor_extra_cl_q6_K * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q6_K();
4365
4366 size_t size_ql = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
4367 size_t size_qh = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/4;
4368 size_t size_s = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/16;
4369 size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
4370 GGML_ASSERT(size_ql + size_qh + size_s + size_d == ggml_nbytes(tensor) &&
4371 "Incorrect tensor size");
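// These sizes follow from the block_q6_K layout: each 256-element block
// stores 128 bytes of low 4-bit quants (ql), 64 bytes of high 2-bit quants
// (qh), 16 int8 scales (s) and one fp16 super-block scale (d), i.e. 1/2,
// 1/4, 1/16 and 2/256 bytes per element respectively. The
// nelements/blck_size*blck_size round trips above are identity operations
// (the element count is a multiple of the block size); they just spell out
// where each per-component byte count comes from.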
4372
4373 cl_int err;
4374 cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
4375 ggml_nbytes(tensor), NULL, &err);
4376 CL_CHECK(err);
4377 CL_CHECK(clEnqueueWriteBuffer(
4378 queue, data_device, CL_TRUE, 0,
4379 ggml_nbytes(tensor), data, 0, NULL, NULL));
4380
4381 cl_buffer_region region;
4382
4383 // Subbuffer for ql
4384 region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
4385 region.size = size_ql;
4386 extra->ql = clCreateSubBuffer(
4387 extra_orig->data_device, CL_MEM_READ_WRITE,
4388 CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
4389 CL_CHECK(err);
4390 auto previous_origin = region.origin;
4391
4392 // Subbuffer for qh
4393 region.origin = align_to(previous_origin + size_ql, backend_ctx->alignment);
4394 region.size = size_qh;
4395 extra->qh = clCreateSubBuffer(
4396 extra_orig->data_device, CL_MEM_READ_WRITE,
4397 CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
4398 CL_CHECK(err);
4399 previous_origin = region.origin;
4400
4401 // Subbuffer for scales
4402 region.origin = align_to(previous_origin + size_qh, backend_ctx->alignment);
4403 region.size = size_s;
4404 extra->s = clCreateSubBuffer(
4405 extra_orig->data_device, CL_MEM_READ_WRITE,
4406 CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
4407 CL_CHECK(err);
4408 previous_origin = region.origin;
4409
4410 // Create subbuffer for d.
4411 region.origin = align_to(previous_origin + size_s, backend_ctx->alignment);
4412 region.size = size_d;
4413 extra->d = clCreateSubBuffer(
4414 extra_orig->data_device, CL_MEM_READ_WRITE,
4415 CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
4416 CL_CHECK(err);
4417 previous_origin = region.origin;
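// Note on the pattern above: every sub-buffer origin is rounded up to the
// backend alignment because clCreateSubBuffer fails with
// CL_MISALIGNED_SUB_BUFFER_OFFSET when the origin is not a multiple of the
// device's base address alignment (CL_DEVICE_MEM_BASE_ADDR_ALIGN, reported
// in bits). For example, with a 128-byte alignment and size_ql = 1000, the
// qh region starts at align_to(origin_ql + 1000, 128), i.e. padded up to
// the next 128-byte boundary rather than immediately after ql.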
4418
4419 // Flatten the weights
4420 cl_kernel kernel = backend_ctx->kernel_convert_block_q6_K;
4421
4422 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
4423 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->ql));
4424 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh));
4425 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->s));
4426 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->d));
4427
4428 size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
4429 size_t local_work_size[] = {64, 1, 1};
4430
4431 cl_event evt;
4432 CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
4433 CL_CHECK(clWaitForEvents(1, &evt));
4434 CL_CHECK(clReleaseMemObject(data_device));
4435
4436 extra->size_ql = size_ql;
4437 extra->size_qh = size_qh;
4438 extra->size_s = size_s;
4439 extra->size_d = size_d;
4440
4441 tensor->extra = extra;
4442 return;
4443 }
4444#endif // GGML_OPENCL_SOA_Q
4445
4446 ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
4447 GGML_ASSERT(extra);
4448
4449 CL_CHECK(clEnqueueWriteBuffer(
4450 queue, extra->data_device, CL_TRUE, extra->offset + offset,
4451 size, data, 0, NULL, NULL));
4452
4453 GGML_UNUSED(buffer);
4454}
4455
4456static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
4457 GGML_ASSERT(tensor->extra);
4458
4459 ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
4460
4461 cl_context context = backend_ctx->context;
4462 cl_command_queue queue = backend_ctx->queue;
4463
4464 // Make sure all previously submitted commands on other devices have finished.
4465 sync_with_other_backends(backend_ctx);
4466
4467#ifdef GGML_OPENCL_SOA_Q
4468 // In end-to-end runs, get_tensor is usually used to get back the logits,
4469 // where we can simply do clEnqueueReadBuffer since they are f32.
4470 // However, in test-backend-ops, the GPU graph is copied to the CPU backend,
4471 // which requires reading back quantized weight tensors.
4472 // To properly support this, we need to restore block_q4_0 struct arrays
4473 // from the flattened buffers.
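// As a reminder of what is being restored: a block_q4_0 packs 32 weights
// as one fp16 scale followed by 16 bytes of 4-bit quants (18 bytes per
// block; see the block_q4_0 definition in the debugging utils below),
// whereas the flattened layout keeps all scales and all quants in two
// separate arrays. The restore kernels re-interleave those arrays back
// into this array-of-structs form.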
4474 if (tensor->type == GGML_TYPE_Q4_0) {
4475 ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *)tensor->extra;
4476
4477#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
4478 if (use_adreno_kernels(backend_ctx, tensor)) {
4479 cl_int err;
4480 cl_kernel kernel;
4481
4482 cl_int M = tensor->ne[1]; // ne01
4483 cl_int K = tensor->ne[0]; // ne00
4484
4485 GGML_ASSERT(K % 32 == 0);
4486 GGML_ASSERT(M % 4 == 0);
4487
4488 size_t size_q = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*ggml_blck_size(tensor->type)/2;
4489 size_t size_d = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t);
4490 GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
4491
4492 cl_mem buf_trans_q;
4493 cl_mem buf_trans_d;
4494
4495 CL_CHECK((buf_trans_q = clCreateBuffer(context, CL_MEM_READ_WRITE,
4496 size_q, NULL, &err), err));
4497 CL_CHECK((buf_trans_d = clCreateBuffer(context, CL_MEM_READ_WRITE,
4498 size_d, NULL, &err), err));
4499
4500 kernel = backend_ctx->kernel_transpose_16_buf;
4501
4502 // transpose q back
4503 cl_int stride_k_q = K/4;
4504 size_t local_size_q[3] = {64, 1, 1};
4505 size_t global_size_q[3] = {(size_t)M, (size_t)stride_k_q, 1};
4506
4507 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
4508 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_q));
4509 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_int), &M));
4510 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &stride_k_q));
4511
4512 CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
4513 global_size_q, local_size_q, 0, NULL, NULL));
4514
4515 // transpose scales back
4516 cl_int stride_k_d = K/32;
4517 size_t local_size_d[3] = {64, 1, 1};
4518 size_t global_size_d[3] = {(size_t)M, (size_t)stride_k_d, 1};
4519
4520 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->d));
4521 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d));
4522 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_int), &M));
4523 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &stride_k_d));
4524
4525 CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
4526 global_size_d, local_size_d, 0, NULL, NULL));
4527
4528 // unpack
4529 cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
4530 ggml_nbytes(tensor), NULL, &err);
4531 CL_CHECK(err);
4532
4533 cl_uchar mask_0F = 0x0F;
4534 cl_uchar mask_F0 = 0xF0;
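// The masks let the kernel split each packed byte back into its two 4-bit
// quants, e.g. for the byte 0xA3: (0xA3 & mask_0F) = 0x3 and
// (0xA3 & mask_F0) >> 4 = 0xA; which nibble maps to which weight is up to
// the noshuffle restore kernel.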
4535
4536 size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
4537 size_t local_work_size[] = {1, 1, 1};
4538
4539 kernel = backend_ctx->kernel_restore_block_q4_0_noshuffle;
4540 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_q));
4541 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d));
4542 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
4543 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_uchar), &mask_0F));
4544 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_uchar), &mask_F0));
4545
4546 CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
4547 global_work_size, local_work_size, 0, NULL, NULL));
4548
4549 // read back to host
4550 CL_CHECK(clEnqueueReadBuffer(
4551 queue, data_device, CL_TRUE, offset,
4552 size, data, 0, NULL, NULL));
4553
4554 CL_CHECK(clReleaseMemObject(data_device));
4555 CL_CHECK(clReleaseMemObject(buf_trans_q));
4556 CL_CHECK(clReleaseMemObject(buf_trans_d));
4557
4558 return;
4559 }
4560#endif
4561
4562 cl_int err;
4563 cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
4564 ggml_nbytes(tensor), NULL, &err);
4565 CL_CHECK(err);
4566
4567 cl_kernel kernel = backend_ctx->kernel_restore_block_q4_0;
4568 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
4569 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
4570 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
4571
4572 size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
4573 size_t local_work_size[] = {1, 1, 1};
4574
4575 cl_event evt;
4576 CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
4577 global_work_size, local_work_size, 0, NULL, &evt));
4578 CL_CHECK(clWaitForEvents(1, &evt));
4579 CL_CHECK(clEnqueueReadBuffer(
4580 queue, data_device, CL_TRUE, offset,
4581 size, data, 0, NULL, NULL));
4582 CL_CHECK(clReleaseMemObject(data_device));
4583 return;
4584 } else if (tensor->type == GGML_TYPE_MXFP4) {
4585 ggml_tensor_extra_cl_mxfp4 * extra = (ggml_tensor_extra_cl_mxfp4 *)tensor->extra;
4586
4587 cl_int err;
4588 cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
4589 ggml_nbytes(tensor), NULL, &err);
4590 CL_CHECK(err);
4591
4592#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
4593 if (use_adreno_moe_kernels(backend_ctx, tensor)) {
4594 cl_kernel kernel = backend_ctx->kernel_restore_block_mxfp4_trans;
4595
4596 int ne00 = tensor->ne[0];
4597 int ne01 = tensor->ne[1];
4598 int ne02 = tensor->ne[2];
4599 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
4600 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->e));
4601 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
4602 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &ne00));
4603 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_int), &ne01));
4604
4605 size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 32), static_cast<size_t>(ne02)};
4606 size_t local_work_size[3] = {64, 2, 1};
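// The first global dimension above is ne01 rounded up to the work-group
// width (64) so the launch stays uniform; rows past ne01 are expected to
// be bounds-checked inside the kernel.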
4607
4608 cl_event evt;
4609 CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
4610 global_work_size, local_work_size, 0, NULL, &evt));
4611 CL_CHECK(clWaitForEvents(1, &evt));
4612 CL_CHECK(clEnqueueReadBuffer(
4613 queue, data_device, CL_TRUE, offset,
4614 size, data, 0, NULL, NULL));
4615 CL_CHECK(clReleaseMemObject(data_device));
4616 return;
4617 }
4618#endif
4619 cl_kernel kernel = backend_ctx->kernel_restore_block_mxfp4;
4620 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
4621 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->e));
4622 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
4623
4624 size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
4625 size_t local_work_size[] = {1, 1, 1};
4626
4627 cl_event evt;
4628 CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
4629 global_work_size, local_work_size, 0, NULL, &evt));
4630 CL_CHECK(clWaitForEvents(1, &evt));
4631 CL_CHECK(clEnqueueReadBuffer(
4632 queue, data_device, CL_TRUE, offset,
4633 size, data, 0, NULL, NULL));
4634 CL_CHECK(clReleaseMemObject(data_device));
4635 return;
4636 }
4637 if (tensor->type == GGML_TYPE_Q8_0) {
4638 ggml_tensor_extra_cl_q8_0 * extra = (ggml_tensor_extra_cl_q8_0 *)tensor->extra;
4639
4640 cl_int err;
4641 cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
4642 ggml_nbytes(tensor), NULL, &err);
4643 CL_CHECK(err);
4644
4645#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
4646 if (enable_adreno_trans_weight(backend_ctx, tensor)) {
4647 cl_kernel kernel = backend_ctx->kernel_restore_block_q8_0_trans;
4648
4649 int ne00 = tensor->ne[0];
4650 int ne01 = tensor->ne[1];
4651 GGML_ASSERT(tensor->ne[2] == 1); // this path currently handles 2D weight matrices only
4652 GGML_ASSERT(tensor->ne[3] == 1);
4653
4654 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
4655 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
4656 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
4657 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &ne00));
4658 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_int), &ne01));
4659
4660 size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), 1, 1};
4661 size_t local_work_size[3] = {64, 1, 1};
4662
4663 cl_event evt;
4664 CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
4665 global_work_size, local_work_size, 0, NULL, &evt));
4666 CL_CHECK(clWaitForEvents(1, &evt));
4667
4668 CL_CHECK(clEnqueueReadBuffer(
4669 queue, data_device, CL_TRUE, offset,
4670 size, data, 0, NULL, NULL));
4671 CL_CHECK(clReleaseMemObject(data_device));
4672 return;
4673 }
4674#endif
4675 cl_kernel kernel = backend_ctx->kernel_restore_block_q8_0;
4676 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
4677 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
4678 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
4679
4680 size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
4681 size_t local_work_size[] = {1, 1, 1};
4682
4683 cl_event evt;
4684 CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
4685 global_work_size, local_work_size, 0, NULL, &evt));
4686 CL_CHECK(clWaitForEvents(1, &evt));
4687 CL_CHECK(clEnqueueReadBuffer(
4688 queue, data_device, CL_TRUE, offset,
4689 size, data, 0, NULL, NULL));
4690 CL_CHECK(clReleaseMemObject(data_device));
4691 return;
4692 }
4693 if (tensor->type == GGML_TYPE_Q6_K) {
4694 ggml_tensor_extra_cl_q6_K * extra = (ggml_tensor_extra_cl_q6_K *)tensor->extra;
4695
4696 cl_int err;
4697 cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
4698 ggml_nbytes(tensor), NULL, &err);
4699 CL_CHECK(err);
4700
4701 cl_kernel kernel = backend_ctx->kernel_restore_block_q6_K;
4702 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->ql));
4703 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qh));
4704 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->s));
4705 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
4706 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device));
4707
4708 size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
4709 size_t local_work_size[] = {1, 1, 1};
4710
4711 cl_event evt;
4712 CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
4713 global_work_size, local_work_size, 0, NULL, &evt));
4714 CL_CHECK(clWaitForEvents(1, &evt));
4715 CL_CHECK(clEnqueueReadBuffer(
4716 queue, data_device, CL_TRUE, offset,
4717 size, data, 0, NULL, NULL));
4718 CL_CHECK(clReleaseMemObject(data_device));
4719 return;
4720 }
4721#endif // GGML_OPENCL_SOA_Q
4722
4723 ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
4724
4725 CL_CHECK(clEnqueueReadBuffer(
4726 queue, extra->data_device, CL_TRUE, extra->offset + tensor->view_offs + offset,
4727 size, data, 0, NULL, NULL));
4728
4729 GGML_UNUSED(buffer);
4730}
4731
4732static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
4733 ggml_backend_dev_t dev = buffer->buft->device;
4734 ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev);
4735 cl_command_queue queue = backend_ctx->queue;
4736
4737 ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
4738 for (cl_mem buf : ctx->buffer) {
4739 CL_CHECK(clEnqueueFillBuffer(queue, buf, &value, sizeof(value), 0, buffer->size, 0, NULL, NULL));
4740 }
4741 CL_CHECK(clFinish(queue));
4742}
4743
4744static void ggml_backend_opencl_buffer_reset(ggml_backend_buffer_t buffer) {
4745 ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
4746 ctx->reset();
4747}
4748
4749static ggml_backend_buffer_i ggml_backend_opencl_buffer_interface = {
4750 /* .free_buffer = */ ggml_backend_opencl_buffer_free_buffer,
4751 /* .get_base = */ ggml_backend_opencl_buffer_get_base,
4752 /* .init_tensor = */ ggml_backend_opencl_buffer_init_tensor,
4753 /* .memset_tensor = */ NULL,
4754 /* .set_tensor = */ ggml_backend_opencl_buffer_set_tensor,
4755 /* .get_tensor = */ ggml_backend_opencl_buffer_get_tensor,
4756 /* .cpy_tensor = */ NULL,
4757 /* .clear = */ ggml_backend_opencl_buffer_clear,
4758 /* .reset = */ ggml_backend_opencl_buffer_reset,
4759};
4760
4761//
4762// buffer type
4763//
4764
4765static const char * ggml_backend_opencl_buffer_type_get_name(ggml_backend_buffer_type_t buffer_type) {
4766 return "OpenCL";
4767
4768 GGML_UNUSED(buffer_type);
4769}
4770
4771static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) {
4772 ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer_type->device);
4773
4774 // clCreateBuffer returns CL_INVALID_BUFFER_SIZE (-61) for size 0
4775 size = std::max(size, (size_t)1);
4776
4777 cl_int err;
4778 cl_mem mem = clCreateBuffer(backend_ctx->context, CL_MEM_READ_WRITE, size, NULL, &err);
4779 if (err != CL_SUCCESS) {
4780 GGML_LOG_INFO("%s: failed to allocate %.2f MiB\n", __func__, size / 1024.0 / 1024.0);
4781 return nullptr;
4782 }
4783
4784 ggml_backend_opencl_buffer_context * ctx = new ggml_backend_opencl_buffer_context(mem);
4785
4786 return ggml_backend_buffer_init(buffer_type, ggml_backend_opencl_buffer_interface, ctx, size);
4787}
4788
4789static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
4790 ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device);
4791 return backend_ctx->alignment;
4792}
4793
4794static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
4795 static size_t max_size = -1;
4796 if (max_size == (size_t)-1) {
4797 ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device);
4798 max_size = backend_ctx->max_alloc_size;
4799 }
4800 return max_size;
4801}
4802
4803static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
4804 return ggml_backend_is_opencl(backend);
4805
4806 UNUSED(buft);
4807}
4808
4809static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
4810 /* .get_name = */ ggml_backend_opencl_buffer_type_get_name,
4811 /* .alloc_buffer = */ ggml_backend_opencl_buffer_type_alloc_buffer,
4812 /* .get_alignment = */ ggml_backend_opencl_buffer_type_get_alignment,
4813 /* .get_max_size = */ ggml_backend_opencl_buffer_type_get_max_size,
4814 /* .get_alloc_size = */ NULL,
4815 /* .is_host = */ NULL,
4816};
4817
4818//
4819// backend device
4820//
4821
4822static const char * ggml_backend_opencl_device_get_name(ggml_backend_dev_t dev) {
4823 return "GPUOpenCL";
4824
4825 GGML_UNUSED(dev);
4826}
4827
4828static const char * ggml_backend_opencl_device_get_description(ggml_backend_dev_t dev) {
4829 ggml_backend_opencl_device_context *dev_ctx = (ggml_backend_opencl_device_context *) dev->context;
4830 return dev_ctx->device_name.c_str();
4831}
4832
4833static void ggml_backend_opencl_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
4834 *free = 0;
4835 *total = 0;
4836
4837 GGML_UNUSED(dev);
4838}
4839
4840static enum ggml_backend_dev_type ggml_backend_opencl_device_get_type(ggml_backend_dev_t dev) {
4841 return GGML_BACKEND_DEVICE_TYPE_GPU;
4842
4843 GGML_UNUSED(dev);
4844}
4845
4846static void ggml_backend_opencl_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
4847 props->name = ggml_backend_opencl_device_get_name(dev);
4848 props->description = ggml_backend_opencl_device_get_description(dev);
4849 props->type = ggml_backend_opencl_device_get_type(dev);
4850 ggml_backend_opencl_device_get_memory(dev, &props->memory_free, &props->memory_total);
4851 props->caps = ggml_backend_dev_caps {
4852 /* .async = */ false,
4853 /* .host_buffer = */ false,
4854 /* .buffer_from_host_ptr = */ false,
4855 /* .events = */ false,
4856 };
4857}
4858
4859static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, const char * params) {
4860 ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(dev);
4861 // We are handing out a new reference to the backend, so increase ref_count.
4862 backend_ctx->ref_count++;
4863
4864 ggml_backend_t backend = new ggml_backend {
4865 /* .guid = */ ggml_backend_opencl_guid(),
4866 /* .interface = */ ggml_backend_opencl_i,
4867 /* .device = */ dev,
4868 /* .context = */ backend_ctx,
4869 };
4870
4871 return backend;
4872
4873 GGML_UNUSED(params);
4874}
4875
4876static ggml_backend_buffer_type_t ggml_backend_opencl_device_get_buffer_type(ggml_backend_dev_t dev) {
4877 auto * dev_ctx = static_cast<ggml_backend_opencl_device_context *>(dev->context);
4878
4879 dev_ctx->buffer_type = ggml_backend_buffer_type{
4880 /* .iface = */ ggml_backend_opencl_buffer_type_interface,
4881 /* .device = */ dev,
4882 /* .context = */ nullptr,
4883 };
4884
4885 return &dev_ctx->buffer_type;
4886}
4887
4888static ggml_backend_buffer_t ggml_backend_opencl_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
4889 GGML_UNUSED(dev);
4890 GGML_UNUSED(ptr);
4891 GGML_UNUSED(size);
4892 GGML_UNUSED(max_tensor_size);
4893 return nullptr;
4894}
4895
4896static bool ggml_backend_opencl_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
4897 return ggml_opencl_supports_op(dev, op);
4898}
4899
4900static bool ggml_backend_opencl_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
4901 // Check that 'dev' and 'buft' are objects belonging to this backend; reject anything else.
4902 if (dev->iface.get_name != ggml_backend_opencl_device_get_name ||
4903 buft->iface.get_name != ggml_backend_opencl_buffer_type_get_name) {
4904 return false;
4905 }
4906
4907 // Check cl_context is the same. clEnqueue* commands may not use
4908 // buffers from another cl_context.
4909 ggml_backend_opencl_context * backend_ctx0 = ggml_cl2_init(dev);
4910 ggml_backend_opencl_context * backend_ctx1 = ggml_cl2_init(buft->device);
4911 return backend_ctx0->context == backend_ctx1->context;
4912}
4913
4914namespace /* anonymous */ {
4915struct ggml_backend_device_i ggml_backend_opencl_device_i = {
4916 /* .get_name = */ ggml_backend_opencl_device_get_name,
4917 /* .get_description = */ ggml_backend_opencl_device_get_description,
4918 /* .get_memory = */ ggml_backend_opencl_device_get_memory,
4919 /* .get_type = */ ggml_backend_opencl_device_get_type,
4920 /* .get_props = */ ggml_backend_opencl_device_get_props,
4921 /* .init_backend = */ ggml_backend_opencl_device_init,
4922 /* .get_buffer_type = */ ggml_backend_opencl_device_get_buffer_type,
4923 /* .get_host_buffer_type = */ NULL,
4924 /* .buffer_from_host_ptr = */ ggml_backend_opencl_device_buffer_from_ptr,
4925 /* .supports_op = */ ggml_backend_opencl_device_supports_op,
4926 /* .supports_buft = */ ggml_backend_opencl_device_supports_buft,
4927 /* .offload_op = */ NULL,
4928 /* .event_new = */ NULL,
4929 /* .event_free = */ NULL,
4930 /* .event_synchronize = */ NULL,
4931};
4932}
4933
4934// Backend registry
4935
4936static const char * ggml_backend_opencl_reg_get_name(ggml_backend_reg_t reg) {
4937 return "OpenCL";
4938
4939 GGML_UNUSED(reg);
4940}
4941
4942static size_t ggml_backend_opencl_reg_device_count(ggml_backend_reg_t reg) {
4943 return g_ggml_backend_opencl_devices.size();
4944
4945 GGML_UNUSED(reg);
4946}
4947
4948static ggml_backend_dev_t ggml_backend_opencl_reg_device_get(ggml_backend_reg_t reg, size_t index) {
4949 GGML_ASSERT(index < ggml_backend_opencl_reg_device_count(reg));
4950
4951 return &g_ggml_backend_opencl_devices[index];
4952
4953 GGML_UNUSED(reg);
4954 GGML_UNUSED(index);
4955}
4956
4957static struct ggml_backend_reg_i ggml_backend_opencl_reg_i = {
4958 /* .get_name = */ ggml_backend_opencl_reg_get_name,
4959 /* .device_count = */ ggml_backend_opencl_reg_device_count,
4960 /* .device_get = */ ggml_backend_opencl_reg_device_get,
4961 /* .get_proc_address = */ NULL,
4962};
4963
4964ggml_backend_reg_t ggml_backend_opencl_reg(void) {
4965 static std::mutex mutex;
4966 static ggml_backend_reg reg;
4967 static bool initialized = false;
4968 std::lock_guard<std::mutex> lock(mutex);
4969
4970 if (initialized) {
4971 return &reg;
4972 }
4973 initialized = true;
4974
4975 g_ggml_backend_opencl_devices = ggml_opencl_probe_devices(&reg);
4976
4977 reg = ggml_backend_reg{
4978 /* .api_version = */ GGML_BACKEND_API_VERSION,
4979 /* .iface = */ ggml_backend_opencl_reg_i,
4980 /* .context = */ NULL,
4981 };
4982
4983 return &reg;
4984}
4985
4986GGML_BACKEND_DL_IMPL(ggml_backend_opencl_reg)
4987
4988//------------------------------------------------------------------------------
4989// Debugging utils
4990//------------------------------------------------------------------------------
4991#if 0
4992#define QK4_0 32
4993typedef struct {
4994 ggml_fp16_t d; // delta
4995 uint8_t qs[QK4_0 / 2]; // nibbles / quants
4996} block_q4_0;
4997static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2,
4998 "wrong q4_0 block size/padding");
4999
5000#include <math.h>
5001#ifdef __cplusplus
5002#include "half.hpp"
5003#endif
5004
5005static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tensor) {
5006 void * buf = malloc(ggml_nbytes(tensor));
5007
5008 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5009 cl_command_queue queue = backend_ctx->queue;
5010#ifdef GGML_OPENCL_SOA_Q
5011 void * buf_q;
5012 void * buf_d;
5013#endif
5014
5015 // Make sure everything is done.
5016 CL_CHECK(clFinish(queue));
5017
5018#ifdef GGML_OPENCL_SOA_Q
5019 if (tensor->type == GGML_TYPE_Q4_0) {
5020 ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *) tensor->extra;
5021 GGML_ASSERT(extra);
5022
5023 size_t size_q = ggml_nelements(tensor)/QK4_0 * QK4_0/2;
5024 size_t size_d = ggml_nelements(tensor)/QK4_0 * sizeof(ggml_fp16_t);
5025 GGML_ASSERT(size_q + size_d == ggml_nbytes(tensor));
5026 buf_q = malloc(size_q);
5027 buf_d = malloc(size_d);
5028
5029 CL_CHECK(clEnqueueReadBuffer(queue, extra->q, CL_TRUE, 0, size_q, buf_q, 0, NULL, NULL));
5030 CL_CHECK(clEnqueueReadBuffer(queue, extra->d, CL_TRUE, 0, size_d, buf_d, 0, NULL, NULL));
5031 CL_CHECK(clFinish(queue));
5032 } else if (tensor->type == GGML_TYPE_MXFP4) {
5033 ggml_tensor_extra_cl_mxfp4 * extra = (ggml_tensor_extra_cl_mxfp4 *) tensor->extra;
5034 GGML_ASSERT(extra);
5035
5036 size_t size_q = ggml_nelements(tensor)/QK_MXFP4 * QK_MXFP4/2;
5037 size_t size_e = ggml_nelements(tensor)/QK_MXFP4 * sizeof(char);
5038 GGML_ASSERT(size_q + size_e == ggml_nbytes(tensor));
5039 buf_q = malloc(size_q);
5040 buf_d = malloc(size_e);
5041
5042 CL_CHECK(clEnqueueReadBuffer(queue, extra->q, CL_TRUE, 0, size_q, buf_q, 0, NULL, NULL));
5043 CL_CHECK(clEnqueueReadBuffer(queue, extra->e, CL_TRUE, 0, size_e, buf_d, 0, NULL, NULL));
5044 CL_CHECK(clFinish(queue));
5045 } else {
5046 // Read out the tensor from GPU memory.
5047 ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
5048 GGML_ASSERT(extra);
5049
5050 CL_CHECK(clEnqueueReadBuffer(queue, extra->data_device, CL_TRUE,
5051 extra->offset, ggml_nbytes(tensor), buf, 0, NULL, NULL));
5052 CL_CHECK(clFinish(queue));
5053 }
5054#else
5055 // Read out the tensor from GPU memory.
5056 ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
5057 GGML_ASSERT(extra);
5058
5059 CL_CHECK(clEnqueueReadBuffer(queue, extra->data_device, CL_TRUE,
5060 extra->offset, ggml_nbytes(tensor), buf, 0, NULL, NULL));
5061 CL_CHECK(clFinish(queue));
5062#endif // GGML_OPENCL_SOA_Q
5063
5064 // Open file and dump.
5065 char fname[512];
5066 snprintf(fname, sizeof(fname), "./tensor-dumps/%s.txt", tensor->name);
5067 FILE * f = fopen(fname, "w");
5068 if (!f) {
5069 printf("Failed to open %s\n", fname);
5070 free(buf); return; // avoid leaking the staging buffer
5071 }
5072
5073 if (tensor->type == GGML_TYPE_F32) {
5074 float * data = (float *) buf;
5075 for (int i = 0; i < ggml_nelements(tensor); ++i) {
5076 if (isnan(data[i])) {
5077 printf("NaN found: %s\n", tensor->name);
5078 break;
5079 }
5080 fprintf(f, "%f\n", data[i]);
5081 }
5082 } else if (tensor->type == GGML_TYPE_I32) {
5083 int * data = (int *) buf;
5084 for (int i = 0; i < ggml_nelements(tensor); ++i) {
5085 // no NaN check here: integer data cannot be NaN
5089 fprintf(f, "%d\n", data[i]);
5090 }
5091 } else if (tensor->type == GGML_TYPE_F16) {
5092#ifdef __cplusplus
5093 half_float::half * data = (half_float::half *) buf;
5094 for (int i = 0; i < ggml_nelements(tensor); ++i) {
5095 if (std::isnan(data[i])) {
5096 printf("NaN found: %s\n", tensor->name);
5097 break;
5098 }
5099 fprintf(f, "%f\n", float(data[i]));
5100 }
5101#endif
5102 } else if (tensor->type == GGML_TYPE_Q4_0) {
5103#ifdef GGML_OPENCL_SOA_Q
5104 ggml_fp16_t * data_d = (ggml_fp16_t *)buf_d;
5105 unsigned char * data_q = (unsigned char *)buf_q;
5106
5107 for (int i = 0; i < ggml_nelements(tensor)/QK4_0; ++i) {
5108 fprintf(f, "%04x, ", data_d[i]);
5109 for (int k = 0; k < QK4_0/2; ++k) {
5110 fprintf(f, "%02x, ", data_q[k]);
5111 }
5112 fprintf(f, "\n");
5113 data_q += QK4_0/2;
5114 }
5115 free(buf_d);
5116 free(buf_q);
5117#else
5118 block_q4_0 * data = (block_q4_0 *) buf;
5119 for (int i = 0; i < ggml_nelements(tensor)/QK4_0; ++i) {
5120 fprintf(f, "%04x, ", data[i].d);
5121 for (int k = 0; k < QK4_0/2; ++k) {
5122 fprintf(f, "%02x, ", data[i].qs[k]);
5123 }
5124 fprintf(f, "\n");
5125 }
5126#endif // GGML_OPENCL_SOA_Q
5127 }
5128 free(buf);
5129 fflush(f);
5130 fclose(f);
5131}
5132#else
5133#define dump_tensor(tensor)
5134#endif
5135
5136//------------------------------------------------------------------------------
5137// Ops
5138//------------------------------------------------------------------------------
5139
5140static bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
5141 const int64_t ne10 = src1->ne[0];
5142
5143 const int64_t ne0 = dst->ne[0];
5144 const int64_t ne1 = dst->ne[1];
5145
5146 // TODO: find the optimal values for these
5147 return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
5148 src1->type == GGML_TYPE_F32 &&
5149 dst->type == GGML_TYPE_F32 &&
5150 (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
5151}
5152
5153 // Copy a noncontiguous tensor into a contiguous tensor. ne[] remains the same
5154 // but nb[] is recalculated such that the tensor is contiguous.
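// For example, copying an f32 view with ne = {4, 3, 1, 1} and
// nb = {4, 32, 96, 96} (the first 4 columns of a wider row-major tensor)
// produces nb = {4, 16, 48, 48}: nb0 = sizeof(float), nb1 = 4 * 4,
// nb2 = 4 * 4 * 3 and nb3 = 4 * 4 * 3 * 1.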
5155static void ggml_cl_copy_to_contiguous(ggml_backend_t backend, const ggml_tensor * src, cl_mem dst,
5156 cl_ulong &nb0, cl_ulong &nb1, cl_ulong &nb2, cl_ulong &nb3) {
5157 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5158
5159 const int tensor_type_size = ggml_type_size(src->type);
5160
5161 const int ne00 = src->ne[0];
5162 const int ne01 = src->ne[1];
5163 const int ne02 = src->ne[2];
5164 const int ne03 = src->ne[3];
5165
5166 const cl_ulong nb00 = src->nb[0];
5167 const cl_ulong nb01 = src->nb[1];
5168 const cl_ulong nb02 = src->nb[2];
5169 const cl_ulong nb03 = src->nb[3];
5170
5171 const int ne0 = src->ne[0];
5172 const int ne1 = src->ne[1];
5173 const int ne2 = src->ne[2];
5174 const int ne3 = src->ne[3];
5175
5176 nb0 = tensor_type_size;
5177 nb1 = tensor_type_size*ne00;
5178 nb2 = tensor_type_size*ne00*ne01;
5179 nb3 = tensor_type_size*ne00*ne01*ne02;
5180
5181 ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *)src->extra;
5182
5183 cl_ulong offset0 = extra->offset + src->view_offs;
5184 cl_ulong offsetd = 0;
5185
5186 cl_kernel kernel;
5187
5188 switch (src->type) {
5189 case GGML_TYPE_F32:
5190 kernel = backend_ctx->kernel_cpy_f32_f32;
5191 break;
5192 case GGML_TYPE_F16:
5193 kernel = backend_ctx->kernel_cpy_f16_f16;
5194 break;
5195 default:
5196 GGML_ASSERT(false && "not implemented");
5197 }
5198
5199 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->data_device));
5200 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
5201 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst));
5202 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
5203 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
5204 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
5205 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
5206 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
5207 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
5208 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
5209 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
5210 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
5211 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne0));
5212 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne1));
5213 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne2));
5214 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne3));
5215 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb0));
5216 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb1));
5217 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb2));
5218 CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb3));
5219
5220 const int nth = MIN(64, ne00);
5221
5222 size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
5223 size_t local_work_size[] = {(size_t)nth, 1, 1};
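// One work-group of nth (<= 64) threads is launched per source row:
// ne01 * nth work-items along x, with ne02 and ne03 mapped to y and z.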
5224
5225 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src);
5226}
5227
5228static void ggml_cl_nop(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
5229 UNUSED(backend);
5230 UNUSED(src0);
5231 UNUSED(src1);
5232 UNUSED(dst);
5233}
5234
5235static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
5236 GGML_ASSERT(src0);
5237 GGML_ASSERT(src0->extra);
5238 GGML_ASSERT(src1);
5239 GGML_ASSERT(src1->extra);
5240 GGML_ASSERT(dst);
5241 GGML_ASSERT(dst->extra);
5242
5243 const int ne00 = src0->ne[0];
5244 const cl_ulong nb01 = src0->nb[1];
5245 const cl_ulong nb02 = src0->nb[2];
5246 const cl_ulong nb03 = src0->nb[3];
5247 const int ne10 = src1->ne[0];
5248 const cl_ulong nb10 = src1->nb[0];
5249 const int ne11 = src1->ne[1];
5250 const int ne12 = src1->ne[2];
5251 const cl_ulong nb11 = src1->nb[1];
5252 const cl_ulong nb12 = src1->nb[2];
5253 const cl_ulong nb1 = dst->nb[1];
5254 const cl_ulong nb2 = dst->nb[2];
5255 const cl_ulong nb3 = dst->nb[3];
5256
5257 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5258
5259 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
5260 ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
5261 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
5262
5263 cl_ulong offset0 = extra0->offset + src0->view_offs;
5264 cl_ulong offset1 = extra1->offset + src1->view_offs;
5265 cl_ulong offsetd = extrad->offset + dst->view_offs;
5266
5267 cl_kernel kernel;
5268
5269 switch (src0->type) {
5270 case GGML_TYPE_F32:
5271 kernel = backend_ctx->kernel_get_rows_f32;
5272 break;
5273 case GGML_TYPE_F16:
5274 kernel = backend_ctx->kernel_get_rows_f16;
5275 break;
5276 case GGML_TYPE_Q4_0:
5277 kernel = backend_ctx->kernel_get_rows_q4_0;
5278 break;
5279 default:
5280 GGML_ASSERT(false && "not implemented");
5281 }
5282
5283 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
5284 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
5285 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
5286 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
5287 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
5288 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
5289 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
5290 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
5291 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
5292 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
5293 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10));
5294 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb10));
5295 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
5296 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
5297 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb1));
5298 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb2));
5299 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb3));
5300
5301 size_t global_work_size[] = {(size_t)ne10*64, (size_t)ne11, (size_t)ne12};
5302 size_t local_work_size[] = {64, 1, 1};
5303
5304 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
5305}
5306
5307static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
5308 GGML_ASSERT(src0);
5309 GGML_ASSERT(src0->extra);
5310 GGML_ASSERT(src1);
5311 GGML_ASSERT(src1->extra);
5312 GGML_ASSERT(dst);
5313 GGML_ASSERT(dst->extra);
5314 GGML_ASSERT(src1->type == GGML_TYPE_I64 || src1->type == GGML_TYPE_I32);
5315
5316 // ne0 = ne00
5317 // ne2 = ne02
5318 // ne3 = ne03
5319
5320 const int ne01 = src0->ne[1];
5321 const int ne02 = src0->ne[2];
5322 const int ne03 = src0->ne[3];
5323
5324 const cl_ulong nb01 = src0->nb[1];
5325 const cl_ulong nb02 = src0->nb[2];
5326 const cl_ulong nb03 = src0->nb[3];
5327
5328 const int ne11 = src1->ne[1];
5329 const int ne12 = src1->ne[2];
5330
5331 const cl_ulong nb10 = src1->nb[0];
5332 const cl_ulong nb11 = src1->nb[1];
5333 const cl_ulong nb12 = src1->nb[2];
5334
5335 const int ne0 = dst->ne[0];
5336
5337 const cl_ulong nb1 = dst->nb[1];
5338 const cl_ulong nb2 = dst->nb[2];
5339 const cl_ulong nb3 = dst->nb[3];
5340
5341 const int nblk0 = ne0/ggml_blck_size(dst->type);
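// nblk0 is the number of blocks per output row; for the f32/f16
// destinations handled below the block size is 1, so nblk0 == ne0. The
// division keeps the indexing correct should quantized destinations be
// added later.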
5342
5343 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5344
5345 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
5346 ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
5347 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
5348
5349 cl_ulong offset0 = extra0->offset + src0->view_offs;
5350 cl_ulong offset1 = extra1->offset + src1->view_offs;
5351 cl_ulong offsetd = extrad->offset + dst->view_offs;
5352
5353 cl_kernel kernel;
5354
5355 switch (dst->type) {
5356 case GGML_TYPE_F32:
5357 if (src1->type == GGML_TYPE_I64) {
5358 kernel = backend_ctx->kernel_set_rows_f32_i64;
5359 } else {
5360 kernel = backend_ctx->kernel_set_rows_f32_i32;
5361 }
5362 break;
5363 case GGML_TYPE_F16:
5364 if (src1->type == GGML_TYPE_I64) {
5365 kernel = backend_ctx->kernel_set_rows_f16_i64;
5366 } else {
5367 kernel = backend_ctx->kernel_set_rows_f16_i32;
5368 }
5369 break;
5370 default:
5371 GGML_ABORT("not implemented");
5372 }
5373
5374 fastdiv_vals ne11_ = init_fastdiv_values(ne11);
5375 fastdiv_vals ne12_ = init_fastdiv_values(ne12);
5376
5377 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
5378 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
5379 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
5380 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
5381 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
5382 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
5383 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne01));
5384 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
5385 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
5386 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
5387 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(fastdiv_vals), &ne11_));
5388 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(fastdiv_vals), &ne12_));
5389 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb10));
5390 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb11));
5391 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb12));
5392 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &nblk0));
5393 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb1));
5394 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb2));
5395 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb3));
5396
5397 int nth0 = 64;
5398 if (backend_ctx->gpu_family == INTEL) {
5399 nth0 = 32;
5400 } else if (backend_ctx->gpu_family == ADRENO) {
5401 nth0 = 64;
5402 }
5403
5404 int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
5405 while (nth0 < nblk0 && nth0 < max_workgroup_size) {
5406 nth0 *= 2;
5407 }
5408
5409 int rows_per_workgroup = 1;
5410 if (nth0 > nblk0) {
5411 rows_per_workgroup = nth0 / nblk0;
5412 nth0 = nblk0;
5413 }
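// Worked examples of the sizing above, assuming a 256-wide work-group
// limit: nblk0 = 768 doubles nth0 64 -> 128 -> 256, one work-group
// striding over each row; nblk0 = 16 leaves the loop untouched, then
// shrinks nth0 to 16 with rows_per_workgroup = 4, so each work-group
// covers 4 rows.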
5414
5415 size_t global_work_size[] = {
5416 (size_t)(ne01 + rows_per_workgroup - 1)/rows_per_workgroup*nth0,
5417 (size_t)ne02*rows_per_workgroup,
5418 (size_t)ne03};
5419 size_t local_work_size[] = {(size_t)nth0, (size_t)rows_per_workgroup, 1};
5420
5421 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
5422}
5423
5424static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
5425 GGML_ASSERT(src0);
5426 GGML_ASSERT(src0->extra);
5427 GGML_ASSERT(src1);
5428 GGML_ASSERT(src1->extra);
5429 GGML_ASSERT(dst);
5430 GGML_ASSERT(dst->extra);
5431
5432 const int ne00 = src0->ne[0];
5433 const int ne01 = src0->ne[1];
5434 const int ne02 = src0->ne[2];
5435 const int ne03 = src0->ne[3];
5436
5437 const cl_ulong nb00 = src0->nb[0];
5438 const cl_ulong nb01 = src0->nb[1];
5439 const cl_ulong nb02 = src0->nb[2];
5440 const cl_ulong nb03 = src0->nb[3];
5441
5442 const int ne10 = src1->ne[0];
5443 const int ne11 = src1->ne[1];
5444 const int ne12 = src1->ne[2];
5445 const int ne13 = src1->ne[3];
5446
5447 const cl_ulong nb10 = src1->nb[0];
5448 const cl_ulong nb11 = src1->nb[1];
5449 const cl_ulong nb12 = src1->nb[2];
5450 const cl_ulong nb13 = src1->nb[3];
5451
5452 const int ne0 = dst->ne[0];
5453 const int ne1 = dst->ne[1];
5454 const int ne2 = dst->ne[2];
5455 const int ne3 = dst->ne[3];
5456
5457 const cl_ulong nb0 = dst->nb[0];
5458 const cl_ulong nb1 = dst->nb[1];
5459 const cl_ulong nb2 = dst->nb[2];
5460 const cl_ulong nb3 = dst->nb[3];
5461
5462 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5463
5464 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
5465 ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
5466 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
5467
5468 cl_ulong offset0 = extra0->offset + src0->view_offs;
5469 cl_ulong offset1 = extra1->offset + src1->view_offs;
5470 cl_ulong offsetd = extrad->offset + dst->view_offs;
5471
5472 cl_kernel kernel;
5473
5474 const bool bcast_row = ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0;
5475
5476 if (bcast_row) {
5477 GGML_ASSERT(ggml_is_contiguous(src0));
5478 GGML_ASSERT(ne11 == 1);
5479 }
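// bcast_row means src1 is a single contiguous row (e.g. a bias) that is
// added to every row of src0; requiring ne00 % 4 == 0 and ne10 % 4 == 0
// lets the row kernel operate on 4-element vectors.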
5480
5481 if (dst->type == GGML_TYPE_F32) {
5482 GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32);
5483 if (bcast_row) {
5484 kernel = backend_ctx->kernel_add_row;
5485 const int ne = ne00 / 4;
5486 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
5487 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
5488 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
5489 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
5490 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
5491 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
5492 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
5493 } else {
5494 kernel = backend_ctx->kernel_add;
5495 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
5496 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
5497 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
5498 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
5499 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
5500 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
5501 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
5502 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
5503 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
5504 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03));
5505 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
5506 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
5507 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
5508 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
5509 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10));
5510 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11));
5511 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12));
5512 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13));
5513 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10));
5514 CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11));
5515 CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12));
5516 CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13));
5517 CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0));
5518 CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1));
5519 CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2));
5520 CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3));
5521 CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0));
5522 CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1));
5523 CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2));
5524 CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3));
5525 }
5526 } else if (dst->type == GGML_TYPE_F16) {
5527 GGML_ASSERT(src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F32);
5528 GGML_ASSERT(src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32);
5529 const int type_src0 = (src0->type == GGML_TYPE_F32);
5530 const int type_src1 = (src1->type == GGML_TYPE_F32);
5531 if (bcast_row) {
5532 kernel = backend_ctx->kernel_add_row_f16;
5533 const int ne = ne00 / 4;
5534 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
5535 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
5536 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
5537 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
5538 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
5539 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
5540 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
5541 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &type_src0));
5542 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &type_src1));
5543 } else {
5544 kernel = backend_ctx->kernel_add_f16;
5545 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
5546 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
5547 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
5548 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
5549 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
5550 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
5551 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
5552 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
5553 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
5554 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03));
5555 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
5556 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
5557 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
5558 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
5559 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10));
5560 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11));
5561 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12));
5562 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13));
5563 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10));
5564 CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11));
5565 CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12));
5566 CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13));
5567 CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0));
5568 CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1));
5569 CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2));
5570 CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3));
5571 CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0));
5572 CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1));
5573 CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2));
5574 CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3));
5575 CL_CHECK(clSetKernelArg(kernel, 30, sizeof(int), &type_src0));
5576 CL_CHECK(clSetKernelArg(kernel, 31, sizeof(int), &type_src1));
5577 }
5578 } else {
5579 GGML_ASSERT(false && "unsupported data types for add");
5580 }
5581
5582 if (bcast_row) {
5583 int n = ggml_nelements(dst)/4;
5584 size_t global_work_size[] = {(size_t)n, 1, 1};
5585 size_t local_work_size[] = {64, 1, 1};
5586
5587 size_t * local_work_size_ptr = local_work_size;
5588 if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
5589 local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
5590 }
5591
5592 backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size_ptr, dst);
5593 } else {
5594 unsigned int nth = MIN(64, ne0);
5595 size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
5596 size_t local_work_size[] = {nth, 1, 1};
5597
5598 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
5599 }
5600}
5601
5602static void ggml_cl_add_id(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
5603 GGML_ASSERT(src0);
5604 GGML_ASSERT(src0->extra);
5605 GGML_ASSERT(src1);
5606 GGML_ASSERT(src1->extra);
5607 GGML_ASSERT(dst);
5608 GGML_ASSERT(dst->extra);
5609
5610 const ggml_tensor * src2 = dst->src[2];
5611 GGML_ASSERT(src2);
5612 GGML_ASSERT(src2->extra);
5613
5614 GGML_ASSERT(src0->type == GGML_TYPE_F32);
5615 GGML_ASSERT(src1->type == GGML_TYPE_F32);
5616 GGML_ASSERT(src2->type == GGML_TYPE_I32);
5617 GGML_ASSERT(dst->type == GGML_TYPE_F32);
5618
5619 GGML_ASSERT(ggml_is_contiguous_rows(src0));
5620
5621 const int ne00 = src0->ne[0];
5622 const int ne01 = src0->ne[1];
5623 const int ne02 = src0->ne[2];
5624
5625 const cl_ulong nb01 = src0->nb[1];
5626 const cl_ulong nb02 = src0->nb[2];
5627
5628 const cl_ulong nb11 = src1->nb[1];
5629
5630 const cl_ulong nb21 = src2->nb[1];
5631
5632 const int ne0 = dst->ne[0];
5633 const int ne1 = dst->ne[1];
5634
5635 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5636
5637 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
5638 ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
5639 ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
5640 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
5641
5642 cl_ulong offset0 = extra0->offset + src0->view_offs;
5643 cl_ulong offset1 = extra1->offset + src1->view_offs;
5644 cl_ulong offset2 = extra2->offset + src2->view_offs;
5645 cl_ulong offsetd = extrad->offset + dst->view_offs;
5646
5647 cl_kernel kernel = backend_ctx->kernel_add_id;
5648
5649 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
5650 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
5651 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
5652 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
5653 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
5654 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
5655 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
5656 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
5657 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
5658 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
5659 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb11));
5660 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb21));
5661 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne0));
5662 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne1));
5663
5664 int nth = MIN(ne00, (int) backend_ctx->get_kernel_workgroup_size(kernel));
5665 size_t global_work_size[] = { (size_t)ne01*nth, (size_t)ne02, 1 };
5666 size_t local_work_size[] = { (size_t)nth, 1, 1 };
5667
5668 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
5669}
5670
5671static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
5672 GGML_ASSERT(src0);
5673 GGML_ASSERT(src0->extra);
5674 GGML_ASSERT(src1);
5675 GGML_ASSERT(src1->extra);
5676 GGML_ASSERT(dst);
5677 GGML_ASSERT(dst->extra);
5678
5679 GGML_ASSERT(src0->type == src1->type);
5680 GGML_ASSERT(src0->type == dst->type);
5681 GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
5682
5683 const int ne00 = src0->ne[0];
5684 const int ne01 = src0->ne[1];
5685 const int ne02 = src0->ne[2];
5686 const int ne03 = src0->ne[3];
5687
5688 const cl_ulong nb00 = src0->nb[0];
5689 const cl_ulong nb01 = src0->nb[1];
5690 const cl_ulong nb02 = src0->nb[2];
5691 const cl_ulong nb03 = src0->nb[3];
5692
5693 const int ne10 = src1->ne[0];
5694 const int ne11 = src1->ne[1];
5695 const int ne12 = src1->ne[2];
5696 const int ne13 = src1->ne[3]; UNUSED(ne13);
5697
5698 const cl_ulong nb10 = src1->nb[0];
5699 const cl_ulong nb11 = src1->nb[1];
5700 const cl_ulong nb12 = src1->nb[2];
5701 const cl_ulong nb13 = src1->nb[3]; UNUSED(nb13);
5702
5703 const int ne0 = dst->ne[0];
5704 const int ne1 = dst->ne[1];
5705 const int ne2 = dst->ne[2];
5706 const int ne3 = dst->ne[3];
5707
5708 const cl_ulong nb0 = dst->nb[0];
5709 const cl_ulong nb1 = dst->nb[1];
5710 const cl_ulong nb2 = dst->nb[2];
5711 const cl_ulong nb3 = dst->nb[3];
5712
5713 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5714
5715 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
5716 ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
5717 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
5718
5719 cl_ulong offset0 = extra0->offset + src0->view_offs;
5720 cl_ulong offset1 = extra1->offset + src1->view_offs;
5721 cl_ulong offsetd = extrad->offset + dst->view_offs;
5722
5723 bool bcast_row = false;
5724 cl_kernel kernel;
5725
5726 if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
5727 GGML_ASSERT(ggml_is_contiguous(src0));
5728
5729 // src1 is a row
5730 GGML_ASSERT(ne11 == 1);
5731
5732 bcast_row = true;
5733 int ne = ne00 / 4;
5734
5735 if (src0->type == GGML_TYPE_F32) {
5736 kernel = backend_ctx->kernel_mul_row;
5737 } else {
5738 kernel = backend_ctx->kernel_mul_row_f16;
5739 }
5740
5741 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
5742 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
5743 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
5744 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
5745 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
5746 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
5747 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
5748 } else {
5749 if (src0->type == GGML_TYPE_F32) {
5750 kernel = backend_ctx->kernel_mul;
5751 } else {
5752 kernel = backend_ctx->kernel_mul_f16;
5753 }
5754
5755 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
5756 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
5757 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
5758 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
5759 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
5760 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
5761 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
5762 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
5763 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
5764 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03));
5765 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
5766 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
5767 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
5768 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
5769 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10));
5770 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11));
5771 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12));
5772 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13));
5773 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10));
5774 CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11));
5775 CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12));
5776 CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13));
5777 CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0));
5778 CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1));
5779 CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2));
5780 CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3));
5781 CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0));
5782 CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1));
5783 CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2));
5784 CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3));
5785 }
5786
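    // Work sizes: the row-broadcast kernel uses one work-item per 4 elements
    // of dst; the general kernel uses nth work-items per src0 row, one row per
    // work-group (e.g. ne0 = 4096 gives nth = 64, so a 64x1x1 work-group).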
5787 if (bcast_row) {
5788 int n = ggml_nelements(dst)/4;
5789 size_t global_work_size[] = {(size_t)n, 1, 1};
5790 size_t local_work_size[] = {64, 1, 1};
5791
5792 size_t * local_work_size_ptr = local_work_size;
5793 if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
5794 local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
5795 }
5796
5797 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
5798 } else {
5799 unsigned int nth = MIN(64, ne0);
        size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
5801 size_t local_work_size[] = {nth, 1, 1};
5802
5803 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
5804 }
5805}
5806
5807static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
5808 GGML_ASSERT(src0);
5809 GGML_ASSERT(src0->extra);
5810 GGML_ASSERT(src1);
5811 GGML_ASSERT(src1->extra);
5812 GGML_ASSERT(dst);
5813 GGML_ASSERT(dst->extra);
5814
5815 GGML_ASSERT(src0->type == src1->type);
5816 GGML_ASSERT(src0->type == dst->type);
5817 GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
5818
5819 const int ne00 = src0->ne[0];
5820 const int ne01 = src0->ne[1];
5821 const int ne02 = src0->ne[2];
5822 const int ne03 = src0->ne[3];
5823
5824 const cl_ulong nb00 = src0->nb[0];
5825 const cl_ulong nb01 = src0->nb[1];
5826 const cl_ulong nb02 = src0->nb[2];
5827 const cl_ulong nb03 = src0->nb[3];
5828
5829 const int ne10 = src1->ne[0];
5830 const int ne11 = src1->ne[1];
5831 const int ne12 = src1->ne[2];
5832 const int ne13 = src1->ne[3];
5833
5834 const cl_ulong nb10 = src1->nb[0];
5835 const cl_ulong nb11 = src1->nb[1];
5836 const cl_ulong nb12 = src1->nb[2];
5837 const cl_ulong nb13 = src1->nb[3];
5838
5839 const int ne0 = dst->ne[0];
5840
5841 const cl_ulong nb0 = dst->nb[0];
5842 const cl_ulong nb1 = dst->nb[1];
5843 const cl_ulong nb2 = dst->nb[2];
5844 const cl_ulong nb3 = dst->nb[3];
5845
5846 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5847
5848 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
5849 ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
5850 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
5851
5852 cl_ulong offset0 = extra0->offset + src0->view_offs;
5853 cl_ulong offset1 = extra1->offset + src1->view_offs;
5854 cl_ulong offsetd = extrad->offset + dst->view_offs;
5855
5856 bool bcast_row = false;
5857 cl_kernel kernel;
5858
5859 if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
5860 GGML_ASSERT(ggml_is_contiguous(src0));
5861
5862 // src1 is a row
5863 GGML_ASSERT(ne11 == 1);
5864
5865 bcast_row = true;
5866 int ne = ne00 / 4;
5867
5868 if (src0->type == GGML_TYPE_F32) {
5869 kernel = backend_ctx->kernel_div_row;
5870 } else {
5871 kernel = backend_ctx->kernel_div_row_f16;
5872 }
5873
5874 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
5875 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
5876 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
5877 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
5878 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
5879 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
5880 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
5881 } else {
5882 if (src0->type == GGML_TYPE_F32) {
5883 kernel = backend_ctx->kernel_div;
5884 } else {
5885 kernel = backend_ctx->kernel_div_f16;
5886 }
5887
5888 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
5889 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
5890 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
5891 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
5892 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
5893 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
5894 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb00));
5895 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
5896 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
5897 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
5898 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10));
5899 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11));
5900 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
5901 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13));
5902 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
5903 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
5904 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
5905 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
5906 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0));
5907 CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
5908 CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
5909 CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
5910 CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
5911 }
5912
5913 if (bcast_row) {
5914 int n = ggml_nelements(dst)/4;
5915 size_t global_work_size[] = {(size_t)n, 1, 1};
5916 size_t local_work_size[] = {64, 1, 1};
5917
5918 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
5919 } else {
5920 unsigned int nth = MIN(64, ne0);
        size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
5922 size_t local_work_size[] = {nth, 1, 1};
5923
5924 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
5925 }
5926}
5927
5928static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
5929 GGML_ASSERT(src0);
5930 GGML_ASSERT(src0->extra);
5931 GGML_ASSERT(src1);
5932 GGML_ASSERT(src1->extra);
5933 GGML_ASSERT(dst);
5934 GGML_ASSERT(dst->extra);
5935
5936 GGML_ASSERT(src0->type == src1->type);
5937 GGML_ASSERT(src0->type == dst->type);
5938 GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
5939
5940 const int ne00 = src0->ne[0];
5941 const int ne01 = src0->ne[1];
5942 const int ne02 = src0->ne[2];
5943 const int ne03 = src0->ne[3];
5944
5945 const cl_ulong nb00 = src0->nb[0];
5946 const cl_ulong nb01 = src0->nb[1];
5947 const cl_ulong nb02 = src0->nb[2];
5948 const cl_ulong nb03 = src0->nb[3];
5949
5950 const int ne10 = src1->ne[0];
5951 const int ne11 = src1->ne[1];
5952 const int ne12 = src1->ne[2];
5953 const int ne13 = src1->ne[3];
5954
5955 const cl_ulong nb10 = src1->nb[0];
5956 const cl_ulong nb11 = src1->nb[1];
5957 const cl_ulong nb12 = src1->nb[2];
5958 const cl_ulong nb13 = src1->nb[3];
5959
5960 const int ne0 = dst->ne[0];
5961
5962 const cl_ulong nb0 = dst->nb[0];
5963 const cl_ulong nb1 = dst->nb[1];
5964 const cl_ulong nb2 = dst->nb[2];
5965 const cl_ulong nb3 = dst->nb[3];
5966
5967 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5968
5969 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
5970 ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
5971 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
5972
5973 cl_ulong offset0 = extra0->offset + src0->view_offs;
5974 cl_ulong offset1 = extra1->offset + src1->view_offs;
5975 cl_ulong offsetd = extrad->offset + dst->view_offs;
5976
5977 bool bcast_row = false;
5978 cl_kernel kernel;
5979
5980 if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
5981 GGML_ASSERT(ggml_is_contiguous(src0));
5982
5983 // src1 is a row
5984 GGML_ASSERT(ne11 == 1);
5985
5986 bcast_row = true;
5987 int ne = ne00 / 4;
5988
5989 if (src0->type == GGML_TYPE_F32) {
5990 kernel = backend_ctx->kernel_sub_row;
5991 } else {
5992 kernel = backend_ctx->kernel_sub_row_f16;
5993 }
5994
5995 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
5996 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
5997 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
5998 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
5999 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
6000 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
6001 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
6002 } else {
6003 if (src0->type == GGML_TYPE_F32) {
6004 kernel = backend_ctx->kernel_sub;
6005 } else {
6006 kernel = backend_ctx->kernel_sub_f16;
6007 }
6008
6009 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
6010 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
6011 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
6012 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
6013 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
6014 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
6015 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb00));
6016 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
6017 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
6018 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
6019 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10));
6020 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11));
6021 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
6022 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13));
6023 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
6024 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
6025 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
6026 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
6027 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0));
6028 CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
6029 CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
6030 CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
6031 CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
6032 }
6033
6034 if (bcast_row) {
6035 int n = ggml_nelements(dst)/4;
6036 size_t global_work_size[] = {(size_t)n, 1, 1};
6037 size_t local_work_size[] = {64, 1, 1};
6038
6039 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
6040 } else {
6041 unsigned int nth = MIN(64, ne0);
        size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
6043 size_t local_work_size[] = {nth, 1, 1};
6044
6045 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
6046 }
6047}
6048
6049static void ggml_cl_sqr(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6050 GGML_ASSERT(src0);
6051 GGML_ASSERT(src0->extra);
6052 GGML_ASSERT(dst);
6053 GGML_ASSERT(dst->extra);
6054 UNUSED(src1);
6055
6056 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6057
6058 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6059 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
6060
6061 cl_ulong offset0 = extra0->offset + src0->view_offs;
6062 cl_ulong offsetd = extrad->offset + dst->view_offs;
6063
6064 cl_kernel kernel;
6065
6066 // Currently assumes src0 is contiguous
6067 int n = ggml_nelements(dst);
6068 if (n % 4 == 0) {
6069 if (src0->type == GGML_TYPE_F32) {
6070 kernel = backend_ctx->kernel_sqr_cont_f32_4;
6071 } else {
6072 kernel = backend_ctx->kernel_sqr_cont_f16_4;
6073 }
6074 n /= 4;
6075 } else {
6076 if (src0->type == GGML_TYPE_F32) {
6077 kernel = backend_ctx->kernel_sqr_cont_f32;
6078 } else {
6079 kernel = backend_ctx->kernel_sqr_cont_f16;
6080 }
6081 }
6082
6083 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
6084 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
6085 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
6086 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
6087
6088 size_t global_work_size[] = {(size_t)n, 1, 1};
6089 size_t local_work_size[] = {64, 1, 1};
6090
6091 size_t * local_work_size_ptr = local_work_size;
6092 if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
        local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
6094 }
6095
6096 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
6097}
6098
6099static void ggml_cl_sqrt(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6100 GGML_ASSERT(src0);
6101 GGML_ASSERT(src0->extra);
6102 GGML_ASSERT(dst);
6103 GGML_ASSERT(dst->extra);
6104 UNUSED(src1);
6105
6106 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6107
6108 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6109 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
6110
6111 cl_ulong offset0 = extra0->offset + src0->view_offs;
6112 cl_ulong offsetd = extrad->offset + dst->view_offs;
6113
6114 cl_kernel kernel;
6115
6116 // Currently assumes src0 is contiguous
6117 int n = ggml_nelements(dst);
6118 if (n % 4 == 0) {
6119 if (src0->type == GGML_TYPE_F32) {
6120 kernel = backend_ctx->kernel_sqrt_cont_f32_4;
6121 } else {
6122 kernel = backend_ctx->kernel_sqrt_cont_f16_4;
6123 }
6124 n /= 4;
6125 } else {
6126 if (src0->type == GGML_TYPE_F32) {
6127 kernel = backend_ctx->kernel_sqrt_cont_f32;
6128 } else {
6129 kernel = backend_ctx->kernel_sqrt_cont_f16;
6130 }
6131 }
6132
6133 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
6134 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
6135 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
6136 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
6137
6138 size_t global_work_size[] = {(size_t)n, 1, 1};
6139 size_t local_work_size[] = {64, 1, 1};
6140
6141 size_t * local_work_size_ptr = local_work_size;
6142 if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
        local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
6144 }
6145
6146 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
6147}
6148
6149static void ggml_cl_mean(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6150 GGML_ASSERT(src0);
6151 GGML_ASSERT(src0->extra);
6152 GGML_ASSERT(dst);
6153 GGML_ASSERT(dst->extra);
6154 GGML_UNUSED(src1);
6155
6156 GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
6157 GGML_ASSERT(ggml_is_contiguous(src0));
6158
6159 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6160
6161 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6162 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
6163
6164 cl_ulong offset0 = extra0->offset + src0->view_offs;
6165 cl_ulong offsetd = extrad->offset + dst->view_offs;
6166
6167 const int ne00 = src0->ne[0];
6168 const int ne01 = src0->ne[1];
6169 const int ne02 = src0->ne[2];
6170 const int ne03 = src0->ne[3];
6171
6172 const cl_ulong nb01 = src0->nb[1];
6173 const cl_ulong nb02 = src0->nb[2];
6174 const cl_ulong nb03 = src0->nb[3];
6175
6176 const cl_ulong nb1 = dst->nb[1];
6177 const cl_ulong nb2 = dst->nb[2];
6178 const cl_ulong nb3 = dst->nb[3];
6179
6180 cl_kernel kernel = backend_ctx->kernel_mean_f32;
6181
6182 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
6183 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
6184 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
6185 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
6186 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
6187 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
6188 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
6189 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
6190 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
6191 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
6192 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
6193 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb1));
6194 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb2));
6195 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb3));
6196
6197 size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
6198 size_t local_work_size[] = {(size_t)64, 1, 1};
6199
6200 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
6201}
6202
6203static void ggml_cl_ssm_conv(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6204 GGML_ASSERT(src0);
6205 GGML_ASSERT(src0->extra);
6206 GGML_ASSERT(src1);
6207 GGML_ASSERT(src1->extra);
6208 GGML_ASSERT(dst);
6209 GGML_ASSERT(dst->extra);
6210
6211 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6212
6213 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6214 ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
6215 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
6216
6217 cl_ulong offset0 = extra0->offset + src0->view_offs;
6218 cl_ulong offset1 = extra1->offset + src1->view_offs;
6219 cl_ulong offsetd = extrad->offset + dst->view_offs;
6220
6221 int ne01 = src0->ne[1];
6222 cl_ulong nb00 = src0->nb[0];
6223 cl_ulong nb01 = src0->nb[1];
6224 cl_ulong nb02 = src0->nb[2];
6225
6226 int ne10 = src1->ne[0];
6227 cl_ulong nb11 = src1->nb[1];
6228
6229 int ne1 = dst->ne[1];
6230 int ne2 = dst->ne[2];
6231 cl_ulong nb0 = dst->nb[0];
6232 cl_ulong nb1 = dst->nb[1];
6233 cl_ulong nb2 = dst->nb[2];
6234
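    // Prefer the 4-wide vectorized kernel when ne10 (the length of a src1 row)
    // is a multiple of 4.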
6235 cl_kernel kernel = backend_ctx->kernel_ssm_conv_f32_f32;
6236
6237 if (ne10 % 4 == 0) {
6238 kernel = backend_ctx->kernel_ssm_conv_f32_f32_4;
6239 }
6240
6241 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
6242 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
6243 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
6244 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
6245 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
6246 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
6247 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb00));
6248 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
6249 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
6250 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10));
6251 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb11));
6252 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb0));
6253 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb1));
6254 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb2));
6255
6256 size_t global_work_size[] = {(size_t)ne01, (size_t)ne1, (size_t)ne2};
6257 size_t local_work_size[] = {64, 1, 1};
6258
6259 size_t * local_work_size_ptr = local_work_size;
6260 if (ne01 % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
        local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
6262 }
6263
6264 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
6265}
6266
6267static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6268 GGML_ASSERT(src0);
6269 GGML_ASSERT(src0->extra);
6270 GGML_ASSERT(dst);
6271 GGML_ASSERT(dst->extra);
6272
6273 UNUSED(src1);
6274
6275 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6276
6277 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6278 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
6279
6280 cl_ulong offset0 = extra0->offset + src0->view_offs;
6281 cl_ulong offsetd = extrad->offset + dst->view_offs;
6282
6283 cl_kernel kernel;
6284
6285 int n = ggml_nelements(dst);
6286
6287 if (n % 4 == 0) {
6288 kernel = backend_ctx->kernel_gelu_4;
6289 n /= 4;
6290 } else {
6291 kernel = backend_ctx->kernel_gelu;
6292 }
6293
6294 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
6295 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
6296 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
6297 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
6298
6299 size_t global_work_size[] = {(size_t)n, 1, 1};
6300 size_t local_work_size[] = {64, 1, 1};
6301
6302 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
6303}
6304
6305static void ggml_cl_gelu_erf(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6306 GGML_ASSERT(src0);
6307 GGML_ASSERT(src0->extra);
6308 GGML_ASSERT(dst);
6309 GGML_ASSERT(dst->extra);
6310
6311 UNUSED(src1);
6312
6313 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6314
6315 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6316 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
6317
6318 cl_ulong offset0 = extra0->offset + src0->view_offs;
6319 cl_ulong offsetd = extrad->offset + dst->view_offs;
6320
6321 cl_kernel kernel;
6322
6323 int n = ggml_nelements(dst);
6324
6325 if (n % 4 == 0) {
6326 kernel = backend_ctx->kernel_gelu_erf_4;
6327 n /= 4;
6328 } else {
6329 kernel = backend_ctx->kernel_gelu_erf;
6330 }
6331
6332 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
6333 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
6334 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
6335 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
6336
6337 size_t global_work_size[] = {(size_t)n, 1, 1};
6338 size_t local_work_size[] = {64, 1, 1};
6339
6340 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
6341}
6342
6343static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6344 GGML_ASSERT(src0);
6345 GGML_ASSERT(src0->extra);
6346 GGML_ASSERT(dst);
6347 GGML_ASSERT(dst->extra);
6348
6349 UNUSED(src1);
6350
6351 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6352
6353 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6354 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
6355
6356 cl_ulong offset0 = extra0->offset + src0->view_offs;
6357 cl_ulong offsetd = extrad->offset + dst->view_offs;
6358
6359 cl_kernel kernel;
6360
6361 int n = ggml_nelements(dst);
6362
6363 if (n % 4 == 0) {
6364 kernel = backend_ctx->kernel_gelu_quick_4;
6365 n /= 4;
6366 } else {
6367 kernel = backend_ctx->kernel_gelu_quick;
6368 }
6369
6370 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
6371 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
6372 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
6373 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
6374
6375 size_t global_work_size[] = {(size_t)n, 1, 1};
6376 size_t local_work_size[] = {64, 1, 1};
6377
6378 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
6379}
6380
6381static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6382 GGML_ASSERT(src0);
6383 GGML_ASSERT(src0->extra);
6384 GGML_ASSERT(dst);
6385 GGML_ASSERT(dst->extra);
6386
6387 UNUSED(src1);
6388
6389 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6390
6391 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6392 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
6393
6394 cl_ulong offset0 = extra0->offset + src0->view_offs;
6395 cl_ulong offsetd = extrad->offset + dst->view_offs;
6396
6397 cl_kernel kernel;
6398
6399 int n = ggml_nelements(dst);
6400
6401 if (n % 4 == 0) {
6402 kernel = backend_ctx->kernel_silu_4;
6403 n /= 4;
6404 } else {
6405 kernel = backend_ctx->kernel_silu;
6406 }
6407
6408 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
6409 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
6410 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
6411 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
6412
6413 size_t global_work_size[] = {(size_t)n, 1, 1};
6414 size_t local_work_size[] = {64, 1, 1};
6415
6416 size_t * local_work_size_ptr = local_work_size;
6417 if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
6418 local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
6419 }
6420
6421 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
6422}
6423
6424static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6425 GGML_ASSERT(src0);
6426 GGML_ASSERT(src0->extra);
6427 GGML_ASSERT(dst);
6428 GGML_ASSERT(dst->extra);
6429
6430 UNUSED(src1);
6431
6432 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6433
6434 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6435 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
6436
6437 cl_ulong offset0 = extra0->offset + src0->view_offs;
6438 cl_ulong offsetd = extrad->offset + dst->view_offs;
6439
6440 cl_kernel kernel = backend_ctx->kernel_relu;
6441
6442 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
6443 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
6444 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
6445 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
6446
6447 const int64_t n = ggml_nelements(dst);
6448
6449 size_t global_work_size[] = {(size_t)n, 1, 1};
6450 size_t local_work_size[] = {64, 1, 1};
6451
6452 size_t * local_work_size_ptr = local_work_size;
6453 if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
6454 local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
6455 }
6456
6457 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
6458}
6459
6460static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6461 GGML_ASSERT(src0);
6462 GGML_ASSERT(src0->extra);
6463 GGML_ASSERT(dst);
6464 GGML_ASSERT(dst->extra);
6465
6466 UNUSED(src1);
6467
6468 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6469
6470 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6471 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
6472
6473 cl_ulong offset0 = extra0->offset + src0->view_offs;
6474 cl_ulong offsetd = extrad->offset + dst->view_offs;
6475
6476 cl_kernel kernel;
6477 if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
6478 kernel = backend_ctx->kernel_sigmoid_f32;
6479 } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
6480 kernel = backend_ctx->kernel_sigmoid_f16;
6481 } else {
        GGML_ASSERT(false && "Unsupported data types for sigmoid (input and output must both be f32 or both be f16)");
6483 }
6484
6485 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
6486 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
6487 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
6488 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
6489
6490 const int64_t n = ggml_nelements(dst);
6491
6492 size_t global_work_size[] = {(size_t)n, 1, 1};
6493 size_t local_work_size[] = {64, 1, 1};
6494
6495 size_t * local_work_size_ptr = local_work_size;
6496 if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
6497 local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
6498 }
6499
6500 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
6501}
6502
6503static void ggml_cl_tri(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6504 GGML_ASSERT(src0);
6505 GGML_ASSERT(src0->extra);
6506 GGML_ASSERT(dst);
6507 GGML_ASSERT(dst->extra);
6508
6509 UNUSED(src1);
6510
6511 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6512
6513 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6514 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
6515
6516 cl_ulong offset0 = extra0->offset + src0->view_offs;
6517 cl_ulong offsetd = extrad->offset + dst->view_offs;
6518
6519 const int tri_type = ggml_get_op_params_i32(dst, 0);
    const int64_t n = ggml_nelements(dst);
    const int n_i32 = (int) n; // the kernel takes the element count as a 32-bit int
6521 const int ne0 = dst->ne[0];
6522 const int ne1 = dst->ne[1];
6523
6524 cl_kernel kernel = backend_ctx->kernel_tri;
6525
6526 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
6527 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
6528 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
6529 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &n_i32));
6531 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne0));
6532 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne1));
6533 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &tri_type));
6534
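    // Round the global size up to a multiple of the work-group size so the
    // launch stays uniform; e.g. n = 1000 with a 256-wide work-group launches
    // 1024 work-items, so the kernel guards against ids >= n (which is why n
    // is passed as an argument).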
6535 size_t local_work_size[1] = { 256 };
6536 size_t global_work_size[1] = { ((size_t)n + local_work_size[0] - 1) / local_work_size[0] * local_work_size[0] };
6537
6538 backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, dst);
6539}
6540
6541static void ggml_cl_fill(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6542 GGML_ASSERT(dst);
6543 GGML_ASSERT(dst->extra);
6544
6545 UNUSED(src0);
6546 UNUSED(src1);
6547
6548 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6549
6550 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
6551 cl_ulong offsetd = extrad->offset + dst->view_offs;
6552
    // op_params[0] holds the fill value as raw float bits
    float v = 0.0f;
    memcpy(&v, dst->op_params, sizeof(float));

    const int64_t n = ggml_nelements(dst);
    const int n_i32 = (int) n; // the kernel takes the element count as a 32-bit int
6557
6558 cl_kernel kernel = backend_ctx->kernel_fill;
6559
6560 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extrad->data_device));
6561 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offsetd));
6562 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(float), &v));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &n_i32));
6564
6565 size_t local_work_size[1] = { 256 };
6566 size_t global_work_size[1] = { ((size_t)n + local_work_size[0] - 1) / local_work_size[0] * local_work_size[0] };
6567
6568 backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, dst);
6569}
6570
6571static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6572 GGML_ASSERT(src0);
6573 GGML_ASSERT(src0->extra);
6574 GGML_ASSERT(dst);
6575 GGML_ASSERT(dst->extra);
6576
6577 UNUSED(src1);
6578
6579 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6580
6581 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6582 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
6583
6584 cl_ulong offset0 = extra0->offset + src0->view_offs;
6585 cl_ulong offsetd = extrad->offset + dst->view_offs;
6586
6587 float min;
6588 float max;
6589 memcpy(&min, ((int32_t *) dst->op_params) + 0, sizeof(float));
6590 memcpy(&max, ((int32_t *) dst->op_params) + 1, sizeof(float));
6591
6592 cl_kernel kernel = backend_ctx->kernel_clamp;
6593
6594 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
6595 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
6596 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
6597 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
6598 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float), &min));
6599 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(float), &max));
6600
6601 const int64_t n = ggml_nelements(dst);
6602
6603 size_t global_work_size[] = {(size_t)n, 1, 1};
6604 size_t local_work_size[] = {64, 1, 1};
6605
6606 size_t * local_work_size_ptr = local_work_size;
6607 if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
6608 local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
6609 }
6610
6611 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
6612}
6613
6614static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6615 GGML_ASSERT(src0);
6616 GGML_ASSERT(src0->extra);
6617 GGML_ASSERT(dst);
6618 GGML_ASSERT(dst->extra);
6619
6620 UNUSED(src1);
6621
6622 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6623
6624 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6625 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
6626
6627 cl_ulong offset0 = extra0->offset + src0->view_offs;
6628 cl_ulong offsetd = extrad->offset + dst->view_offs;
6629
6630 float eps;
6631 memcpy(&eps, dst->op_params, sizeof(float));
6632
6633 const int ne00 = src0 ? src0->ne[0] : 0;
6634 const int ne01 = src0 ? src0->ne[1] : 0;
6635 const int ne02 = src0 ? src0->ne[2] : 0;
6636 const int ne03 = src0 ? src0->ne[3] : 0;
6637
6638 const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
6639 const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
6640 const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
6641
6642 const int nth = MIN(64, ne00);
6643
6644 cl_kernel kernel = backend_ctx->kernel_norm;
6645
6646 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
6647 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
6648 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
6649 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
6650 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
6651 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
6652 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
6653 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
6654 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
6655 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
6656 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
6657 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &eps));
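    // Passing NULL with a non-zero size allocates __local scratch memory for
    // the kernel: here, one float per work-item for the row reduction.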
6658 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth, NULL));
6659
6660 size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
6661 size_t local_work_size[] = {(size_t)nth, 1, 1};
6662
6663 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
6664}
6665
6666static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6667 GGML_ASSERT(src0);
6668 GGML_ASSERT(src0->extra);
6669 GGML_ASSERT(dst);
6670 GGML_ASSERT(dst->extra);
6671
6672 UNUSED(src1);
6673
6674 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6675
6676 //ggml_backend_opencl_device_context * dev_ctx =
6677 // (ggml_backend_opencl_device_context *)backend->device->context;
6678
6679 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6680 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
6681
6682 cl_ulong offset0 = extra0->offset + src0->view_offs;
6683 cl_ulong offsetd = extrad->offset + dst->view_offs;
6684
6685 float eps;
6686 memcpy(&eps, dst->op_params, sizeof(float));
6687
6688 const int ne00 = src0 ? src0->ne[0] : 0;
6689 const int ne01 = src0 ? src0->ne[1] : 0;
6690 const int ne02 = src0 ? src0->ne[2] : 0;
6691 const int ne03 = src0 ? src0->ne[3] : 0;
6692
6693 const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
6694 const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
6695 const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
6696
6697 GGML_ASSERT(ne00 % 4 == 0);
6698
6699 const int nth = MIN(64, ne00);
6700
6701 size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
6702 size_t local_work_size[] = {(size_t)nth, 1, 1};
6703
6704 cl_kernel kernel = backend_ctx->kernel_rms_norm;
6705
    // Note: this kernel declares its local memory in the kernel args, and the
    // size depends on the subgroup size. Querying the subgroup size via
    // clGetKernelSubGroupInfo (commented out below) requires OpenCL 2.1 and
    // above, so for now we use a fixed per-vendor subgroup size to keep
    // OpenCL 2.0 support simple.
6710 size_t sgs;
6711 //CL_CHECK(clGetKernelSubGroupInfo(kernel, dev_ctx->device,
6712 // CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
6713 // sizeof(local_work_size), local_work_size,
6714 // sizeof(size_t), &sgs, NULL));
6715 if (backend_ctx->gpu_family == ADRENO) {
6716 sgs = 64;
6717 } else if (backend_ctx->gpu_family == INTEL) {
6718 sgs = 32;
6719 } else {
6720 GGML_ASSERT(false && "Unsupported GPU");
6721 }
6722
6723 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
6724 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
6725 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
6726 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
6727 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
6728 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
6729 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
6730 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
6731 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
6732 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
6733 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
6734 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &eps));
    // Local scratch memory for per-subgroup partial sums (nth/sgs floats, one per subgroup).
6736 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL));
6737
6738 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
6739}
6740
6741static void ggml_opencl_op_rms_norm_fused(ggml_backend_t backend, ggml_tensor * rms_norm_tensor, ggml_tensor * mul_tensor) {
6742 GGML_ASSERT(mul_tensor);
6743 GGML_ASSERT(rms_norm_tensor);
6744
    // src0 is the input of rms_norm; src1 is the other operand of mul (the one that is not the rms_norm result)
6746 const ggml_tensor * src0 = rms_norm_tensor->src[0];
6747 const ggml_tensor * src1;
6748 if (mul_tensor->src[0] == rms_norm_tensor) {
6749 src1 = mul_tensor->src[1];
6750 } else if (mul_tensor->src[1] == rms_norm_tensor) {
6751 src1 = mul_tensor->src[0];
6752 } else {
6753 GGML_ASSERT(false && "Invalid args for rms_norm and mul");
6754 }
6755 const ggml_tensor * dst = mul_tensor;
6756
6757 GGML_ASSERT(src0);
6758 GGML_ASSERT(src0->extra);
6759 GGML_ASSERT(src1);
6760 GGML_ASSERT(src1->extra);
6761 GGML_ASSERT(dst);
6762 GGML_ASSERT(dst->extra);
6763
6764 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6765 ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
6766 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
6767
6768 cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offset1 = extra1->offset + src1->view_offs;
6770 cl_ulong offsetd = extrad->offset + dst->view_offs;
6771
6772 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6773
6774 float eps;
6775 memcpy(&eps, rms_norm_tensor->op_params, sizeof(float));
6776
6777 const int ne00 = src0->ne[0];
6778 const int ne01 = src0->ne[1];
6779 const int ne02 = src0->ne[2];
6780 const int ne03 = src0->ne[3];
6781
6782 const cl_ulong nb01 = src0->nb[1];
6783 const cl_ulong nb02 = src0->nb[2];
6784 const cl_ulong nb03 = src0->nb[3];
6785
6786 const int ne10 = src1->ne[0];
6787 const int ne11 = src1->ne[1];
6788 const int ne12 = src1->ne[2];
6789 const int ne13 = src1->ne[3];
6790
6791 const cl_ulong nb11 = src1->nb[1];
6792 const cl_ulong nb12 = src1->nb[2];
6793 const cl_ulong nb13 = src1->nb[3];
6794
6795 const cl_ulong nb1 = dst->nb[1];
6796 const cl_ulong nb2 = dst->nb[2];
6797 const cl_ulong nb3 = dst->nb[3];
6798
6799 GGML_ASSERT(ne00 % 4 == 0);
6800
6801 size_t sgs;
6802 if (backend_ctx->gpu_family == ADRENO) {
6803 sgs = 64;
6804 } else if (backend_ctx->gpu_family == INTEL) {
6805 sgs = 32;
6806 } else {
6807 GGML_ASSERT(false && "Unsupported GPU");
6808 }
6809
6810 cl_kernel kernel = backend_ctx->kernel_rms_norm_mul;
6811
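    // Heuristic: start at one subgroup and double the work-group size until it
    // covers a full row (ne00) or reaches the kernel's work-group limit, then
    // clamp to both.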
6812 int nth = sgs;
6813 int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
6814 while (nth < ne00 && nth < max_workgroup_size) {
6815 nth *= 2;
6816 }
6817 nth = MIN(nth, max_workgroup_size);
6818 nth = MIN(nth, ne00);
6819
6820 size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
6821 size_t local_work_size[] = {(size_t)nth, 1, 1};
6822
6823 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
6824 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
6825 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
6826 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
6827 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
6828 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
6829 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
6830 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
6831 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
6832 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03));
6833 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
6834 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
6835 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb03));
6836 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10));
6837 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11));
6838 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12));
6839 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne13));
6840 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11));
6841 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
6842 CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
6843 CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
6844 CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
6845 CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
6846 CL_CHECK(clSetKernelArg(kernel, 23, sizeof(float), &eps));
6847 CL_CHECK(clSetKernelArg(kernel, 24, sizeof(float)*sgs, NULL));
6848
6849 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
6850}
6851
6852static void ggml_opencl_op_norm_fused(ggml_backend_t backend, ggml_tensor * norm_tensor, ggml_tensor * mul_tensor, ggml_tensor * add_tensor) {
6853 GGML_ASSERT(norm_tensor && mul_tensor && add_tensor);
6854
6855 const ggml_tensor * src0 = norm_tensor->src[0];
6856 const ggml_tensor * src1 = mul_tensor->src[0] == norm_tensor ? mul_tensor->src[1] : mul_tensor->src[0];
6857 const ggml_tensor * src2 = add_tensor->src[0] == mul_tensor ? add_tensor->src[1] : add_tensor->src[0];
6858 const ggml_tensor * dst = add_tensor;
6859
6860 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6861 ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
6862 ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
6863 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
6864
6865 cl_ulong offset0 = extra0->offset + src0->view_offs;
6866 cl_ulong offset1 = extra1->offset + src1->view_offs;
6867 cl_ulong offset2 = extra2->offset + src2->view_offs;
6868 cl_ulong offsetd = extrad->offset + dst->view_offs;
6869
6870 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6871
6872 float eps;
6873 memcpy(&eps, norm_tensor->op_params, sizeof(float));
6874
6875 const int ne00 = src0->ne[0], ne01 = src0->ne[1], ne02 = src0->ne[2], ne03 = src0->ne[3];
6876 const cl_ulong nb01 = src0->nb[1], nb02 = src0->nb[2], nb03 = src0->nb[3];
6877 const int ne10 = src1->ne[0], ne11 = src1->ne[1], ne12 = src1->ne[2], ne13 = src1->ne[3];
6878 const cl_ulong nb11 = src1->nb[1], nb12 = src1->nb[2], nb13 = src1->nb[3];
6879 const int ne20 = src2->ne[0], ne21 = src2->ne[1], ne22 = src2->ne[2], ne23 = src2->ne[3];
6880 const cl_ulong nb21 = src2->nb[1], nb22 = src2->nb[2], nb23 = src2->nb[3];
6881 const cl_ulong nbd1 = dst->nb[1], nbd2 = dst->nb[2], nbd3 = dst->nb[3];
6882
6883 size_t sgs;
6884 if (backend_ctx->gpu_family == ADRENO) sgs = 64;
6885 else if (backend_ctx->gpu_family == INTEL) sgs = 32;
6886 else GGML_ASSERT(false && "Unsupported GPU");
6887
6888 cl_kernel kernel = backend_ctx->kernel_norm_mul_add;
6889
6890 int nth = sgs;
6891 int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
6892 while (nth < ne00/4 && nth < max_workgroup_size) nth *= 2;
6893 nth = MIN(nth, max_workgroup_size);
6894 nth = MIN(nth, ne00/4);
6895
6896 size_t gws[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
6897 size_t lws[] = {(size_t)nth, 1, 1};
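    // One cl_float2 of local scratch per subgroup (set as arg 33 below),
    // presumably holding each subgroup's partial sum and sum of squares for
    // the mean/variance reduction.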
6898 size_t num_subgroups = (nth + sgs - 1) / sgs;
6899
6900 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
6901 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
6902 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
6903 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
6904 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
6905 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
6906 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
6907 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
6908 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
6909 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
6910 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
6911 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne03));
6912 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb01));
6913 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb02));
6914 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb03));
6915 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne10));
6916 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne11));
6917 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne12));
6918 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne13));
6919 CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11));
6920 CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12));
6921 CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13));
6922 CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne20));
6923 CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne21));
6924 CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne22));
6925 CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne23));
6926 CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb21));
6927 CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb22));
6928 CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb23));
6929 CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nbd1));
6930 CL_CHECK(clSetKernelArg(kernel, 30, sizeof(cl_ulong), &nbd2));
6931 CL_CHECK(clSetKernelArg(kernel, 31, sizeof(cl_ulong), &nbd3));
6932 CL_CHECK(clSetKernelArg(kernel, 32, sizeof(float), &eps));
6933 CL_CHECK(clSetKernelArg(kernel, 33, sizeof(cl_float2) * num_subgroups, NULL));
6934
6935 backend_ctx->enqueue_ndrange_kernel(kernel, 3, gws, lws, dst);
6936}
6937
6938static void ggml_opencl_op_group_norm_fused(ggml_backend_t backend, ggml_tensor * gn_tensor, ggml_tensor * mul_tensor, ggml_tensor * add_tensor) {
6939 GGML_ASSERT(gn_tensor && mul_tensor && add_tensor);
6940
6941 const ggml_tensor * src0 = gn_tensor->src[0];
6942 const ggml_tensor * src1 = mul_tensor->src[0] == gn_tensor ? mul_tensor->src[1] : mul_tensor->src[0];
6943 const ggml_tensor * src2 = add_tensor->src[0] == mul_tensor ? add_tensor->src[1] : add_tensor->src[0];
6944 const ggml_tensor * dst = add_tensor;
6945
6946 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6947 ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
6948 ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
6949 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
6950
6951 cl_ulong offset0 = extra0->offset + src0->view_offs;
6952 cl_ulong offset1 = extra1->offset + src1->view_offs;
6953 cl_ulong offset2 = extra2->offset + src2->view_offs;
6954 cl_ulong offsetd = extrad->offset + dst->view_offs;
6955
6956 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6957
6958 int groups;
6959 float eps;
6960 memcpy(&groups, gn_tensor->op_params, sizeof(int));
6961 memcpy(&eps, (char *)gn_tensor->op_params + sizeof(int), sizeof(float));
6962
6963 cl_kernel kernel = backend_ctx->kernel_group_norm_mul_add;
6964 int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
6965 int ne = ggml_nelements(src0);
6966 int group_size = ne / groups;
6967
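    // One work-group per group: lws covers as much of one group as the
    // kernel's work-group limit allows; gws launches `groups` such work-groups.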
6968 size_t lws[] = { (size_t)MIN(max_workgroup_size, group_size) };
6969 size_t gws[] = { (size_t)groups * lws[0] };
6970
6971 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
6972 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
6973 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
6974 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
6975 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
6976 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
6977 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
6978 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
6979 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne));
6980 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &group_size));
6981 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(float), &eps));
6982
6983 backend_ctx->enqueue_ndrange_kernel(kernel, 1, gws, lws, dst);
6984}
6985
6986static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
6987 GGML_ASSERT(src0);
6988 GGML_ASSERT(src0->extra);
6989 GGML_ASSERT(dst);
6990 GGML_ASSERT(dst->extra);
6991
6992 UNUSED(src1);
6993
6994 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
6995
6996 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
6997 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
6998
6999 cl_ulong offset0 = extra0->offset + src0->view_offs;
7000 cl_ulong offsetd = extrad->offset + dst->view_offs;
7001
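    // op_params layout: [0] = number of groups (int32), [1] = eps (float,
    // read from the raw bits). group_size rounds ne[2] up so that every
    // group covers whole ne0 x ne1 planes.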
7002 int32_t n_groups = ((const int32_t *) dst->op_params)[0];
7003 int32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + n_groups - 1) / n_groups);
7004 float eps = ((const float *) dst->op_params)[1];
7005
7006 const int ne00 = src0->ne[0];
7007 const int ne01 = src0->ne[1];
7008 const int ne02 = src0->ne[2];
7009 const int ne = ne00*ne01*ne02;
7010
7011 cl_kernel kernel = backend_ctx->kernel_group_norm;
7012
7013 size_t sgs = 64;
7014 if (backend_ctx->gpu_family == ADRENO) {
7015 sgs = 64;
7016 } else if (backend_ctx->gpu_family == INTEL) {
7017 sgs = 32;
7018 } else {
7019 GGML_ASSERT(false && "Unsupported GPU");
7020 }
7021
7022 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
7023 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
7024 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
7025 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
7026 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne));
7027 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &group_size));
7028 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps));
7029
7030 size_t global_work_size[] = {(size_t)n_groups*sgs, 1, 1};
7031 size_t local_work_size[] = {(size_t)sgs, 1, 1};
7032
7033 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
7034}
7035
7036static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7037 GGML_ASSERT(src0);
7038 GGML_ASSERT(src0->extra);
7039 GGML_ASSERT(dst);
7040 GGML_ASSERT(dst->extra);
7041
7042 UNUSED(src1);
7043
7044 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
7045
7046 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
7047 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
7048
7049 cl_ulong offset0 = extra0->offset + src0->view_offs;
7050 cl_ulong offsetd = extrad->offset + dst->view_offs;
7051
7052 const int ne00 = src0->ne[0];
7053 const int ne01 = src0->ne[1];
7054 const int ne02 = src0->ne[2];
7055 const int ne03 = src0->ne[3];
7056
7057 const cl_ulong nb00 = src0->nb[0];
7058 const cl_ulong nb01 = src0->nb[1];
7059 const cl_ulong nb02 = src0->nb[2];
7060 const cl_ulong nb03 = src0->nb[3];
7061
7062 const cl_ulong nb0 = dst->nb[0];
7063 const cl_ulong nb1 = dst->nb[1];
7064 const cl_ulong nb2 = dst->nb[2];
7065 const cl_ulong nb3 = dst->nb[3];
7066
7067 cl_kernel kernel;
7068
7069 if (ggml_is_contiguous(src0)) {
7070 // Handle contiguous input
7071 int n = ggml_nelements(dst);
7072 if (n % 4 == 0) {
7073 if (src0->type == GGML_TYPE_F32) {
7074 kernel = backend_ctx->kernel_tanh_f32_4;
7075 } else {
7076 kernel = backend_ctx->kernel_tanh_f16_4;
7077 }
7078 n /= 4;
7079 } else {
7080 if (src0->type == GGML_TYPE_F32) {
7081 kernel = backend_ctx->kernel_tanh_f32;
7082 } else {
7083 kernel = backend_ctx->kernel_tanh_f16;
7084 }
7085 }
7086
7087 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
7088 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
7089 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
7090 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
7091
7092 size_t global_work_size[] = {(size_t)n, 1, 1};
7093 size_t local_work_size[] = {64, 1, 1};
7094
7095 size_t * local_work_size_ptr = local_work_size;
7096 if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
7097 local_work_size_ptr = nullptr;
7098 }
7099
7100 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
7101 } else {
7102 // Handle non-contiguous input
7103 if (src0->type == GGML_TYPE_F32) {
7104 kernel = backend_ctx->kernel_tanh_f32_nc;
7105 } else {
7106 kernel = backend_ctx->kernel_tanh_f16_nc;
7107 }
7108
7109 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
7110 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
7111 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
7112 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
7113 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
7114 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb00));
7115 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb01));
7116 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb02));
7117 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb03));
7118 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb0));
7119 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb1));
7120 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb2));
7121 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb3));
7122
7123 int nth = 64;
7124
7125 size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
7126 size_t local_work_size[] = {(size_t)nth, 1, 1};
7127
7128 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
7129 }
7130}
7131
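// Dispatch for expm1(x) = exp(x) - 1 on F32/F16 tensors. A single strided N-D
// kernel handles both contiguous and non-contiguous layouts; the launch covers
// the first three destination dims, and the kernel is assumed to iterate the
// outermost (ne13) dimension itself since it receives all four extents.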
7132static void ggml_cl_expm1(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7133 GGML_ASSERT(src0);
7134 GGML_ASSERT(src0->extra);
7135 GGML_ASSERT(dst);
7136 GGML_ASSERT(dst->extra);
7137
7138 UNUSED(src1);
7139
7140 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
7141
7142 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
7143 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
7144
7145 cl_ulong offset0_abs = extra0->offset + src0->view_offs;
7146 cl_ulong offsetd_abs = extrad->offset + dst->view_offs;
7147
7148 cl_kernel kernel;
7149 if (dst->type == GGML_TYPE_F32) {
7150 kernel = backend_ctx->kernel_expm1_f32_nd;
7151 } else if (dst->type == GGML_TYPE_F16) {
7152 kernel = backend_ctx->kernel_expm1_f16_nd;
7153 } else {
7154 GGML_ASSERT(false && "Unsupported type for ggml_cl_expm1");
7155 }
7156 GGML_ASSERT(kernel != nullptr);
7157
7158 const int ne00 = src0->ne[0];
7159 const int ne01 = src0->ne[1];
7160 const int ne02 = src0->ne[2];
7161 const int ne03 = src0->ne[3];
7162
7163 const cl_ulong nb00 = src0->nb[0];
7164 const cl_ulong nb01 = src0->nb[1];
7165 const cl_ulong nb02 = src0->nb[2];
7166 const cl_ulong nb03 = src0->nb[3];
7167
7168 const int ne10 = dst->ne[0];
7169 const int ne11 = dst->ne[1];
7170 const int ne12 = dst->ne[2];
7171 const int ne13 = dst->ne[3];
7172
7173 const cl_ulong nb10 = dst->nb[0];
7174 const cl_ulong nb11 = dst->nb[1];
7175 const cl_ulong nb12 = dst->nb[2];
7176 const cl_ulong nb13 = dst->nb[3];
7177
7178 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
7179 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs));
7180 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
7181 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs));
7182
7183 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
7184 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
7185 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
7186 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
7187 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
7188 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
7189 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
7190 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));
7191
7192 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10));
7193 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11));
7194 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12));
7195 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13));
7196 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10));
7197 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11));
7198 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12));
7199 CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13));
7200
7201 size_t global_work_size[3];
7202 if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
7203 return;
7204 }
7205 global_work_size[0] = (size_t)ne10;
7206 global_work_size[1] = (size_t)ne11;
7207 global_work_size[2] = (size_t)ne12;
7208
    // ne10/ne11/ne12 are >= 1 here (the zero-element case returned above)
    size_t lws0 = 16, lws1 = 4, lws2 = 1;
    if (ne10 < 16) lws0 = ne10;
    if (ne11 < 4)  lws1 = ne11;

    // halve dims until the workgroup is within a conservative 256 work-item limit
    while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
    while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
    while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;

    size_t local_work_size[] = {lws0, lws1, lws2};

    size_t * local_work_size_ptr = local_work_size;
    if (!backend_ctx->non_uniform_workgroups) {
        if (global_work_size[0] % local_work_size[0] != 0 ||
            global_work_size[1] % local_work_size[1] != 0 ||
            global_work_size[2] % local_work_size[2] != 0) {
            local_work_size_ptr = NULL;
        }
    }
7230
7231 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
7232}
7233
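// Dispatch for softplus(x) = log(1 + exp(x)). Same strided N-D launch scheme
// and workgroup-size heuristic as ggml_cl_expm1 above.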
7234static void ggml_cl_softplus(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7235 GGML_ASSERT(src0);
7236 GGML_ASSERT(src0->extra);
7237 GGML_ASSERT(dst);
7238 GGML_ASSERT(dst->extra);
7239
7240 UNUSED(src1);
7241
7242 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
7243
7244 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
7245 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
7246
7247 cl_ulong offset0_abs = extra0->offset + src0->view_offs;
7248 cl_ulong offsetd_abs = extrad->offset + dst->view_offs;
7249
7250 cl_kernel kernel;
7251 if (dst->type == GGML_TYPE_F32) {
7252 kernel = backend_ctx->kernel_softplus_f32_nd;
7253 } else if (dst->type == GGML_TYPE_F16) {
7254 kernel = backend_ctx->kernel_softplus_f16_nd;
7255 } else {
7256 GGML_ASSERT(false && "Unsupported type for ggml_cl_softplus");
7257 }
7258 GGML_ASSERT(kernel != nullptr);
7259
7260 const int ne00 = src0->ne[0];
7261 const int ne01 = src0->ne[1];
7262 const int ne02 = src0->ne[2];
7263 const int ne03 = src0->ne[3];
7264
7265 const cl_ulong nb00 = src0->nb[0];
7266 const cl_ulong nb01 = src0->nb[1];
7267 const cl_ulong nb02 = src0->nb[2];
7268 const cl_ulong nb03 = src0->nb[3];
7269
7270 const int ne10 = dst->ne[0];
7271 const int ne11 = dst->ne[1];
7272 const int ne12 = dst->ne[2];
7273 const int ne13 = dst->ne[3];
7274
7275 const cl_ulong nb10 = dst->nb[0];
7276 const cl_ulong nb11 = dst->nb[1];
7277 const cl_ulong nb12 = dst->nb[2];
7278 const cl_ulong nb13 = dst->nb[3];
7279
7280 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
7281 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs));
7282 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
7283 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs));
7284
7285 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
7286 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
7287 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
7288 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
7289 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
7290 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
7291 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
7292 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));
7293
7294 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10));
7295 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11));
7296 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12));
7297 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13));
7298 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10));
7299 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11));
7300 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12));
7301 CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13));
7302
7303 size_t global_work_size[3];
7304 if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
7305 return;
7306 }
7307 global_work_size[0] = (size_t)ne10;
7308 global_work_size[1] = (size_t)ne11;
7309 global_work_size[2] = (size_t)ne12;
7310
    // ne10/ne11/ne12 are >= 1 here (the zero-element case returned above)
    size_t lws0 = 16, lws1 = 4, lws2 = 1;
    if (ne10 < 16) lws0 = ne10;
    if (ne11 < 4)  lws1 = ne11;

    // halve dims until the workgroup is within a conservative 256 work-item limit
    while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
    while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
    while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;

    size_t local_work_size[] = {lws0, lws1, lws2};

    size_t * local_work_size_ptr = local_work_size;
    if (!backend_ctx->non_uniform_workgroups) {
        if (global_work_size[0] % local_work_size[0] != 0 ||
            global_work_size[1] % local_work_size[1] != 0 ||
            global_work_size[2] % local_work_size[2] != 0) {
            local_work_size_ptr = NULL;
        }
    }
7332
7333 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
7334}
7335
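// Dispatch for GGML_OP_REPEAT (F32 kernel; dst->type must match src0->type).
// src1_shape_def only defines the target shape and is otherwise unused. One
// workgroup of nth work-items is launched per destination row.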
7336static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) {
7337 GGML_ASSERT(src0);
7338 GGML_ASSERT(src0->extra);
7339 GGML_ASSERT(dst);
7340 GGML_ASSERT(dst->extra);
7341 GGML_ASSERT(dst->type == src0->type);
7342
7343 UNUSED(src1_shape_def);
7344
7345 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
7346
7347 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
7348 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
7349
7350 cl_ulong offset0 = extra0->offset + src0->view_offs;
7351 cl_ulong offsetd = extrad->offset + dst->view_offs;
7352
7353 const int ne00 = src0->ne[0];
7354 const int ne01 = src0->ne[1];
7355 const int ne02 = src0->ne[2];
7356 const int ne03 = src0->ne[3];
7357
7358 const cl_ulong nb00 = src0->nb[0];
7359 const cl_ulong nb01 = src0->nb[1];
7360 const cl_ulong nb02 = src0->nb[2];
7361 const cl_ulong nb03 = src0->nb[3];
7362
7363 const int ne0 = dst->ne[0];
7364 const int ne1 = dst->ne[1];
7365 const int ne2 = dst->ne[2];
7366 const int ne3 = dst->ne[3];
7367
7368 const cl_ulong nb0 = dst->nb[0];
7369 const cl_ulong nb1 = dst->nb[1];
7370 const cl_ulong nb2 = dst->nb[2];
7371 const cl_ulong nb3 = dst->nb[3];
7372
7373 cl_kernel kernel = backend_ctx->kernel_repeat_f32;
7374
7375 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
7376 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
7377 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
7378 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
7379 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
7380 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
7381 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
7382 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
7383 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
7384 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
7385 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
7386 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
7387 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne0));
7388 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb0));
7389 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb1));
7390 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb2));
7391 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb3));
7392
7393 int nth = 64;
7394
7395 size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
7396 size_t local_work_size[] = {(size_t)nth, 1, 1};
7397
7398 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
7399}
7400
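// Dispatch for GGML_OP_PAD (F32 only). dst->op_params holds the left/right
// padding (lp*/rp*) for each of the four dims; one work-item is launched per
// destination element, with dim 0 rounded up to the 64-wide workgroup.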
7401static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
7402 GGML_ASSERT(src0);
7403 GGML_ASSERT(src0->extra);
7404 GGML_ASSERT(dst);
7405 GGML_ASSERT(dst->extra);
7406 GGML_ASSERT(src0->type == GGML_TYPE_F32);
7407 GGML_ASSERT(dst->type == GGML_TYPE_F32);
7408
7409 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
7410
7411 if (backend_ctx->kernel_pad == nullptr) {
7412 GGML_LOG_WARN("%s: pad kernel not available, skipping OpenCL execution.\n", __func__);
7413 return;
7414 }
7415
7416 ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
7417 ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
7418
7419 cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
7420 cl_ulong off_dst = extra_dst->offset + dst->view_offs;
7421
7422 const int s_ne0 = src0->ne[0];
7423 const int s_ne1 = src0->ne[1];
7424 const int s_ne2 = src0->ne[2];
7425 const int s_ne3 = src0->ne[3];
7426
    // strides are passed to the kernel as cl_ulong; storing them as int would
    // make the clSetKernelArg(..., sizeof(cl_ulong), ...) calls below read out of bounds
    const cl_ulong s_nb0 = src0->nb[0];
    const cl_ulong s_nb1 = src0->nb[1];
    const cl_ulong s_nb2 = src0->nb[2];
    const cl_ulong s_nb3 = src0->nb[3];
7431
7432 const int d_ne0 = dst->ne[0];
7433 const int d_ne1 = dst->ne[1];
7434 const int d_ne2 = dst->ne[2];
7435 const int d_ne3 = dst->ne[3];
7436
    const cl_ulong d_nb0 = dst->nb[0];
    const cl_ulong d_nb1 = dst->nb[1];
    const cl_ulong d_nb2 = dst->nb[2];
    const cl_ulong d_nb3 = dst->nb[3];
7441
7442 const int lp0 = ((const int*)(dst->op_params))[0];
7443 const int rp0 = ((const int*)(dst->op_params))[1];
7444 const int lp1 = ((const int*)(dst->op_params))[2];
7445 const int rp1 = ((const int*)(dst->op_params))[3];
7446 const int lp2 = ((const int*)(dst->op_params))[4];
7447 const int rp2 = ((const int*)(dst->op_params))[5];
7448 const int lp3 = ((const int*)(dst->op_params))[6];
7449 const int rp3 = ((const int*)(dst->op_params))[7];
7450
7451 cl_kernel kernel = backend_ctx->kernel_pad;
7452
7453 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
7454 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
7455 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device));
7456 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
7457 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &s_ne0));
7458 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &s_ne1));
7459 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &s_ne2));
7460 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &s_ne3));
7461 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &s_nb0));
7462 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &s_nb1));
7463 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &s_nb2));
7464 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &s_nb3));
7465 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &d_ne0));
7466 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &d_ne1));
7467 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &d_ne2));
7468 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &d_ne3));
7469 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &d_nb0));
7470 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &d_nb1));
7471 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &d_nb2));
7472 CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &d_nb3));
7473 CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &lp0));
7474 CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &rp0));
7475 CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &lp1));
7476 CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &rp1));
7477 CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &lp2));
7478 CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &rp2));
7479 CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &lp3));
7480 CL_CHECK(clSetKernelArg(kernel, 27, sizeof(int), &rp3));
7481
7482 size_t lws0 = 64;
7483 size_t gws0 = (( (size_t)d_ne0 + lws0 - 1 ) / lws0) * lws0;
7484
7485 size_t global_work_size[] = { gws0, (size_t)d_ne1, (size_t)d_ne2*d_ne3 };
7486 size_t local_work_size[] = { lws0, 1, 1 };
7487
7488 size_t * local_work_size_ptr = local_work_size;
7489 if (d_ne0 % lws0 != 0 && !backend_ctx->non_uniform_workgroups) {
7490 local_work_size_ptr = nullptr;
7491 }
7492
7493 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
7494}
7495
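// Dispatch for GGML_OP_UPSCALE (F32 only). The low byte of op_params[0] selects
// nearest vs bilinear; GGML_SCALE_FLAG_ALIGN_CORNERS switches the bilinear path
// to align-corners scale factors with a zero pixel offset. The launch is a flat
// 1D grid over all destination elements.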
7496static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
7497 GGML_ASSERT(src0);
7498 GGML_ASSERT(src0->extra);
7499 GGML_ASSERT(dst);
7500 GGML_ASSERT(dst->extra);
7501 GGML_ASSERT(src0->type == GGML_TYPE_F32);
7502 GGML_ASSERT(dst->type == GGML_TYPE_F32);
7503
7504 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
7505
    const int mode_flags = ggml_get_op_params_i32(dst, 0);
7507 const ggml_scale_mode mode = (ggml_scale_mode) (mode_flags & 0xFF);
7508 cl_kernel kernel = nullptr;
7509
7510 if (mode == GGML_SCALE_MODE_NEAREST) {
7511 kernel = backend_ctx->kernel_upscale;
7512 if (kernel == nullptr) {
7513 GGML_LOG_WARN("%s: nearest upscale kernel not available, skipping OpenCL execution.\n", __func__);
7514 return;
7515 }
7516 } else if (mode == GGML_SCALE_MODE_BILINEAR) {
7517 kernel = backend_ctx->kernel_upscale_bilinear;
7518 if (kernel == nullptr) {
7519 GGML_LOG_WARN("%s: bilinear upscale kernel not available, skipping OpenCL execution.\n", __func__);
7520 return;
7521 }
7522 } else {
7523 GGML_LOG_WARN("%s: unsupported upscale mode %d, skipping OpenCL execution.\n", __func__, mode);
7524 return;
7525 }
7526
7527 ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
7528 ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
7529
7530 cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
7531 cl_ulong off_dst = extra_dst->offset + dst->view_offs;
7532
7533 const cl_ulong nb00 = src0->nb[0];
7534 const cl_ulong nb01 = src0->nb[1];
7535 const cl_ulong nb02 = src0->nb[2];
7536 const cl_ulong nb03 = src0->nb[3];
7537
7538 const int ne00 = src0->ne[0];
7539 const int ne01 = src0->ne[1];
7540 const int ne02 = src0->ne[2];
7541 const int ne03 = src0->ne[3];
7542
7543 const int ne0 = dst->ne[0];
7544 const int ne1 = dst->ne[1];
7545 const int ne2 = dst->ne[2];
7546 const int ne3 = dst->ne[3];
7547
7548 float sf0 = (float)ne0 / ne00;
7549 float sf1 = (float)ne1 / ne01;
7550 float sf2 = (float)ne2 / ne02;
7551 float sf3 = (float)ne3 / ne03;
7552
7553 float pixel_offset = 0.5f;
7554
7555 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
7556 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
7557 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device));
7558 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
7559 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &nb00));
7560 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb01));
7561 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb02));
7562 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb03));
7563
7564 if (mode == GGML_SCALE_MODE_NEAREST) {
7565 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne0));
7566 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne1));
7567 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne2));
7568 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne3));
7569 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &sf0));
7570 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(float), &sf1));
7571 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf2));
7572 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf3));
7573 } else if (mode == GGML_SCALE_MODE_BILINEAR) {
7574 if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
7575 sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0;
7576 sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1;
7577 pixel_offset = 0.0f;
7578 }
7579
7580 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
7581 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
7582 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne0));
7583 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne1));
7584 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne2));
7585 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne3));
7586 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf0));
7587 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf1));
7588 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(float), &sf2));
7589 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(float), &sf3));
7590 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(float), &pixel_offset));
7591 }
7592
7593
7594 size_t dst_total_elements = (size_t)ne0 * ne1 * ne2 * ne3;
7595 if (dst_total_elements == 0) {
7596 return;
7597 }
7598 size_t global_work_size[] = { dst_total_elements, 1, 1 };
7599 size_t local_work_size_pref = 256;
7600 size_t local_work_size[] = { MIN(local_work_size_pref, dst_total_elements), 1, 1};
7601
7602 size_t * local_work_size_ptr = local_work_size;
7603 if (dst_total_elements % local_work_size[0] != 0 && !backend_ctx->non_uniform_workgroups) {
7604 local_work_size_ptr = nullptr;
7605 }
7606
7607 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
7608}
7609
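// Dispatch for GGML_OP_CONCAT (F32 only). op_params[0] selects the concat axis;
// the kernel is expected to read from src0 or src1 depending on which side of
// the src0 boundary a destination index falls. One workgroup per dst row.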
7610static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7611 GGML_ASSERT(src0);
7612 GGML_ASSERT(src0->extra);
7613 GGML_ASSERT(src1);
7614 GGML_ASSERT(src1->extra);
7615 GGML_ASSERT(dst);
7616 GGML_ASSERT(dst->extra);
7617 GGML_ASSERT(src0->type == GGML_TYPE_F32);
7618 GGML_ASSERT(src1->type == GGML_TYPE_F32);
7619 GGML_ASSERT(dst->type == GGML_TYPE_F32);
7620
7621 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
7622
7623 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
7624 ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
7625 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
7626
7627 cl_ulong offset0 = extra0->offset + src0->view_offs;
7628 cl_ulong offset1 = extra1->offset + src1->view_offs;
7629 cl_ulong offsetd = extrad->offset + dst->view_offs;
7630
7631 const int ne00 = src0->ne[0];
7632 const int ne01 = src0->ne[1];
7633 const int ne02 = src0->ne[2];
7634 const int ne03 = src0->ne[3];
7635
7636 const cl_ulong nb00 = src0->nb[0];
7637 const cl_ulong nb01 = src0->nb[1];
7638 const cl_ulong nb02 = src0->nb[2];
7639 const cl_ulong nb03 = src0->nb[3];
7640
7641 const cl_ulong nb10 = src1->nb[0];
7642 const cl_ulong nb11 = src1->nb[1];
7643 const cl_ulong nb12 = src1->nb[2];
7644 const cl_ulong nb13 = src1->nb[3];
7645
7646 const int ne0 = dst->ne[0];
7647 const int ne1 = dst->ne[1];
7648 const int ne2 = dst->ne[2];
7649 const int ne3 = dst->ne[3];
7650
7651 const cl_ulong nb0 = dst->nb[0];
7652 const cl_ulong nb1 = dst->nb[1];
7653 const cl_ulong nb2 = dst->nb[2];
7654 const cl_ulong nb3 = dst->nb[3];
7655
7656 const cl_int dim = ((const int32_t *) dst->op_params)[0];
7657 GGML_ASSERT(dim >= 0 && dim <= 3);
7658
7659 int nth = MIN(64, ne0);
7660
7661 cl_kernel kernel = backend_ctx->kernel_concat_f32;
7662
7663 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
7664 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
7665 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
7666 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
7667 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
7668 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
7669 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
7670 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
7671 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
7672 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03));
7673 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
7674 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
7675 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
7676 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
7677 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
7678 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
7679 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
7680 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
7681 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0));
7682 CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
7683 CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
7684 CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
7685 CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
7686 CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_int), &dim));
7687
7688 size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
7689 size_t local_work_size[] = {(size_t)nth, 1, 1};
7690
7691 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
7692}
7693
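// Dispatch for GGML_OP_TIMESTEP_EMBEDDING (F32 only). Produces sinusoidal
// embeddings of dimension op_params[0] with maximum period op_params[1]; the
// grid is (half the embedding dimension, rounded up, plus one) by the number
// of timesteps in src0.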
7694static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
7695 GGML_ASSERT(src0);
7696 GGML_ASSERT(src0->extra);
7697 GGML_ASSERT(dst);
7698 GGML_ASSERT(dst->extra);
7699 GGML_ASSERT(src0->type == GGML_TYPE_F32);
7700 GGML_ASSERT(dst->type == GGML_TYPE_F32);
7701
7702 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
7703
7704 if (backend_ctx->kernel_timestep_embedding == nullptr) {
7705 GGML_LOG_WARN("%s: timestep_embedding kernel not available, skipping OpenCL execution.\n", __func__);
7706 return;
7707 }
7708
7709 ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
7710 ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
7711
7712 cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
7713 cl_ulong off_dst = extra_dst->offset + dst->view_offs;
7714
7715 const int logical_dim = dst->op_params[0];
7716 const int max_period = dst->op_params[1];
7717 const int dst_nb1_bytes = dst->nb[1];
7718
7719 cl_kernel kernel = backend_ctx->kernel_timestep_embedding;
7720
7721 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
7722 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
7723 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device));
7724 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
7725 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &dst_nb1_bytes));
7726 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &logical_dim));
7727 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &max_period));
7728
7729 size_t gws0 = (size_t)(((logical_dim + 1) / 2) + 1);
7730
7731 size_t gws1 = (size_t)src0->ne[0];
7732
7733 size_t global_work_size[] = {gws0, gws1, 1};
7734
7735 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
7736}
7737
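// Dispatch for GGML_OP_FLASH_ATTN_EXT. V, the mask and the attention sinks come
// from dst->src[2..4]. Kernels are keyed by the (d_head_q, d_head_v) pair, with
// dedicated single-query (n_q == 1) variants for decoding and a mixed variant
// for F32 Q against F16 K/V. scale, max_bias (ALiBi, precomputed here as the
// slope bases m0/m1) and logit_softcap come from op_params; when no mask is
// given and n_q == n_kv > 1, the kernel applies causal masking itself.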
7738static void ggml_cl_flash_attn(ggml_backend_t backend, const ggml_tensor * q, const ggml_tensor * k, ggml_tensor * dst) {
7739 const ggml_tensor * v = dst->src[2];
7740 const ggml_tensor * mask = dst->src[3];
7741 const ggml_tensor * sinks = dst->src[4];
7742 GGML_ASSERT(q->extra);
7743 GGML_ASSERT(k->extra);
7744 GGML_ASSERT(v->extra);
7745 GGML_ASSERT(dst->extra);
7746 if (mask) {
7747 GGML_ASSERT(mask->extra);
7748 }
7749 if (sinks) {
7750 GGML_ASSERT(sinks->extra);
7751 }
7752
7753 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
7754
7755 const int n_q = q->ne[1];
7756 const int n_kv = k->ne[1];
7757 const int d_head_q = q->ne[0];
7758 const int d_head_v = v->ne[0];
7759 const int n_head = q->ne[2];
7760 const int n_head_kv = k->ne[2];
7761 const int n_batch = q->ne[3];
7762
7763 cl_kernel kernel = NULL;
7764
7765 const bool is_f16 = q->type == GGML_TYPE_F16;
7766 const bool is_mixed = q->type == GGML_TYPE_F32 && k->type == GGML_TYPE_F16;
7767 const std::pair<int, int> dk_dv = {d_head_q, d_head_v};
7768
7769 if (n_q == 1) {
7770 if (is_mixed) {
7771 kernel = backend_ctx->kernels_flash_attn_f32_f16_q1.at(dk_dv);
7772 } else if (is_f16) {
7773 kernel = backend_ctx->kernels_flash_attn_f16_q1.at(dk_dv);
7774 } else {
7775 kernel = backend_ctx->kernels_flash_attn_f32_q1.at(dk_dv);
7776 }
7777 } else {
7778 if (is_mixed) {
7779 kernel = backend_ctx->kernels_flash_attn_f32_f16.at(dk_dv);
7780 } else if (is_f16) {
7781 kernel = backend_ctx->kernels_flash_attn_f16.at(dk_dv);
7782 } else {
7783 kernel = backend_ctx->kernels_flash_attn_f32.at(dk_dv);
7784 }
7785 }
7786 GGML_ASSERT(kernel != NULL);
7787
7788 ggml_tensor_extra_cl * extra_q = (ggml_tensor_extra_cl *)q->extra;
7789 ggml_tensor_extra_cl * extra_k = (ggml_tensor_extra_cl *)k->extra;
7790 ggml_tensor_extra_cl * extra_v = (ggml_tensor_extra_cl *)v->extra;
7791 ggml_tensor_extra_cl * extra_o = (ggml_tensor_extra_cl *)dst->extra;
7792 ggml_tensor_extra_cl * extra_mask = mask ? (ggml_tensor_extra_cl *)mask->extra : NULL;
7793 ggml_tensor_extra_cl * extra_sinks = sinks ? (ggml_tensor_extra_cl *)sinks->extra : NULL;
7794
7795 cl_ulong offset_q = extra_q->offset + q->view_offs;
7796 cl_ulong offset_k = extra_k->offset + k->view_offs;
7797 cl_ulong offset_v = extra_v->offset + v->view_offs;
7798 cl_ulong offset_o = extra_o->offset + dst->view_offs;
7799 cl_mem mask_buffer = extra_mask ? extra_mask->data_device : NULL;
7800 cl_ulong offset_mask = extra_mask ? extra_mask->offset + mask->view_offs : 0;
7801 cl_mem sinks_buffer = extra_sinks ? extra_sinks->data_device : NULL;
7802 cl_ulong offset_sinks = extra_sinks ? extra_sinks->offset + sinks->view_offs : 0;
7803
7804 const cl_ulong q_nb1 = q->nb[1], q_nb2 = q->nb[2], q_nb3 = q->nb[3];
7805 const cl_ulong k_nb1 = k->nb[1], k_nb2 = k->nb[2], k_nb3 = k->nb[3];
7806 const cl_ulong v_nb1 = v->nb[1], v_nb2 = v->nb[2], v_nb3 = v->nb[3];
7807 const cl_ulong o_nb1 = dst->nb[1], o_nb2 = dst->nb[2], o_nb3 = dst->nb[3];
7808 const cl_ulong mask_nb1 = mask ? mask->nb[1] : 0;
7809 const cl_ulong mask_nb2 = mask ? mask->nb[2] : 0;
7810 const cl_ulong mask_nb3 = mask ? mask->nb[3] : 0;
7811 const int mask_ne2 = mask ? mask->ne[2] : 0;
7812 const int mask_ne3 = mask ? mask->ne[3] : 0;
7813
7814 float scale, max_bias, logit_softcap;
7815 const float * params = (const float *)dst->op_params;
7816 scale = params[0];
7817 max_bias = params[1];
7818 logit_softcap = params[2];
7819
7820 const int is_causal = (mask == NULL && n_q > 1 && n_q == n_kv);
7821
7822 const int n_head_log2_val = n_head > 0 ? 1u << (int)floorf(log2f((float)n_head)) : 0;
7823 const float n_head_log2_f = n_head_log2_val > 0 ? (float)n_head_log2_val : 1.0f;
7824 const float m0 = powf(2.0f, -(max_bias) / n_head_log2_f);
7825 const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2_f);
7826
7827 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_q->data_device));
7828 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset_q));
7829 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_k->data_device));
7830 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset_k));
7831 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra_v->data_device));
7832 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset_v));
7833 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extra_o->data_device));
7834 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offset_o));
7835 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(float), &scale));
7836 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &n_q));
7837 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &n_kv));
7838 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &is_causal));
7839 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &n_head));
7840 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &q_nb1)); CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &q_nb2)); CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &q_nb3));
7841 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &k_nb1)); CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &k_nb2)); CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &k_nb3));
7842 CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &v_nb1)); CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &v_nb2)); CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &v_nb3));
7843 CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &o_nb1)); CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_ulong), &o_nb2)); CL_CHECK(clSetKernelArg(kernel, 24, sizeof(cl_ulong), &o_nb3));
7844 CL_CHECK(clSetKernelArg(kernel, 25, sizeof(float), &max_bias));
7845 CL_CHECK(clSetKernelArg(kernel, 26, sizeof(float), &m0));
7846 CL_CHECK(clSetKernelArg(kernel, 27, sizeof(float), &m1));
7847 CL_CHECK(clSetKernelArg(kernel, 28, sizeof(int), &n_head_log2_val));
7848 CL_CHECK(clSetKernelArg(kernel, 29, sizeof(float), &logit_softcap));
7849 CL_CHECK(clSetKernelArg(kernel, 30, sizeof(int), &n_head_kv));
7850 CL_CHECK(clSetKernelArg(kernel, 31, sizeof(cl_mem), &mask_buffer));
7851 CL_CHECK(clSetKernelArg(kernel, 32, sizeof(cl_ulong), &offset_mask));
7852 CL_CHECK(clSetKernelArg(kernel, 33, sizeof(cl_ulong), &mask_nb1));
7853 CL_CHECK(clSetKernelArg(kernel, 34, sizeof(cl_ulong), &mask_nb2));
7854 CL_CHECK(clSetKernelArg(kernel, 35, sizeof(cl_ulong), &mask_nb3));
7855 CL_CHECK(clSetKernelArg(kernel, 36, sizeof(int), &mask_ne2));
7856 CL_CHECK(clSetKernelArg(kernel, 37, sizeof(int), &mask_ne3));
7857 CL_CHECK(clSetKernelArg(kernel, 38, sizeof(cl_mem), &sinks_buffer));
7858 CL_CHECK(clSetKernelArg(kernel, 39, sizeof(cl_ulong), &offset_sinks));
7859
7860 if (n_q == 1) {
7861 const size_t wg_size = 64;
7862 size_t local_work_size[] = { wg_size, 1 };
7863 size_t global_work_size[] = { wg_size, (size_t)(n_head * n_batch) };
7864 backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
7865 } else {
7866 const int block_m = backend_ctx->kernels_flash_attn_bm.at(dk_dv);
7867 const size_t wg_size = block_m;
7868 size_t local_work_size[] = { wg_size, 1 };
7869 size_t global_work_size[] = { (size_t)((n_q + block_m - 1) / block_m) * wg_size, (size_t)(n_head * n_batch) };
7870 backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
7871 }
7872}
7873
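// Tiled f16 x f32 GEMM computing C[M,N] = A[M,K] * B[K,N]; the tiling scheme is
// documented with the constants below.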
7874static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7875 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
7876
7877 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
7878 ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
7879 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
7880
7881 cl_ulong offset0 = extra0->offset + src0->view_offs;
7882 cl_ulong offset1 = extra1->offset + src1->view_offs;
7883 cl_ulong offsetd = extrad->offset + dst->view_offs;
7884
7885 const int M = src0->ne[1];
7886 const int N = src1->ne[1];
7887 const int K = src0->ne[0];
7888
7889 cl_kernel kernel = backend_ctx->kernel_mul_mat_f16_f32_tiled;
7890
7891 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(int), &M));
7892 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int), &N));
7893 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &K));
7894 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0->data_device));
7895 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &offset0));
7896 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extra1->data_device));
7897 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &offset1));
7898 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem), &extrad->data_device));
7899 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &offsetd));
7900
7901 // Tiling parameters. These need to be tuned for optimal performance.
7902 // They must match the #defines in the kernel mul_mat_f16_f32.cl.
7903 //
7904 // OPWM / OPWN: Output tile size per Work-Group. A work-group computes a tile of size OPWM x OPWN.
7905 // TPWM / TPWN: Threads per Work-group. This is the work-group size.
7906 // OPTM / OPTN: Output elements per Thread. Each thread computes OPTM x OPTN elements.
7907 //
7908 // The following relationships must hold:
7909 // OPWM = TPWM * OPTM
7910 // OPWN = TPWN * OPTN
7911 //
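    // With the values below, OPTM = OPWM / TPWM = 64 / 16 = 4 and
    // OPTN = OPWN / TPWN = 64 / 8 = 8: each of the 16 x 8 = 128 work-items in a
    // work-group computes a 4 x 8 patch of its 64 x 64 output tile.
    //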
7912 const int OPWM = 64;
7913 const int OPWN = 64;
7914 const int TPWM = 16;
7915 const int TPWN = 8;
7916
7917 size_t local_work_size[2] = { TPWM, TPWN };
7918 size_t global_work_size[2] = {
7919 (size_t) ((M + OPWM - 1) / OPWM) * TPWM,
7920 (size_t) ((N + OPWN - 1) / OPWN) * TPWN,
7921 };
7922
7923 backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
7924}
7925
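// Dispatch for GGML_OP_CONV_2D as an implicit GEMM: each workgroup computes a
// BS_K x BS_NPQ output tile, accumulating over the reduction dim Cin*KH*KW in
// BS_CRS-deep slices staged in local memory (shmem_size bytes). NPQ = N*OW*OH
// is the flattened output-pixel dimension.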
7926static void ggml_cl_conv_2d(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7927 GGML_TENSOR_BINARY_OP_LOCALS;
7928 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
7929
7930 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
7931 ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
7932 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
7933
7934 cl_ulong offset0 = extra0->offset + src0->view_offs;
7935 cl_ulong offset1 = extra1->offset + src1->view_offs;
7936 cl_ulong offsetd = extrad->offset + dst->view_offs;
7937
7938 const cl_uint Cout = ne03; const cl_uint Cin = ne02; const cl_uint N = ne13;
7939 const cl_uint KW = ne00; const cl_uint KH = ne01; const cl_uint W = ne10; const cl_uint H = ne11; const cl_uint OW = ne0; const cl_uint OH = ne1;
7940
7941 const cl_uint s0 = dst->op_params[0]; const cl_uint s1 = dst->op_params[1];
7942 const cl_uint p0 = dst->op_params[2]; const cl_uint p1 = dst->op_params[3];
7943 const cl_uint d0 = dst->op_params[4]; const cl_uint d1 = dst->op_params[5];
7944
7945 const cl_uint cl_nb01 = nb01/ggml_type_size(src0->type); const cl_uint cl_nb02 = nb02/ggml_type_size(src0->type); const cl_uint cl_nb03 = nb03/ggml_type_size(src0->type);
7946 const cl_uint cl_nb11 = nb11/ggml_type_size(src1->type); const cl_uint cl_nb12 = nb12/ggml_type_size(src1->type); const cl_uint cl_nb13 = nb13/ggml_type_size(src1->type);
7947 const cl_uint cl_nb1 = nb1/ggml_type_size(dst->type); const cl_uint cl_nb2 = nb2/ggml_type_size(dst->type); const cl_uint cl_nb3 = nb3/ggml_type_size(dst->type);
7948
7949 const int64_t NPQ = (int64_t)N * OW * OH;
7950
7951 const uint32_t BS_K = 64;
7952 const uint32_t BS_NPQ = 64;
7953 const uint32_t BS_CRS = 16;
7954 const uint32_t VEC_SIZE = 4;
7955
7956 const uint32_t TS_K = 4;
7957 const uint32_t TS_NPQ = 8;
7958
7959 const uint32_t WG_K = BS_K / TS_K;
7960 const uint32_t WG_NPQ = BS_NPQ / TS_NPQ;
7961
    auto splitWork = [](uint32_t work_size, uint32_t block_size) { return (work_size + block_size - 1) / block_size; }; // ceil(work_size / block_size)
7963 const uint32_t NB_K = splitWork(Cout, BS_K);
7964 const uint32_t NB_NPQ = splitWork(NPQ, BS_NPQ);
7965
7966 cl_kernel kernel;
7967 size_t shmem_size;
7968
7969 if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
7970 kernel = backend_ctx->kernel_conv_2d_f16;
7971 shmem_size = (size_t)(BS_K * BS_CRS * sizeof(cl_half) + BS_CRS * (BS_NPQ / VEC_SIZE) * sizeof(cl_half4));
7972 } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
7973 kernel = backend_ctx->kernel_conv_2d_f32;
7974 shmem_size = (size_t)(BS_K * BS_CRS * sizeof(cl_float) + BS_CRS * (BS_NPQ / VEC_SIZE) * sizeof(cl_float4));
7975 } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
7976 kernel = backend_ctx->kernel_conv_2d_f16_f32;
7977 shmem_size = (size_t)(BS_K * BS_CRS * sizeof(cl_half) + BS_CRS * (BS_NPQ / VEC_SIZE) * sizeof(cl_float4));
7978 } else {
7979 GGML_ASSERT(false && "Unsupported data type combination for conv2d");
7980 }
7981
7982 cl_uint idx = 0;
7983 CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra0->data_device)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &offset0));
7984 CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra1->data_device)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &offset1));
7985 CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extrad->data_device)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &offsetd));
7986 CL_CHECK(clSetKernelArg(kernel, idx++, shmem_size, NULL));
7987 CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &Cout)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &Cin)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &N));
7988 CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &KW)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &KH)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &W)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &H));
7989 CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &OW)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &OH));
7990 CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &s0)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &s1)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &p0)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &p1));
7991 CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &d0)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &d1));
7992 CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb01)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb02)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb03));
7993 CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb11)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb12)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb13));
7994 CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb1)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb2)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb3));
7995
7996 size_t global_work_size[] = { (size_t)NB_K * WG_K, (size_t)NB_NPQ * WG_NPQ, 1 };
7997 size_t local_work_size[] = { (size_t)WG_K, (size_t)WG_NPQ, 1 };
7998
7999 backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
8000}
8001
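// Adreno-specific f16 x f32 matmul for the attention KQ and KQV products. The
// two layouts are distinguished by their strides (nb01 > nb02 implies KQ). A
// and the output are rebound as image1d_buffer objects over sub-buffers so the
// kernels can use image loads/stores; the temporary CL objects are released
// after the kernel is enqueued.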
8002static void ggml_cl_mul_mat_kq_kqv_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8003 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
8004
8005 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
8006 ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
8007 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
8008
8009 const int ne00 = src0->ne[0];
8010 const int ne01 = src0->ne[1];
8011 const int ne02 = src0->ne[2];
8012
8013 const cl_ulong nb01 = src0->nb[1];
8014 const cl_ulong nb02 = src0->nb[2];
8015
8016 const int ne10 = src1->ne[0];
8017 const int ne11 = src1->ne[1];
8018 const int ne12 = src1->ne[2];
8019
8020 const cl_ulong nb10 = src1->nb[0];
8021
8022 const int ne0 = dst->ne[0];
8023 const int ne1 = dst->ne[1];
8024
8025 GGML_ASSERT(ne00 == ne10);
8026
8027 cl_kernel kernel;
8028 cl_context context = backend_ctx->context;
8029
8030 cl_int status;
8031 cl_image_format img_fmt_1d;
8032 cl_image_desc img_desc_1d;
8033 cl_buffer_region region;
8034 cl_mem A_image1d;
8035 cl_mem A_sub_buffer;
8036 cl_mem B_sub_buffer;
8037 cl_mem D_image1d;
8038 cl_mem D_sub_buffer;
8039
8040 int M = ne01;
8041 int N = ne1;
8042 int K = ne00;
8043
8044 if (nb01 > nb02) {
8045 // KQ
8046 kernel = backend_ctx->kernel_mul_mm_f16_f32_kq;
8047 } else {
8048 // KQV
8049 kernel = backend_ctx->kernel_mul_mm_f16_f32_kqv;
8050 }
8051 // create sub-buffer for A
8052 // <--------------------------------------------> //
8053 extra0 = src0->view_src ? (ggml_tensor_extra_cl *)src0->view_src->extra : (ggml_tensor_extra_cl *)src0->extra;
8054
8055 region.origin = (extra0->offset);
8056 if (nb01 > nb02) {
8057 // KQ
8058 region.size = nb01 * ne01;
8059 } else {
8060 // KQV
8061 region.size = nb02 * ne02;
8062 }
8063
    A_sub_buffer = clCreateSubBuffer((extra0->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
8065 CL_CHECK(status);
8066
8067 // <--------------------------------------------> //
8068
8069 // create sub-buffer for B
8070 // <--------------------------------------------> //
8071 region.origin = (extra1->offset);
8072 region.size = nb10 * ne10 * ne11 * ne12;
    B_sub_buffer = clCreateSubBuffer((extra1->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
8074 CL_CHECK(status);
8075 // <--------------------------------------------> //
8076
8077 img_fmt_1d = {CL_RGBA, CL_FLOAT};
8078 memset(&img_desc_1d, 0, sizeof(img_desc_1d));
8079 img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
    if (nb01 > nb02) {
        // KQ: the A view spans ne01 rows of nb01 bytes; 4 bytes per float, 4 floats per RGBA texel
        img_desc_1d.image_width = (nb01 * ne01 / 4)/4;
    } else {
        // KQV
        img_desc_1d.image_width = (nb02 * ne02 / 4)/4;
    }
8086 img_desc_1d.buffer = A_sub_buffer;
8087 A_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
8088 CL_CHECK(status);
8089
8090 // create sub-buffer for output C
8091 // <--------------------------------------------> //
8092 region.origin = (extrad->offset);
8093 region.size = ne0 * ne1 * dst->ne[2] * dst->nb[0]; // size of C in bytes
    D_sub_buffer = clCreateSubBuffer((extrad->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
8095 CL_CHECK(status);
8096 // <--------------------------------------------> //
8097
8098 // create image for C output
8099 // <--------------------------------------------> //
8100 img_fmt_1d = {CL_R, CL_FLOAT};
8101 memset(&img_desc_1d, 0, sizeof(img_desc_1d));
8102 img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
8103 img_desc_1d.image_width = ne0 * ne1 * dst->ne[2] * dst->nb[0] / 4;
8104 img_desc_1d.buffer = D_sub_buffer;
8105 D_image1d = clCreateImage(context, CL_MEM_WRITE_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
8106 CL_CHECK(status);
8107 // <--------------------------------------------> //
8108
8109 int offset_src0 = 0;
8110 int offset_src1 = 0;
8111
8112 // set kernel args
8113 // <--------------------------------------------> //
8114 cl_uint k_arg = 0;
8115 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d));
8116 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &offset_src0));
8117 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_sub_buffer));
8118 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &offset_src1));
8119 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &D_image1d));
    const int offsetd_i = (int) extrad->offset; // the kernel expects a 32-bit offset; extrad->offset is a cl_ulong
    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &offsetd_i));
8121 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &M));
8122 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &K));
8123 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &N));
8124 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne02));
8125 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne12));
    const int nb01_i = (int) nb01; // the kernel expects a 32-bit stride; nb01 is a cl_ulong
    CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &nb01_i));
8127
8128 size_t global_work_size[3] = {64, static_cast<size_t>(((M+63)/64)), static_cast<size_t>(((N+31)/32)*ne12)};
8129 size_t local_work_size[3] = {64, 1, 2};
8130
8131 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
8132
8133 // deallocate sub buffers and images
8134 // <--------------------------------------------> //
8135 CL_CHECK(clReleaseMemObject(A_image1d));
8136 CL_CHECK(clReleaseMemObject(D_image1d));
8137 CL_CHECK(clReleaseMemObject(A_sub_buffer));
8138 CL_CHECK(clReleaseMemObject(B_sub_buffer));
8139 CL_CHECK(clReleaseMemObject(D_sub_buffer));
8140}
8141
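// Adreno-specific Q8_0 x F32 matmul. The flattened quantized weights and their
// scales (ggml_tensor_extra_cl_q8_0) are bound as 1D images. N == 1 uses a
// dedicated matrix-vector kernel; otherwise the activations are first
// transposed into a padded FP16 image and an 8x4-tiled gemm kernel is run.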
8142static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8143#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
8144 GGML_ASSERT(src0);
8145 GGML_ASSERT(src0->extra);
8146 GGML_ASSERT(src1);
8147 GGML_ASSERT(src1->extra);
8148 GGML_ASSERT(dst);
8149 GGML_ASSERT(dst->extra);
8150
8151 const enum ggml_type src0t = src0->type;
8152 const enum ggml_type src1t = src1->type;
8153
8154 GGML_ASSERT(src0t == GGML_TYPE_Q8_0);
8155 GGML_ASSERT(src1t == GGML_TYPE_F32);
8156
8157 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
8158
8159 ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
8160 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
8161
8162 ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
8163
8164 GGML_ASSERT(src1->view_offs == 0);
8165 GGML_ASSERT(dst->view_offs == 0);
8166
8167 const int ne00 = src0->ne[0];
8168 const int ne01 = src0->ne[1];
8169 const int ne02 = src0->ne[2];
8170
8171 const int ne10 = src1->ne[0];
8172 const int ne12 = src1->ne[2];
8173
8174 const int ne0 = dst->ne[0];
8175 const int ne1 = dst->ne[1];
8176
8177 GGML_ASSERT(ne00 == ne10);
8178 GGML_ASSERT((ne00 % 32) == 0);
8179 GGML_ASSERT(ne0 == ne01);
8180
8181 cl_context context = backend_ctx->context;
8182 cl_kernel kernel;
8183
8184 // init CL objects
8185 cl_int status;
8186 cl_image_format img_fmt_1d;
8187 cl_image_desc img_desc_1d;
8188 cl_buffer_region region;
8189 cl_mem A_image1d;
8190 cl_mem B_image1d;
8191 cl_mem B_sub_buffer;
8192 cl_mem S_image1d;
8193
8194 cl_mem D_image1d;
8195 cl_mem D_sub_buffer;
8196
8197 int M = ne01;
8198 int N = ne1;
8199 int K = ne00;
8200
8201 // create an image for A
8202 img_fmt_1d = { CL_R, CL_FLOAT};
8203 memset(&img_desc_1d, 0, sizeof(img_desc_1d));
8204 img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
8205 img_desc_1d.image_width = M * K / 4; // Divide by 4 for char -> float
8206 img_desc_1d.buffer = extra0_q8_0->q;
8207 A_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
8208 CL_CHECK(status);
8209
8210 // create an image for Scale
8211 img_fmt_1d = { CL_R, CL_HALF_FLOAT};
8212 memset(&img_desc_1d, 0, sizeof(img_desc_1d));
8213 img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
8214 img_desc_1d.image_width = M * K / 32; // Block size is 32
8215 img_desc_1d.buffer = extra0_q8_0->d;
8216 S_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
8217 CL_CHECK(status);
8218
8219 // create a sub_buffer for B
8220 region.origin = (extra1->offset); // + src1->view_offs);
8221 region.size = K * N * sizeof(float);
    B_sub_buffer = clCreateSubBuffer((extra1->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
8223 CL_CHECK(status);
8224
8225 // create an image for B from sub_buffer: RGBA (OCL)
8226 img_fmt_1d = {CL_RGBA, CL_FLOAT};
8227 memset(&img_desc_1d, 0, sizeof(img_desc_1d));
8228 img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
8229 img_desc_1d.image_width = K * N / 4;
8230 img_desc_1d.buffer = B_sub_buffer;
8231 B_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
8232 CL_CHECK(status);
8233
8234 // Create subbuffer and image1d_buffer for dst
8235 region.origin = (extrad->offset); // + dst->view_offs;
8236 region.size = M * N * sizeof(float);
    D_sub_buffer = clCreateSubBuffer((extrad->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
8238 CL_CHECK(status);
8239
8240 img_fmt_1d = {CL_R, CL_FLOAT};
8241 memset(&img_desc_1d, 0, sizeof(img_desc_1d));
8242 img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
8243 img_desc_1d.image_width = M * N;
8244 img_desc_1d.buffer = D_sub_buffer;
8245 D_image1d = clCreateImage(context, CL_MEM_WRITE_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
8246 CL_CHECK(status);
8247
8248 size_t local_work_size[3] = {1, 1, 1};
8249 size_t global_work_size[3] = {1, 1, 1};
8250
8251 if (N == 1) {
8252 kernel = backend_ctx->CL_mul_mat_vec_q8_0_f32;
8253
8254 int r2 = 1;
8255 int r3 = 1;
8256 cl_uint k_arg = 0;
8257
8258 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d));
8259 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extra0_q8_0->d));
8260 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_image1d));
8261 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extra1->offset));
8262 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extrad->data_device));
8263 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extrad->offset));
8264 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne00));
8265 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne01));
8266 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne02));
8267 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne10));
8268 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne12));
8269 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne0));
8270 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne1));
8271 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r2));
8272 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r3));
8273
8274 size_t wavesize = backend_ctx->adreno_wave_size;
8275 local_work_size[0] = wavesize;
8276 local_work_size[1] = 4; // reduce factor
8277 local_work_size[2] = 1;
8278
8279 global_work_size[0] = ((M + wavesize - 1) / wavesize) * wavesize;
8280 global_work_size[1] = 4; // reduce factor
8281 global_work_size[2] = 1;
8282 } else {
8283 cl_ulong offsetd = extrad->offset + dst->view_offs;
8284 cl_mem B_image1d_trans = nullptr;
8285 // for B transpose
8286 cl_mem B_d = nullptr;
        // pad N to the next multiple of 8 (the 8x4 gemm kernel consumes 8 columns per tile)
        int extra_elements = N % 8;
        int padding = extra_elements > 0 ? 8 - extra_elements : 0;
8297
8298 // Specify the starting offset (in bytes)
8299 region.origin = 0;
8300 // Specify the size of the sub-buffer (divide by 2 for FP16)
8301 region.size = K * (N + padding) * sizeof(float)/2;
8302 backend_ctx->prealloc_act_trans.allocate(context, region.size);
8303 B_d = clCreateSubBuffer(
8304 backend_ctx->prealloc_act_trans.buffer,
8305 0,
8306 CL_BUFFER_CREATE_TYPE_REGION,
            &region,
8308 &status);
8309 CL_CHECK(status);
8310
8311 cl_image_format image_format_B_d_output = { CL_RGBA, CL_HALF_FLOAT }; //(CL_HALF_FLOAT for FP16)
8312 cl_image_desc image_desc_B_d_output = {
8313 CL_MEM_OBJECT_IMAGE1D_BUFFER,
8314 static_cast<size_t>(K * (N + padding)/4),
8315 0, 0, 0, 0, 0, 0, 0, { B_d }
8316 };
8317 B_image1d_trans = clCreateImage(
8318 context,
8319 0,
8320 &image_format_B_d_output,
8321 &image_desc_B_d_output,
8322 NULL,
8323 &status);
8324 CL_CHECK(status);
8325
8326 int height_B = N/4;
8327 if (height_B == 0) {
8328 height_B = 1;
8329 }
8330 int width_B = K/4;
8331 int padded_height_B = (N + padding)/4;
8332
8333 kernel = backend_ctx->kernel_transpose_32_16;
8334 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &B_image1d));
8335 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &B_image1d_trans));
8336 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B));
8337 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B));
8338 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B));
8339
8340 size_t local_size_t[2] = { 1, 16 };
8341 size_t global_size_t[2] = {
8342 static_cast<size_t>(width_B),
8343 static_cast<size_t>(padded_height_B)
8344 };
8345
8346 backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst);
8347
8348 kernel = backend_ctx->kernel_mul_mm_q8_0_f32_8x4;
8349
8350 int N_with_padding = N + padding;
8351
8352 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q));
8353 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d));
8354 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &B_image1d_trans));
8355 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extrad->data_device));
8356 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &K));
8357 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &M));
8358 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &N_with_padding));
8359 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &N));
8360 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &offsetd));
8361
8362 global_work_size[0] = (size_t)(N + 7) / 8;
8363 global_work_size[1] = (size_t)(M + 3) / 4;
8364 global_work_size[2] = 1;
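        // each output tile covers 8 columns of N and 4 rows of M, matching the
        // (N + 7) / 8 and (M + 3) / 4 grid dimensions above.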
8365
8366 local_work_size[0] = 2;
8367 local_work_size[1] = 128;
8368 local_work_size[2] = 1;
8369 }
8370
8371 // enqueue kernel with profiling
8372 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
8373
8374 // deallocate sub buffers and images
8375 CL_CHECK(clReleaseMemObject(A_image1d));
8376 CL_CHECK(clReleaseMemObject(B_sub_buffer));
8377 CL_CHECK(clReleaseMemObject(B_image1d));
8378 CL_CHECK(clReleaseMemObject(S_image1d));
8379 CL_CHECK(clReleaseMemObject(D_sub_buffer));
8380 CL_CHECK(clReleaseMemObject(D_image1d));
8381#else
8382 GGML_UNUSED(backend);
8383 GGML_UNUSED(src0);
8384 GGML_UNUSED(src1);
8385 GGML_UNUSED(dst);
8386#endif
8387}
8388
8389static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
8390 GGML_ASSERT(src0);
8391 GGML_ASSERT(src0->extra);
8392 GGML_ASSERT(src1);
8393 GGML_ASSERT(src1->extra);
8394 GGML_ASSERT(dst);
8395 GGML_ASSERT(dst->extra);
8396
8397 const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
8398 const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
8399
8400 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
8401
8402 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
8403 ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
8404 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
8405
8406 cl_ulong offset0 = extra0->offset + src0->view_offs;
8407 cl_ulong offset1 = extra1->offset + src1->view_offs;
8408 cl_ulong offsetd = extrad->offset + dst->view_offs;
8409
8410#ifdef GGML_OPENCL_SOA_Q
8411 ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
8412 ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra;
8413 ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
8414 ggml_tensor_extra_cl_q6_K * extra0_q6_K = (ggml_tensor_extra_cl_q6_K *)src0->extra;
8415#endif
8416
8417 const int ne00 = src0 ? src0->ne[0] : 0;
8418 const int ne01 = src0 ? src0->ne[1] : 0;
8419 const int ne02 = src0 ? src0->ne[2] : 0;
8420 const int ne03 = src0 ? src0->ne[3] : 0;
8421
8422 const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
8423 const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
8424 const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
8425 const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
8426
8427 const int ne10 = src1 ? src1->ne[0] : 0;
8428 const int ne11 = src1 ? src1->ne[1] : 0;
8429 const int ne12 = src1 ? src1->ne[2] : 0;
8430 const int ne13 = src1 ? src1->ne[3] : 0;
8431
8432 const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
8433 const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
8434 const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
8435 const cl_ulong nb13 = src1 ? src1->nb[3] : 0;
8436
8437 const int ne0 = dst ? dst->ne[0] : 0;
8438 const int ne1 = dst ? dst->ne[1] : 0;
8439
8440 int r2 = ne12/ne02;
8441 int r3 = ne13/ne03;
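    // r2/r3 are the broadcast ratios of src1 over src0 along dims 2 and 3, used when
    // one set of weights (src0) is shared across a larger batch of activations (src1).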
8442
8443 GGML_ASSERT(ne00 == ne10);
8444
8445 int nth0 = 32;
8446 int nth1 = 1;
8447 int nrows = 1;
8448 // The number of values produced by each subgroup
8449 int ndst = 4;
8450
8451 cl_kernel kernel;
8452
8453#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
8454 cl_context context = backend_ctx->context;
8455
    if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32) {
8457 if (ne01 >= 64 && ne1 >= 32 && ne00 >= 16 && (ne12 % ne02) == 0 &&
            // dst is wrapped in an image1d_buffer, so the image size limit applies; the same holds for src0
            (ne0 * ne1 * dst->ne[2] * dst->nb[0] / 4 <= backend_ctx->image_max_buffer_size)) {
8460 // For KQ
8461 if (ggml_is_permuted(src0) && ggml_is_permuted(src1) &&
8462 ((nb01 * ne01 / 4)/4 <= backend_ctx->image_max_buffer_size) &&
8463 nb00 <= nb02 &&
8464 nb02 <= nb01 &&
8465 nb01 <= nb03 &&
8466 nb10 <= nb12 &&
8467 nb12 <= nb11 &&
8468 nb11 <= nb13) {
8469 ggml_cl_mul_mat_kq_kqv_adreno(backend, src0, src1, dst);
8470 return;
8471 }
8472 // For KQV
8473 if (!ggml_is_contiguous(src0) && ggml_is_contiguous(src1) &&
8474 ((nb02 * ne02 / 4)/4 <= backend_ctx->image_max_buffer_size)) {
8475 ggml_cl_mul_mat_kq_kqv_adreno(backend, src0, src1, dst);
8476 return;
8477 }
8478 }
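        // note: image_max_buffer_size is a pixel count rather than a byte count, so
        // the checks above scale the byte sizes down (the /4 factors) before comparing.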
8479 }
8480
8481 if (ne01 && ne1 && use_adreno_kernels(backend_ctx, src0)) {
8482
8483 // init CL objects
8484 // <--------------------------------------------> //
8485 cl_int status;
8486 cl_image_format img_fmt_1d;
8487 cl_image_desc img_desc_1d;
8488 cl_buffer_region region;
8489 cl_mem A_image1d = nullptr;
8490 cl_mem B_image1d = nullptr;
8491 cl_mem B_sub_buffer = nullptr;
8492 cl_mem C_d = nullptr;
8493 // for B transpose
8494 cl_mem B_d = nullptr;
8495 cl_mem B_d_input_image = nullptr;
8496 // <--------------------------------------------> //
8497
8498 // define matrix dimensions
8499 // <--------------------------------------------> //
8500 int M = ne01;
8501 int N = ne1;
8502 int K = ne00;
8503 int padding;
8504 // <--------------------------------------------> //
8505
8506 // q8_0 x fp32
8507 if (src0t == GGML_TYPE_Q8_0 && src1t == GGML_TYPE_F32 &&
8508 enable_adreno_trans_weight(backend_ctx, src0)) {
8509 ggml_cl_mul_mat_q8_0_f32_adreno(backend, src0, src1, dst);
8510 return;
8511 }
8512
8513 // q4_0 x fp32
        if (src0t == GGML_TYPE_Q4_0 && src1t == GGML_TYPE_F32) {
8515 // TODO: remove duplicate definitions of image description + format -- move to top
8516
8517 // create an image for A
8518 // <--------------------------------------------> //
8519 if (N == 1) {
8520 img_fmt_1d = { CL_R, CL_UNSIGNED_INT32};
8521 } else {
8522 img_fmt_1d = { CL_R, CL_FLOAT};
8523 }
8524 memset(&img_desc_1d, 0, sizeof(img_desc_1d));
8525 img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
            img_desc_1d.image_width = M * K / 2 / 4; // /2: two 4-bit quants per byte; /4: four bytes per CL_R texel
8527 img_desc_1d.buffer = extra0_q4_0->q;
8528 A_image1d = clCreateImage(
8529 context,
8530 CL_MEM_READ_ONLY,
8531 &img_fmt_1d,
8532 &img_desc_1d,
8533 NULL,
8534 &status);
8535 CL_CHECK(status);
8536 // <--------------------------------------------> //
8537
8538
8539 // create a sub_buffer for B
8540 // <--------------------------------------------> //
8541 region.origin = (extra1->offset);
8542 region.size = K * N * sizeof(float);
8543 B_sub_buffer = clCreateSubBuffer(
8544 extra1->data_device,
8545 0,
8546 CL_BUFFER_CREATE_TYPE_REGION,
                &region,
8548 &status);
8549 CL_CHECK(status);
8550 // <--------------------------------------------> //
8551
            // transpose the activations for Skyler's gemm
            if (N != 1) {
                // how many extra elements beyond a multiple of 8
                int extra_elements = N % 8;

                // how much padding to add
                padding = 0;
                if (extra_elements > 0) {
                    padding = 8 - extra_elements;
                }
8562
8563 // Specify the starting offset (in bytes)
8564 region.origin = 0;
8565 // Specify the size of the sub-buffer (divide by 2 for FP16)
8566 region.size = K * (N + padding) * sizeof(float)/2;
8567 backend_ctx->prealloc_act_trans.allocate(context, region.size);
8568
8569 B_d = clCreateSubBuffer(
8570 backend_ctx->prealloc_act_trans.buffer,
8571 0,
8572 CL_BUFFER_CREATE_TYPE_REGION,
                    &region,
8574 &status);
8575 CL_CHECK(status);
8576
8577 cl_image_format image_format_B_d_input = { CL_RGBA, CL_FLOAT };
8578 cl_image_desc image_desc_B_d_input = {
8579 CL_MEM_OBJECT_IMAGE1D_BUFFER,
8580 static_cast<size_t>(K * N / 4),
8581 0, 0, 0, 0, 0, 0, 0, { B_sub_buffer }
8582 };
8583 B_d_input_image = clCreateImage(
8584 context,
8585 0,
8586 &image_format_B_d_input,
8587 &image_desc_B_d_input,
8588 NULL,
8589 &status);
8590 CL_CHECK(status);
8591
8592 cl_image_format image_format_B_d_output = { CL_RGBA, CL_HALF_FLOAT }; //(CL_HALF_FLOAT for FP16)
8593 cl_image_desc image_desc_B_d_output = {
8594 CL_MEM_OBJECT_IMAGE1D_BUFFER,
8595 static_cast<size_t>(K * (N + padding)/4),
8596 0, 0, 0, 0, 0, 0, 0, { B_d }
8597 };
8598 B_image1d = clCreateImage(
8599 context,
8600 0,
8601 &image_format_B_d_output,
8602 &image_desc_B_d_output,
8603 NULL,
8604 &status);
8605 CL_CHECK(status);
8606
8607 int height_B = N/4;
8608 if (height_B == 0) {
8609 height_B = 1;
8610 }
8611 int width_B = K/4;
8612 int padded_height_B = (N + padding)/4;
8613
8614 kernel = backend_ctx->kernel_transpose_32_16;
8615 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &B_d_input_image));
8616 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &B_image1d));
8617 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B));
8618 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B));
8619 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B));
8620
8621 size_t local_size_t[2] = { 1, 16 };
                // WGS tuning
                if (ne0 == 4096 && ne1 == 128 && ne10 == 4096) {
                    local_size_t[0] = 4;
                    local_size_t[1] = 8;
                } else if (ne0 == 11008 && ne1 == 128 && ne10 == 4096) {
                    local_size_t[0] = 2;
                    local_size_t[1] = 8;
                } else if (ne0 == 4096 && ne1 == 128 && ne10 == 11008) {
                    local_size_t[0] = 1;
                    local_size_t[1] = 8;
                } else if (ne0 == 32000 && ne1 == 128 && ne10 == 4096) {
                    local_size_t[0] = 2;
                    local_size_t[1] = 8;
                }
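                // these shapes look like LLaMA-7B-style weights (hidden 4096, FFN 11008,
                // vocab 32000) at batch 128; anything else keeps the default { 1, 16 }.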
8636
8637 size_t global_size_t[2] = {
8638 static_cast<size_t>(width_B),
8639 static_cast<size_t>(padded_height_B)
8640 };
8641
8642 backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst);
8643 } else {
8644 // no need to transpose B in other cases
8645 // create an image for B from sub_buffer
8646 // <--------------------------------------------> //
8647 img_fmt_1d = {CL_RGBA, CL_FLOAT};
8648
8649 memset(&img_desc_1d, 0, sizeof(img_desc_1d));
8650 img_desc_1d.image_width = K * N / 4;
8651 img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
8652 img_desc_1d.buffer = B_sub_buffer;
8653 B_image1d = clCreateImage(
8654 context,
8655 CL_MEM_READ_ONLY,
8656 &img_fmt_1d,
8657 &img_desc_1d,
8658 NULL,
8659 &status);
8660 CL_CHECK(status);
8661 // <--------------------------------------------> //
8662 }
8663
8664 // choose gemm or gemv kernel
8665 // <--------------------------------------------> //
8666 if (N == 1) {
8667 kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general;
8668 if (M == 4096 && K == 4096) {
8669 kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096;
8670 } else if (M == 4096 && K == 11008) {
8671 kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008;
8672 } else if (M == 11008 && K == 4096) {
8673 kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096;
8674 } else if (M == 32000 && K == 4096) {
8675 kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096;
8676 }
8677 } else {
8678 kernel = backend_ctx->CL_mul_mat_Ab_Bi_8x4;
8679 }
8680 // <--------------------------------------------> //
8681
8682 // set kernel args
8683 // <--------------------------------------------> //
8684 cl_uint k_arg = 0;
8685
8686 if (N == 1) {
8687 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d));
8688 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extra0_q4_0->d));
8689 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_image1d));
8690 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extra1->offset));
8691 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extrad->data_device));
8692 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extrad->offset));
8693 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne00));
8694 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne01));
8695 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne02));
8696 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne10));
8697 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne12));
8698 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne0));
8699 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne1));
8700 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r2));
8701 CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r3));
8702 } else {
8703 region.origin = extrad->offset; // Specify the starting offset (in bytes)
8704 region.size = M * N * sizeof(float); // Specify the size of the sub-buffer
                C_d = clCreateSubBuffer(extrad->data_device, CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
8706 CL_CHECK(status);
8707
8708 int padded_N = ne1 + padding;
8709
                CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q)); // A_q_d
                CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d)); // A_s_d
                CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &B_image1d));      // B_d
                CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &C_d));            // C_d
                CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),    &ne01));           // M
                CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int),    &padded_N));       // N with padding
                CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),    &ne00));           // K
                CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int),    &ne1));            // N without padding
8718 }
8719 // <--------------------------------------------> //
8720
8721 // choose workgroup size
8722 // <--------------------------------------------> //
8723 size_t global_work_size[3] = {
8724 64, static_cast<size_t>((M+63)/64), static_cast<size_t>((N+31)/32)};
8725 size_t local_work_size[3] = {64, 2, 4};
8726
8727 global_work_size[0] = (size_t)(ceil((float)ne1/8));
8728 global_work_size[1] = (size_t)(ne01/4);
8729 global_work_size[2] = (size_t)(1);
8730
        local_work_size[0] = (size_t)(1); // 4x32 for FP32
        local_work_size[1] = (size_t)(128);
        local_work_size[2] = (size_t)(1);

        // WGS tuning
8736 if (ne0 == 4096 && ne1 == 128 && ne10 == 4096) {
8737 local_work_size[0] = 1;
8738 local_work_size[1] = 128;
8739 } else if (ne0 == 11008 && ne1 == 128 && ne10 == 4096) {
8740 local_work_size[0] = 2;
8741 local_work_size[1] = 64;
8742 } else if (ne0 == 4096 && ne1 == 128 && ne10 == 11008) {
8743 local_work_size[0] = 2;
8744 local_work_size[1] = 64;
8745 } else if (ne0 == 32000 && ne1 == 128 && ne10 == 4096) {
8746 local_work_size[0] = 2;
8747 local_work_size[1] = 64;
8748 }
8749
8750 if (N == 1) {
8751 size_t wavesize = backend_ctx->adreno_wave_size;
8752 local_work_size[0] = wavesize; // localsize
8753 local_work_size[1] = 4; // reduce factor
8754 local_work_size[2] = 1;
8755
8756 global_work_size[0] = (((M / 2) + wavesize - 1) / wavesize) * wavesize;
8757 global_work_size[1] = 4; // reduce factor
8758 global_work_size[2] = 1;
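            // dim 0 launches M / 2 work-items rounded up to a whole wave, which
            // suggests each work-item produces two output rows; dim 1 is the 4-way reduce.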
8759 }
8760 // <--------------------------------------------> //
8761
8762 // enqueue kernel with profiling
8763 // <--------------------------------------------> //
8764 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
8765 // <--------------------------------------------> //
8766
8767 // deallocate sub buffers and images
8768 // <--------------------------------------------> //
8769 CL_CHECK(clReleaseMemObject(A_image1d));
8770 CL_CHECK(clReleaseMemObject(B_sub_buffer));
8771 CL_CHECK(clReleaseMemObject(B_image1d));
8772
8773 if (N != 1) {
8774 CL_CHECK(clReleaseMemObject(B_d));
8775 CL_CHECK(clReleaseMemObject(B_d_input_image));
8776 CL_CHECK(clReleaseMemObject(C_d));
8777 }
8778 // <--------------------------------------------> //
8779
8780 return;
8781 }
8782 } // if (ne01 && ne1)
8783#endif // GGML_OPENCL_USE_ADRENO_KERNELS
8784
8785 // GEMM using local memory
8786 // Current BK = 16, so ne00 % 16 == 0
8787 if (src1t == GGML_TYPE_F32 &&
8788 ne00 % 16 == 0 &&
8789 ne11 > 1) {
8790 switch(src0t) {
8791 case GGML_TYPE_F32: {
8792 kernel = backend_ctx->kernel_mul_mm_f32_f32_l4_lm;
8793 nth0 = 128; // calculated as (BM*BN)/(TM*TN)
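                // with the 64x64 block tile used below (BM = BN = 64), 128 threads
                // implies a per-thread tile of TM * TN = 32 elements.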
8794
8795 int batch_stride_a = ne00*ne01;
8796 int batch_stride_b = ne10*ne11;
8797 int batch_stride_d = ne0*ne1;
8798
8799 cl_mem mem_src0 = extra0->data_device;
8800 cl_mem mem_src1 = extra1->data_device;
8801
8802 cl_ulong nb00_cont = nb00;
8803 cl_ulong nb01_cont = nb01;
8804 cl_ulong nb02_cont = nb02;
8805 cl_ulong nb03_cont = nb03;
8806
8807 cl_ulong nb10_cont = nb10;
8808 cl_ulong nb11_cont = nb11;
8809 cl_ulong nb12_cont = nb12;
8810 cl_ulong nb13_cont = nb13;
8811
8812 cl_ulong offset0_cont = offset0;
8813 cl_ulong offset1_cont = offset1;
8814
8815 if (!ggml_is_contiguous(src0)) {
8816 backend_ctx->prealloc_src0.allocate(backend_ctx->context, ggml_nbytes(src0));
8817 ggml_cl_copy_to_contiguous(backend, src0, backend_ctx->prealloc_src0.buffer,
8818 nb00_cont, nb01_cont, nb02_cont, nb03_cont);
8819 mem_src0 = backend_ctx->prealloc_src0.buffer;
8820 offset0_cont = 0;
8821 }
8822
8823 if (!ggml_is_contiguous(src1)) {
8824 backend_ctx->prealloc_src1.allocate(backend_ctx->context, ggml_nbytes(src1));
8825 ggml_cl_copy_to_contiguous(backend, src1, backend_ctx->prealloc_src1.buffer,
8826 nb10_cont, nb11_cont, nb12_cont, nb13_cont);
8827 mem_src1 = backend_ctx->prealloc_src1.buffer;
8828 offset1_cont = 0;
8829 }
8830
8831 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_src0));
8832 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_cont));
8833 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_src1));
8834 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1_cont));
8835 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
8836 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
8837 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
8838 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
8839 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
8840 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11));
8841 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
8842 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10)); // stride_a
8843 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10)); // stride_b
8844 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne01)); // stride_d
8845 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &batch_stride_a));
8846 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &batch_stride_b));
8847 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_d));
8848 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
8849 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
8850
8851 // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
8852 size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
8853 size_t local_work_size[] = {(size_t)nth0, 1, 1};
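                // e.g. ne01 = 4096, ne11 = 128 launches CEIL_DIV(4096, 64) = 64 workgroups
                // of 128 threads on dim 0 and CEIL_DIV(128, 64) = 2 on dim 1.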
8854
8855 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
8856 return;
8857 }
8858 case GGML_TYPE_F16: {
8859 kernel = backend_ctx->kernel_mul_mm_f16_f32_l4_lm;
8860 nth0 = 128; // calculated as (BM*BN)/(TM*TN)
8861
8862 int batch_stride_a = ne00*ne01;
8863 int batch_stride_b = ne10*ne11;
8864 int batch_stride_d = ne0*ne1;
8865
8866 cl_mem mem_src0 = extra0->data_device;
8867 cl_mem mem_src1 = extra1->data_device;
8868
8869 cl_ulong nb00_cont = nb00;
8870 cl_ulong nb01_cont = nb01;
8871 cl_ulong nb02_cont = nb02;
8872 cl_ulong nb03_cont = nb03;
8873
8874 cl_ulong nb10_cont = nb10;
8875 cl_ulong nb11_cont = nb11;
8876 cl_ulong nb12_cont = nb12;
8877 cl_ulong nb13_cont = nb13;
8878
8879 cl_ulong offset0_cont = offset0;
8880 cl_ulong offset1_cont = offset1;
8881
8882 if (!ggml_is_contiguous(src0)) {
8883 backend_ctx->prealloc_src0.allocate(backend_ctx->context, ggml_nbytes(src0));
8884 ggml_cl_copy_to_contiguous(backend, src0, backend_ctx->prealloc_src0.buffer,
8885 nb00_cont, nb01_cont, nb02_cont, nb03_cont);
8886 mem_src0 = backend_ctx->prealloc_src0.buffer;
8887 offset0_cont = 0;
8888 }
8889
8890 if (!ggml_is_contiguous(src1)) {
8891 backend_ctx->prealloc_src1.allocate(backend_ctx->context, ggml_nbytes(src1));
8892 ggml_cl_copy_to_contiguous(backend, src1, backend_ctx->prealloc_src1.buffer,
8893 nb10_cont, nb11_cont, nb12_cont, nb13_cont);
8894 mem_src1 = backend_ctx->prealloc_src1.buffer;
8895 offset1_cont = 0;
8896 }
8897
8898 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_src0));
8899 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_cont));
8900 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_src1));
8901 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1_cont));
8902 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
8903 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
8904 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
8905 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
8906 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
8907 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11));
8908 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
8909 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10)); // stride_a
8910 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10)); // stride_b
8911 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne01)); // stride_d
8912 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &batch_stride_a));
8913 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &batch_stride_b));
8914 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_d));
8915 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
8916 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
8917
8918 // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
8919 size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
8920 size_t local_work_size[] = {(size_t)nth0, 1, 1};
8921
8922 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
8923 return;
8924 }
8925 case GGML_TYPE_Q8_0: {
8926 if (ne11 < 32) {
8927 break;
8928 }
8929 if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) {
8930 break;
8931 }
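                // no strided-copy fallback here: small batches (ne11 < 32) and
                // non-contiguous inputs fall through to the generic kernels below.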
8932
8933 kernel = backend_ctx->kernel_mul_mm_q8_0_f32_l4_lm;
8934 nth0 = 128; // calculated as (BM*BN)/(TM*TN)
8935
8936 int batch_stride_a = ne00*ne01;
8937 int batch_stride_b = ne10*ne11;
8938 int batch_stride_d = ne0*ne1;
8939
8940 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q));
8941 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d));
8942 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
8943 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
8944 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
8945 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
8946 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
8947 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
8948 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
8949 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11));
8950 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
8951 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10)); // stride_a
8952 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10)); // stride_b
8953 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne01)); // stride_d
8954 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &batch_stride_a));
8955 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &batch_stride_b));
8956 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_d));
8957 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
8958 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
8959
8960 // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
8961 size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
8962 size_t local_work_size[] = {(size_t)nth0, 1, 1};
8963
8964 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
8965 return;
8966 }
8967 case GGML_TYPE_Q6_K: {
8968 if (ne11 < 32) {
8969 break;
8970 }
8971 if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) {
8972 break;
8973 }
8974
8975 kernel = backend_ctx->kernel_mul_mm_q6_k_f32_l4_lm;
8976 nth0 = 128; // calculated as (BM*BN)/(TM*TN)
8977
8978 int batch_stride_a = ne00*ne01;
8979 int batch_stride_b = ne10*ne11;
8980 int batch_stride_d = ne0*ne1;
8981
8982 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q6_K->ql));
8983 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q6_K->qh));
8984 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q6_K->s));
8985 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q6_K->d));
8986 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra1->data_device));
8987 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset1));
8988 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
8989 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
8990 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
8991 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
8992 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
8993 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11));
8994 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
8995 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10)); // stride_a
8996 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10)); // stride_b
8997 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne01)); // stride_d
8998 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_a));
8999 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &batch_stride_b));
9000 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &batch_stride_d));
9001 CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &r2));
9002 CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &r3));
9003
9004 // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
9005 size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
9006 size_t local_work_size[] = {(size_t)nth0, 1, 1};
9007
9008 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
9009 return;
9010 }
9011 default:
9012 break;
9013 }
9014 }
9015
9016 if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32 &&
9017 src0->ne[1] > 32 && // M > 32
9018 src1->ne[1] > 32 && // N > 32
9019 src0->ne[0] > 32 && // K > 32
9020 src0->ne[2] == 1 && src0->ne[3] == 1 &&
9021 src1->ne[2] == 1 && src1->ne[3] == 1 &&
9022 ggml_is_contiguous(src0) && ggml_is_contiguous(src1) &&
9023 backend_ctx->kernel_mul_mat_f16_f32_tiled != NULL) {
9024 ggml_cl_mul_mat_f16_f32_tiled(backend, src0, src1, dst);
9025 return;
9026 }
9027
9028 if (!ggml_is_transposed(src0) &&
9029 !ggml_is_transposed(src1) &&
9030 src1t == GGML_TYPE_F32 &&
9031 ne00%32 == 0 &&
9032 ne11 > 2) {
9033#ifdef GGML_OPENCL_SOA_Q
9034 // Set up kernel.
9035 switch(src0t) {
9036 case GGML_TYPE_Q4_0:
9037 // This should have been satisfied.
9038 GGML_ASSERT(ne11 == ne1);
9039 GGML_ASSERT(ne01 == ne0);
9040
9041 if (backend_ctx->gpu_family == INTEL) {
9042 nth0 = 16;
9043 nth1 = 1;
9044
9045 kernel = backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat;
9046 } else if (backend_ctx->gpu_family == ADRENO) {
9047 nth0 = 64;
9048 nth1 = 1;
9049
9050 kernel = backend_ctx->kernel_mul_mat_q4_0_f32_1d_8x_flat;
9051 } else {
9052 GGML_ASSERT(false && "TODO: Unknown GPU");
9053 }
9054
9055 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q));
9056 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d));
9057 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
9058 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
9059 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
9060 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
9061 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
9062 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
9063 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
9064 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10));
9065 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
9066 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0));
9067 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1));
9068 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2));
9069 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
9070 break;
9071 default:
9072 break;
9073 }
9074
9075 // Launch kernel.
9076 if (src0t == GGML_TYPE_Q4_0) {
9077 size_t global_work_size[] = {(size_t)(ne01 + 7)/8*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
9078 size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
9079
9080 if (backend_ctx->gpu_family == INTEL) {
9081 // Set global size for Intel. It uses 16x output values.
9082 global_work_size[0] = (size_t)(ne01 + 15)/16*nth0;
9083 global_work_size[1] = (size_t)ne11*nth1;
9084 global_work_size[2] = (size_t)ne12*ne13;
9085 }
9086
9087 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
9088 return;
9089 }
9090#else // GGML_OPENCL_SOA_Q
9091 // TODO: add block_q4_0 variant.
9092#endif // GGML_OPENCL_SOA_Q
9093 }
9094
9095 // use custom matrix x vector kernel
9096 switch (src0t) {
9097 case GGML_TYPE_F32:
9098 //GGML_ASSERT(ne02 == ne12);
9099 GGML_ASSERT(src1t == GGML_TYPE_F32);
9100 kernel = backend_ctx->kernel_mul_mat_f32_f32;
9101 nrows = 4;
9102
9103 if (backend_ctx->gpu_family == INTEL) {
9104 nth0 = 32;
9105 nth1 = 1;
9106 } else if (backend_ctx->gpu_family == ADRENO) {
9107 nth0 = 64;
9108 nth1 = 1;
9109 } else {
9110 GGML_ASSERT(false && "TODO: Unknown GPU");
9111 }
9112
9113 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
9114 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
9115 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
9116 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
9117 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
9118 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
9119 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
9120 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
9121 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
9122 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb00));
9123 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
9124 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
9125 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb03));
9126 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10));
9127 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11));
9128 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12));
9129 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb10));
9130 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11));
9131 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
9132 CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
9133 CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne0));
9134 CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne1));
9135 CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &r2));
9136 CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r3));
9137 break;
9138 case GGML_TYPE_F16:
9139 //GGML_ASSERT(ne02 == ne12);
9140 if (backend_ctx->gpu_family == INTEL) {
9141 nth0 = 32;
9142 nth1 = 1;
9143 } else if (backend_ctx->gpu_family == ADRENO) {
9144 nth0 = 64;
9145 nth1 = 1;
9146 } else {
9147 GGML_ASSERT(false && "TODO: Unknown GPU");
9148 }
9149
9150 if (src1t == GGML_TYPE_F32) {
9151 if (ne11 * ne12 < 4) {
9152 kernel = backend_ctx->kernel_mul_mat_f16_f32_1row;
9153 } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
9154 kernel = backend_ctx->kernel_mul_mat_f16_f32_l4;
9155 nrows = ne11;
9156 } else {
9157 kernel = backend_ctx->kernel_mul_mat_f16_f32;
9158 nrows = 4;
9159 }
9160 } else {
9161 kernel = backend_ctx->kernel_mul_mat_f16_f16;
9162 nrows = 4;
9163 }
9164
9165 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
9166 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
9167 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
9168 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
9169 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
9170 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
9171 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
9172 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
9173 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
9174 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb00));
9175 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
9176 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
9177 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb03));
9178 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10));
9179 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11));
9180 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12));
9181 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb10));
9182 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11));
9183 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
9184 CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
9185 CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne0));
9186 CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne1));
9187 CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &r2));
9188 CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r3));
9189 break;
9190 case GGML_TYPE_Q4_0:
9191 // This should have been satisfied.
9192 GGML_ASSERT(ne11 == ne1);
9193 GGML_ASSERT(ne01 == ne0);
9194
9195#ifdef GGML_OPENCL_SOA_Q
9196 if (backend_ctx->gpu_family == INTEL) {
9197 nth0 = 16;
9198 nth1 = 1;
9199
9200 kernel = backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat;
9201 ndst = 8;
9202 } else if (backend_ctx->gpu_family == ADRENO) {
9203 nth0 = 64;
9204 nth1 = 1;
9205
9206 kernel = backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat;
                ndst = 8;
9208 } else {
9209 GGML_ASSERT(false && "TODO: Unknown GPU");
9210 }
9211
9212 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q));
9213 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d));
9214 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
9215 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
9216 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
9217 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
9218 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
9219 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
9220 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
9221 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10));
9222 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
9223 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0));
9224 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1));
9225 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2));
9226 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
9227#else // GGML_OPENCL_SOA_Q
9228 if (backend_ctx->gpu_family == INTEL) {
9229 // Use 1D local size. Each workgroup is a SIMD group. Each SIMD
9230 // group produces N_DST (4 for Q4_0 kernel) values in the result.
9231 // The number of workgroups on dim 0 (the leading dimension) is
9232 // the nearest multiple of 4 that covers ne0 (equals ne01).
9233 nth0 = 16;
9234 nth1 = 1;
9235
9236 kernel = backend_ctx->kernel_mul_mat_q4_0_f32;
9237 ndst = 4;
9238 } else if (backend_ctx->gpu_family == ADRENO) {
9239 nth0 = 64;
9240 nth1 = 1;
9241
9242 kernel = backend_ctx->kernel_mul_mat_q4_0_f32_v;
9243 ndst = 4;
9244 } else {
9245 GGML_ASSERT(false && "TODO: Unknown GPU");
9246 }
9247
9248 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
9249 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
9250 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
9251 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
9252 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
9253 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
9254 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
9255 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
9256 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
9257 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10));
9258 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
9259 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0));
9260 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1));
9261 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2));
9262 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
9263#endif // GGML_OPENCL_SOA_Q
9264 break;
9265 case GGML_TYPE_Q4_1:
9266 case GGML_TYPE_Q8_0: {
9267#ifdef GGML_OPENCL_SOA_Q
9268 kernel = backend_ctx->kernel_mul_mv_q8_0_f32_flat;
9269
9270 // nth0 - subgroup size
9271 // nth1 - number of subgroups per workgroup
9272 // ndst - number of output values per workgroup = output per subgroup * number of subgroups
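            // e.g. on Adreno this gives 64-wide subgroups, 2 subgroups per workgroup,
            // and ndst = 2*4 = 8 output values per workgroup.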
9273 if (backend_ctx->gpu_family == INTEL) {
9274 nth0 = 16;
9275 nth1 = 2;
9276 ndst = nth1*4;
9277 } else if (backend_ctx->gpu_family == ADRENO) {
9278 nth0 = 64;
9279 nth1 = 2;
9280 ndst = nth1*4;
9281 } else {
9282 GGML_ASSERT(false && "TODO: Unknown GPU");
9283 }
9284
9285 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q));
9286 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d));
9287 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
9288 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
9289 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
9290 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
9291 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
9292 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
9293 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
9294 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
9295 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
9296 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12));
9297 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
9298 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
9299 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
9300 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne0));
9301 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne1));
9302 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
9303 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
9304#else
9305 kernel = backend_ctx->kernel_mul_mv_q8_0_f32;
9306
9307 // nth0 - subgroup size
9308 // nth1 - number of subgroups per workgroup
9309 // ndst - number of output values per workgroup = output per subgroup * number of subgroups
9310 if (backend_ctx->gpu_family == INTEL) {
9311 nth0 = 16;
9312 nth1 = 2;
9313 ndst = nth1*4;
9314 } else if (backend_ctx->gpu_family == ADRENO) {
9315 nth0 = 64;
9316 nth1 = 2;
9317 ndst = nth1*4;
9318 } else {
9319 GGML_ASSERT(false && "TODO: Unknown GPU");
9320 }
9321
9322 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
9323 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
9324 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
9325 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
9326 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
9327 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
9328 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
9329 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
9330 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
9331 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
9332 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
9333 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12));
9334 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
9335 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
9336 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
9337 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne0));
9338 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne1));
9339 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
9340 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
9341#endif // GGML_OPENCL_SOA_Q
9342 break;
9343 }
9344 case GGML_TYPE_Q2_K:
9345 case GGML_TYPE_Q3_K:
9346 case GGML_TYPE_Q4_K: {
9347 kernel = backend_ctx->kernel_mul_mv_q4_K_f32;
9348
9349 if (backend_ctx->gpu_family == INTEL) {
9350 nth0 = 16;
9351 nth1 = 1;
9352 ndst = 4;
9353 } else if (backend_ctx->gpu_family == ADRENO) {
9354 nth0 = 64;
9355 nth1 = 1;
9356 ndst = 4;
9357 } else {
9358 GGML_ASSERT(false && "TODO: Unknown GPU");
9359 }
9360
            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
9367 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
9368 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
9369 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
9370 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
9371 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
9372 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12));
9373 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
9374 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
9375 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
9376 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne0));
9377 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne1));
9378 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
9379 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
9380 break;
9381 }
9382 case GGML_TYPE_Q5_K:
9383 case GGML_TYPE_Q6_K:
9384#ifdef GGML_OPENCL_SOA_Q
9385 kernel = backend_ctx->kernel_mul_mv_q6_K_f32_flat;
9386
9387 if (backend_ctx->gpu_family == INTEL) {
9388 nth0 = 16;
9389 nth1 = 2;
9390 ndst = 4;
9391 } else if (backend_ctx->gpu_family == ADRENO) {
9392 nth0 = 64;
9393 nth1 = 2;
9394 ndst = 4;
9395 } else {
9396 GGML_ASSERT(false && "TODO: Unknown GPU");
9397 }
9398
9399 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q6_K->ql));
9400 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q6_K->qh));
9401 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q6_K->s));
9402 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q6_K->d));
9403 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra1->data_device));
9404 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset1));
9405 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
9406 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
9407 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
9408 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
9409 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
9410 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10));
9411 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
9412 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne0));
9413 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne1));
9414 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &r2));
9415 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &r3));
9416#else
9417 kernel = backend_ctx->kernel_mul_mv_q6_K_f32;
9418
9419 if (backend_ctx->gpu_family == INTEL) {
9420 nth0 = 16;
9421 nth1 = 2;
9422 ndst = 1;
9423 } else if (backend_ctx->gpu_family == ADRENO) {
9424 nth0 = 64;
9425 nth1 = 2;
9426 ndst = 1;
9427 } else {
9428 GGML_ASSERT(false && "TODO: Unknown GPU");
9429 }
9430
9431 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
9432 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
9433 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
9434 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
9435 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
9436 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
9437 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
9438 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
9439 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
9440 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10));
9441 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
9442 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0));
9443 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1));
9444 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2));
9445 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
9446#endif // GGML_OPENCL_SOA_Q
9447 break;
9448 case GGML_TYPE_MXFP4: {
9449#ifdef GGML_OPENCL_SOA_Q
9450 kernel = backend_ctx->kernel_mul_mv_mxfp4_f32_flat;
9451
9452 cl_mem q;
9453 if (backend_ctx->gpu_family == INTEL) {
9454 nth0 = 16;
9455 nth1 = 2;
9456 ndst = nth1*2;
9457
9458 q = extra0_mxfp4->q;
9459 } else if (backend_ctx->gpu_family == ADRENO) {
9460 nth0 = 64;
9461 nth1 = 2;
9462 ndst = nth1*2;
9463
9464 q = extra0_mxfp4->q_img;
9465 } else {
9466 GGML_ASSERT(false && "TODO: Unknown GPU");
9467 }
9468
9469 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q));
9470 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_mxfp4->e));
9471 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
9472 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
9473 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
9474 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
9475 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
9476 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
9477 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
9478 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
9479 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
9480 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb11));
9481 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb12));
9482 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb13));
9483 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne0));
9484 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne1));
9485 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &r2));
9486 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r3));
9487#else
9488 kernel = backend_ctx->kernel_mul_mv_mxfp4_f32;
9489
9490 if (backend_ctx->gpu_family == INTEL) {
9491 nth0 = 16;
9492 nth1 = 2;
9493 ndst = nth1*2;
9494 } else if (backend_ctx->gpu_family == ADRENO) {
9495 nth0 = 64;
9496 nth1 = 2;
9497 ndst = nth1*2;
9498 } else {
9499 GGML_ASSERT(false && "TODO: Unknown GPU");
9500 }
9501
9502 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
9503 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
9504 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
9505 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
9506 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
9507 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
9508 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
9509 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
9510 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
9511 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
9512 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
9513 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb11));
9514 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb12));
9515 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb13));
9516 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne0));
9517 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne1));
9518 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &r2));
9519 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r3));
            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(float)*nth0, nullptr)); // nth0 floats of local memory
9521#endif
9522 break;
9523 }
9524 default:
9525 GGML_ASSERT(false && "not implemented");
9526 }
9527
9528 if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_MXFP4 ||
9529 src0t == GGML_TYPE_Q4_1 ||
9530 src0t == GGML_TYPE_Q8_0 ||
9531 src0t == GGML_TYPE_Q2_K) {
        // Each SIMD group produces N_DST values in the result. Assuming each
        // workgroup has N_SIMDGROUP SIMD groups, each workgroup produces
        // N_DST*N_SIMDGROUP values in the result. Hence, the grid size (number
        // of workgroups) is rounded up to the nearest multiple of
        // N_DST*N_SIMDGROUP that covers the dimension. Below, ndst plays the
        // role of N_DST*N_SIMDGROUP (see the kernel for Q4_0 matmul).
9538 size_t global_work_size[] = {(size_t)(ne01 + ndst-1)/ndst*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
9539 size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
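        // e.g. Q4_0 on Adreno (ndst = 8, nth0 = 64): dim 0 is ceil(ne01 / 8) * 64.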
9540
9541 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
9542 } else if (src0t == GGML_TYPE_Q4_K) {
9543 size_t global_work_size[] = {(size_t)(ne01+ndst*nth1-1)/(ndst*nth1)*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
9544 size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
9545
9546 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
9547 } else if (src0t == GGML_TYPE_Q3_K) {
9548 GGML_ASSERT(false && "not implemented");
9549 } else if (src0t == GGML_TYPE_Q5_K) {
9550 GGML_ASSERT(false && "not implemented");
9551 } else if (src0t == GGML_TYPE_Q6_K) {
9552 size_t global_work_size[] = {(size_t)(ne01+ndst*nth1-1)/(ndst*nth1)*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
9553 size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
9554
9555 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
9556 } else {
9557 int64_t ny = (ne11 + nrows - 1)/nrows;
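        // ny batches src1 rows so each workgroup covers nrows of them (the f32/f16
        // kernels above set nrows = 4, or ne11 for the l4 variant).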
9558
9559 size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
9560 size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
9561
9562 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
9563 }
9564}
9565
9566static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
9567 GGML_ASSERT(src0);
9568 GGML_ASSERT(src0->extra);
9569 GGML_ASSERT(src1);
9570 GGML_ASSERT(src1->extra);
9571 GGML_ASSERT(dst);
9572 GGML_ASSERT(dst->extra);
9573
9574 const ggml_tensor * src2 = dst->src[2];
9575 GGML_ASSERT(src2);
9576 GGML_ASSERT(src2->extra);
9577
9578 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
9579
9580 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
9581 ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
9582 ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
9583 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
9584
9585 cl_ulong offset0 = extra0->offset + src0->view_offs;
9586 cl_ulong offset1 = extra1->offset + src1->view_offs;
9587 cl_ulong offset2 = extra2->offset + src2->view_offs;
9588 cl_ulong offsetd = extrad->offset + dst->view_offs;
9589
9590 GGML_UNUSED(offset0);
9591
9592#ifdef GGML_OPENCL_SOA_Q
9593 ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
9594 ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra;
9595 ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
9596#endif
9597
9598 const int ne00 = src0->ne[0];
9599 const int ne01 = src0->ne[1];
9600 const int ne02 = src0->ne[2];
9601 const int ne03 = src0->ne[3];
9602
9603 const cl_ulong nb00 = src0->nb[0];
9604 const cl_ulong nb01 = src0->nb[1];
9605 const cl_ulong nb02 = src0->nb[2];
9606 const cl_ulong nb03 = src0->nb[3];
9607
9608 const int ne10 = src1->ne[0];
9609 const int ne11 = src1->ne[1];
9610 const int ne12 = src1->ne[2];
9611 const int ne13 = src1->ne[3];
9612
9613 const cl_ulong nb11 = src1->nb[1];
9614 const cl_ulong nb12 = src1->nb[2];
9615 const cl_ulong nb13 = src1->nb[3];
9616
9617 const int ne20 = src2->ne[0];
9618 const int ne21 = src2->ne[1];
9619
9620 const cl_ulong nb21 = src2->nb[1];
9621 const cl_ulong nb20 = src2->nb[0];
9622
9623 UNUSED(nb20);
9624
9625 const int ne0 = dst->ne[0];
9626 const int ne1 = dst->ne[1];
9627
9628 const int r2 = ne12/ne02;
9629 const int r3 = ne13/ne03;
9630 const int dst_rows = ne20*ne21; // ne20 = n_used_experts, ne21 = n_rows
9631
9632 GGML_ASSERT(ne00 == ne10);
9633
9634 int sgs = 32; // subgroup size
9635 int nsg = 1; // number of subgroups
9636 int nrows = 1; // number of row in src1
9637 int ndst = 4; // number of values produced by each subgroup
9638
9639 cl_kernel kernel;
9640
9641 // subgroup mat vec
9642 switch (src0->type) {
9643 case GGML_TYPE_Q4_0: {
9644 kernel = backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat;
9645
9646 if (backend_ctx->gpu_family == INTEL) {
9647 sgs = 16;
9648 nsg = 1;
9649 ndst = 8;
9650 } else if (backend_ctx->gpu_family == ADRENO) {
9651 sgs = 64;
9652 nsg = 1;
9653 ndst = 8;
9654 } else {
9655 GGML_ASSERT(false && "TODO: Unknown GPU");
9656 }
9657
9658 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q));
9659 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d));
9660 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
9661 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
9662 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
9663 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
9664 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
9665 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
9666 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
9667 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
9668 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
9669 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb00));
9670 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
9671 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10));
9672 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11));
9673 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12));
9674 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb11));
9675 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb12));
9676 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne20));
9677 CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne21));
9678 CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb21));
9679 CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne0));
9680 CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne1));
9681 CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r2));
9682 CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &r3));
9683
9684 break;
9685 }
9686 case GGML_TYPE_Q8_0: {
9687#ifdef GGML_OPENCL_SOA_Q
9688 kernel = backend_ctx->kernel_mul_mv_id_q8_0_f32_flat;
9689
9690 if (backend_ctx->gpu_family == INTEL) {
9691 sgs = 16;
9692 nsg = 2;
9693 ndst = 4;
9694 } else if (backend_ctx->gpu_family == ADRENO) {
9695 sgs = 64;
9696 nsg = 2;
9697 ndst = 4;
9698 } else {
9699 GGML_ASSERT(false && "TODO: Unknown GPU");
9700 }
9701
9702 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q));
9703 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d));
9704 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
9705 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
9706 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
9707 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
9708 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
9709 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
9710 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
9711 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
9712 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
9713 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
9714 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne11));
9715 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne12));
9716 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11));
9717 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12));
9718 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne20));
9719 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne21));
9720 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb21));
9721 CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne0));
9722 CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne1));
9723#else
9724 kernel = backend_ctx->kernel_mul_mv_id_q8_0_f32;
9725
9726 if (backend_ctx->gpu_family == INTEL) {
9727 sgs = 16;
9728 nsg = 2;
9729 ndst = 4;
9730 } else if (backend_ctx->gpu_family == ADRENO) {
9731 sgs = 64;
9732 nsg = 2;
9733 ndst = 4;
9734 } else {
9735 GGML_ASSERT(false && "TODO: Unknown GPU");
9736 }
9737
9738 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
9739 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
9740 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
9741 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
9742 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
9743 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
9744 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
9745 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
9746 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
9747 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
9748 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
9749 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
9750 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne11));
9751 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne12));
9752 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11));
9753 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12));
9754 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne20));
9755 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne21));
9756 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb21));
9757 CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne0));
9758 CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne1));
9759#endif // GGML_OPENCL_SOA_Q
9760 break;
9761 }
9762 case GGML_TYPE_MXFP4: {
9763#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
9764 if (use_adreno_moe_kernels(backend_ctx, src0)) {
9765 cl_int status;
9766
9767 size_t local_size[3] = {64, 2, 1};
9768 size_t global_size[3] = {64, 2, 1};
9769
9770 cl_mem src1_sub_buffer, buf_src1_image, buf_src2;
9771
9772 int tile_size = 320;
9773 if (ne12 == 1) { // for gemv
9774 kernel = backend_ctx->kernel_gemv_moe_mxfp4_f32;
9775
9776 // create a sub_buffer for src2
9777 cl_buffer_region region;
9778 region.origin = offset2;
9779 region.size = ne20 * ne21 * sizeof(int);
9780 buf_src2 = clCreateSubBuffer(extra2->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
9781 CL_CHECK(status);
9782
9783 // set thread grid
9784 global_size[0] = static_cast<size_t>(ne01);
9785 global_size[1] = 4;
9786 global_size[2] = static_cast<size_t>(ne20);
9787 local_size[1] = 4;
9788 } else { // for gemm
9789 kernel = backend_ctx->kernel_gemm_moe_mxfp4_f32;
9790
9791 // preprocess router table
9792 int num_tiles_per_expert = (ne01 + tile_size - 1) / tile_size;
9793 void * host_src2_reorder = malloc(ne20 * ne21 * 4 * num_tiles_per_expert * sizeof(short));
9794 void * host_src2 = malloc(ne21 * nb21);
9795 CL_CHECK(clEnqueueReadBuffer(backend_ctx->queue, extra2->data_device, CL_TRUE, offset2, ne21 * nb21, host_src2, 0, NULL, NULL));
9796 int total_experts = nb21 / nb20;
9797 int out_idx = 0;
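                // Flatten the (token, slot) -> expert routing into a list of
                // 4-short records: {expert id, src1 row, dst row, tile index}.
                // Records are grouped by expert and then by tile, presumably so
                // that each workgroup of the GEMM kernel works on a contiguous
                // run of rows belonging to a single expert/tile pair.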
9798 for (int i_expert = 0; i_expert < ne02; i_expert++) {
9799 for (int i_tile = 0; i_tile < num_tiles_per_expert; i_tile++) {
9800 for (int j = 0; j < ne21; j++) {
9801 for (int i = 0; i < ne20; i++) {
9802 int expert = ((int *)host_src2)[j * total_experts + i];
9803 if (i_expert == expert) {
9804 ((short *)host_src2_reorder)[out_idx] = static_cast<short>(expert);
9805 ((short *)host_src2_reorder)[out_idx + 1] = static_cast<short>(j * ne11 + (i % ne11));
9806 ((short *)host_src2_reorder)[out_idx + 2] = static_cast<short>(j * ne20 + i);
9807 ((short *)host_src2_reorder)[out_idx + 3] = static_cast<short>(i_tile);
9808 out_idx += 4;
9809 }
9810 }
9811 }
9812 }
9813 }
                buf_src2 = clCreateBuffer(backend_ctx->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, ne20 * ne21 * 4 * num_tiles_per_expert * sizeof(short), host_src2_reorder, &status);
                CL_CHECK(status);

                // CL_MEM_COPY_HOST_PTR copies the data at buffer creation, so
                // the host staging allocations can be freed right away.
                free(host_src2);
                free(host_src2_reorder);
9816
9817 // set thread grid
9818 global_size[0] = static_cast<size_t>(tile_size);
9819 global_size[2] = static_cast<size_t>(ne20 * ne21 * num_tiles_per_expert);
9820 }
9821
9822 // create a sub_buffer for src1
9823 cl_buffer_region region;
9824 region.origin = offset1;
9825 region.size = ne10 * ne11 * ne12 * sizeof(float);
9826 src1_sub_buffer = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status);
9827 CL_CHECK(status);
9828
9829 // create image for src1
9830 cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT};
9831 cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne10 * ne11 * ne12 / 4), 0,0,0,0,0,0,0, {src1_sub_buffer}};
9832 buf_src1_image = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status);
9833 CL_CHECK(status);
9834
9835 // Set kernel args
9836 int arg_idx = 0;
9837 CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_mxfp4->q));
9838 CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_mxfp4->e));
9839 CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src1_image));
9840 CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2));
9841 CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extrad->data_device));
9842 CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_ulong), &offsetd));
9843 CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00));
9844 CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01));
9845 if (ne12 == 1) {
9846 CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne11));
9847 } else {
9848 CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &tile_size));
9849 }
9850
9851 // launch kernel
9852 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst);
9853
9854 // deallocate sub buffers and images
9855 CL_CHECK(clReleaseMemObject(src1_sub_buffer));
9856 CL_CHECK(clReleaseMemObject(buf_src1_image));
9857 CL_CHECK(clReleaseMemObject(buf_src2));
9858 return;
        } // otherwise fall back to the generic kernel below
9860#endif // GGML_OPENCL_USE_ADRENO_KERNELS
9861
9862#ifdef GGML_OPENCL_SOA_Q
9863 kernel = backend_ctx->kernel_mul_mv_id_mxfp4_f32_flat;
9864
9865 cl_mem q;
9866 if (backend_ctx->gpu_family == INTEL) {
9867 sgs = 16;
9868 nsg = 2;
9869 ndst = 2;
9870
9871 q = extra0_mxfp4->q;
9872 } else if (backend_ctx->gpu_family == ADRENO) {
9873 sgs = 64;
9874 nsg = 1;
9875 ndst = 4;
9876
9877 q = extra0_mxfp4->q_img;
9878 } else {
9879 GGML_ASSERT(false && "TODO: Unknown GPU");
9880 }
9881
9882 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q));
9883 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_mxfp4->e));
9884 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
9885 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
9886 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
9887 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
9888 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
9889 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
9890 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
9891 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
9892 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
9893 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
9894 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne11));
9895 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne12));
9896 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11));
9897 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12));
9898 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb13));
9899 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne20));
9900 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne21));
9901 CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb21));
9902 CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne0));
9903 CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne1));
9904 CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &r2));
9905 CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r3));
9906#else // GGML_OPENCL_SOA_Q
9907 kernel = backend_ctx->kernel_mul_mv_id_mxfp4_f32;
9908
9909 if (backend_ctx->gpu_family == INTEL) {
9910 sgs = 16;
9911 nsg = 2;
9912 ndst = 2;
9913 } else if (backend_ctx->gpu_family == ADRENO) {
9914 sgs = 64;
9915 nsg = 2;
9916 ndst = 2;
9917 } else {
9918 GGML_ASSERT(false && "TODO: Unknown GPU");
9919 }
9920
9921 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
9922 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
9923 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
9924 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
9925 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
9926 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
9927 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
9928 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
9929 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
9930 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
9931 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
9932 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
9933 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne11));
9934 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne12));
9935 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11));
9936 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12));
9937 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb13));
9938 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne20));
9939 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne21));
9940 CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb21));
9941 CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne0));
9942 CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne1));
9943 CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &r2));
9944 CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r3));
            CL_CHECK(clSetKernelArg(kernel, 24, sizeof(float)*sgs, nullptr));
9946#endif // GGML_OPENCL_SOA_Q
9947 break;
9948 }
9949 default:
            GGML_ASSERT(false && "not implemented");
9951 }
9952
9953 int _ne1 = 1;
9954 int ne123 = dst_rows;
9955
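    // Launch geometry: each subgroup of sgs work-items produces ndst dst values
    // (rows of src0), and each workgroup holds nsg subgroups. The x-dimension
    // covers the ne01 rows of src0 in chunks of ndst*nsg, and the z-dimension
    // covers one (expert, token) pair per dst row. For example, with
    // ne01 = 4096, ndst = 8, nsg = 1, sgs = 64 the x-size is
    // ceil(4096/8)*64 = 32768 work-items.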
9956 size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123};
9957 size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1};
9958
9959 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
9960}
9961
9962static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
9963 GGML_ASSERT(src0);
9964 GGML_ASSERT(src0->extra);
9965 GGML_ASSERT(dst);
9966 GGML_ASSERT(dst->extra);
9967 GGML_UNUSED(src1);
9968
9969 GGML_ASSERT(ggml_is_contiguous(src0));
9970
9971 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
9972
9973 float scale;
9974 float bias;
9975 memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(float));
9976 memcpy(&bias, ((int32_t *) dst->op_params) + 1, sizeof(float));
9977
9978 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
9979 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
9980
9981 cl_ulong offset0 = extra0->offset + src0->view_offs;
9982 cl_ulong offsetd = extrad->offset + dst->view_offs;
9983
9984 cl_kernel kernel;
9985
9986 int n = ggml_nelements(dst);
9987
9988 if (n % 4 == 0) {
9989 kernel = backend_ctx->kernel_scale_f32_4;
9990 n /= 4;
9991 } else {
9992 kernel = backend_ctx->kernel_scale_f32;
9993 }
9994
9995 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
9996 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
9997 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
9998 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
9999 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float), &scale));
10000 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(float), &bias));
10001
10002 size_t global_work_size[] = {(size_t)n, 1, 1};
10003 size_t local_work_size[] = {64, 1, 1};
10004
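    // OpenCL 1.x requires the global size to be a multiple of the work-group
    // size. When n does not divide evenly and the device lacks non-uniform
    // work-group support, pass NULL so the driver picks a valid size instead.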
10005 size_t * local_work_size_ptr = local_work_size;
10006 if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
10007 local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
10008 }
10009
10010 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
10011}
10012
10013static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
10014 GGML_ASSERT(src0);
10015 GGML_ASSERT(src0->extra);
10016 GGML_ASSERT(src1);
10017 GGML_ASSERT(src1->extra);
10018
10019 // GGML_OP_CPY happens between src0 and src1.
10020 // GGML_OP_DUP and GGML_OP_CONT happen between src0 and dst.
10021 UNUSED(dst);
10022
10023 const int ne00 = src0 ? src0->ne[0] : 0;
10024 const int ne01 = src0 ? src0->ne[1] : 0;
10025 const int ne02 = src0 ? src0->ne[2] : 0;
10026 const int ne03 = src0 ? src0->ne[3] : 0;
10027
10028 const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
10029 const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
10030 const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
10031 const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
10032
10033 const int ne10 = src1 ? src1->ne[0] : 0;
10034 const int ne11 = src1 ? src1->ne[1] : 0;
10035 const int ne12 = src1 ? src1->ne[2] : 0;
10036 const int ne13 = src1 ? src1->ne[3] : 0;
10037
10038 const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
10039 const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
10040 const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
10041 const cl_ulong nb13 = src1 ? src1->nb[3] : 0;
10042
10043 const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
10044 const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
10045
10046 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
10047
10048 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
10049 ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
10050
10051 cl_ulong offset0 = extra0->offset + src0->view_offs;
10052 cl_ulong offset1 = extra1->offset + src1->view_offs;
10053
10054 cl_kernel kernel;
10055
10056 switch (src0t) {
10057 case GGML_TYPE_F32:
10058 switch (src1t) {
10059 case GGML_TYPE_F16:
10060 kernel = backend_ctx->kernel_cpy_f32_f16;
10061 break;
10062 case GGML_TYPE_F32:
10063 kernel = backend_ctx->kernel_cpy_f32_f32;
10064 break;
10065 default:
10066 GGML_ASSERT(false && "not implemented");
10067 }
10068 break;
10069 case GGML_TYPE_F16:
10070 switch (src1t) {
10071 case GGML_TYPE_F16:
10072 kernel = backend_ctx->kernel_cpy_f16_f16;
10073 break;
10074 case GGML_TYPE_F32:
10075 kernel = backend_ctx->kernel_cpy_f16_f32;
10076 break;
10077 default:
10078 GGML_ASSERT(false && "not implemented");
10079 }
10080 break;
10081 default:
10082 GGML_ASSERT(false && "not implemented");
10083 }
10084
10085 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
10086 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
10087 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
10088 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
10089 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
10090 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
10091 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
10092 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
10093 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
10094 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
10095 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
10096 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
10097 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10));
10098 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11));
10099 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12));
10100 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13));
10101 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb10));
10102 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11));
10103 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
10104 CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
10105
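    // One workgroup of nth work-items per src0 row; y and z walk the outer
    // src0 dimensions.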
10106 const int nth = MIN(64, ne00);
10107
10108 size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
10109 size_t local_work_size[] = {(size_t)nth, 1, 1};
10110
10111 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
10112}
10113
10114static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
10115 ggml_cl_cpy(backend, src0, dst, nullptr);
10116 UNUSED(src1);
10117}
10118
10119static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
10120 GGML_ASSERT(src0);
10121 GGML_ASSERT(src0->extra);
10122 GGML_ASSERT(dst);
10123 GGML_ASSERT(dst->extra);
10124
10125 UNUSED(src1);
10126
10127 int n_past = ((int32_t *)(dst->op_params))[0];
10128
10129 const int ne00 = src0 ? src0->ne[0] : 0;
10130 const int ne01 = src0 ? src0->ne[1] : 0;
10131 const int ne02 = src0 ? src0->ne[2] : 0;
10132
10133 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
10134
10135 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
10136 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
10137
10138 cl_ulong offset0 = extra0->offset + src0->view_offs;
10139 cl_ulong offsetd = extrad->offset + dst->view_offs;
10140
10141 cl_kernel kernel;
10142
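    // The vectorized kernel masks 8 elements per work-item over a flattened 1D
    // grid; the scalar fallback keeps a 3D grid with one work-item per element.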
10143 if (ne00%8 == 0) {
10144 kernel = backend_ctx->kernel_diag_mask_inf_8;
10145
10146 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
10147 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
10148 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
10149 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
10150 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
10151 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
10152 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &n_past));
10153
10154 size_t global_work_size[] = {(size_t)ne00*ne01*ne02/8, 1, 1};
10155 size_t local_work_size[] = {64, 1, 1};
10156
10157 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
10158 } else {
10159 kernel = backend_ctx->kernel_diag_mask_inf;
10160
10161 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
10162 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
10163 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
10164 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
10165 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
10166 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
10167 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &n_past));
10168
10169 size_t global_work_size[] = {(size_t)ne00, (size_t)ne01, (size_t)ne02};
10170 size_t local_work_size[] = {64, 1, 1};
10171
10172 size_t * local_work_size_ptr = local_work_size;
10173 if (ne00 % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
10174 local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
10175 }
10176
10177 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
10178 }
10179}
10180
10181static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
10182 GGML_ASSERT(src0);
10183 GGML_ASSERT(src0->extra);
10184 GGML_ASSERT(dst);
10185 GGML_ASSERT(dst->extra);
10186
    // Softmax can fuse the KQ mask and KQ scale, which used to be two separate
    // ops before softmax. It also fuses alibi when max_bias > 0; llama does not
    // use alibi, but some other models do.
    // KQ_mask
    if (src1) {
        GGML_ASSERT(src1->extra);
    }
10195
10196 const ggml_tensor * src2 = dst->src[2];
10197 if (src2) {
10198 GGML_ASSERT(src2->extra);
10199 }
10200
10201 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
10202
10203 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
10204 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
10205
10206 ggml_tensor_extra_cl * extra1 = src1 ? (ggml_tensor_extra_cl *)src1->extra : nullptr;
10207 ggml_tensor_extra_cl * extra2 = src2 ? (ggml_tensor_extra_cl *)src2->extra : nullptr;
10208
10209 cl_ulong offset0 = extra0->offset + src0->view_offs;
10210 cl_ulong offsetd = extrad->offset + dst->view_offs;
10211
10212 cl_ulong offset1 = extra1 ? extra1->offset + src1->view_offs : offset0;
10213 cl_ulong offset2 = extra2 ? extra2->offset + src2->view_offs : offset0;
10214
10215 const int ne00 = src0->ne[0];
10216 const int ne01 = src0->ne[1];
10217 const int ne02 = src0->ne[2];
10218 const int ne03 = src0->ne[3];
10219
    const cl_ulong nb01 = src0->nb[1];
    const cl_ulong nb02 = src0->nb[2];
    const cl_ulong nb03 = src0->nb[3];

    const int ne12 = src1 ? src1->ne[2] : 0;
    const int ne13 = src1 ? src1->ne[3] : 0;

    const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
    const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
    const cl_ulong nb13 = src1 ? src1->nb[3] : 0;

    const cl_ulong nb1 = dst->nb[1];
    const cl_ulong nb2 = dst->nb[2];
    const cl_ulong nb3 = dst->nb[3];
10234
10235 float scale, max_bias;
10236 memcpy(&scale, dst->op_params + 0, sizeof(float));
10237 memcpy(&max_bias, dst->op_params + 1, sizeof(float));
10238
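    // ALiBi slope bases: heads below n_head_log2 use powers of m0, the rest use
    // powers of m1. The kernel is expected to derive the per-head slope as
    //   h <  n_head_log2 : m0^(h + 1)
    //   h >= n_head_log2 : m1^(2*(h - n_head_log2) + 1)
    // A max_bias of 0 makes both bases 1, which disables alibi.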
10239 const int n_head = src0->ne[2];
10240 const int n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
10241
10242 const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
10243 const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
10244
10245 const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
10246
    // Local size must be the wave size. Each workgroup is one wave, working on
    // a row, where a row corresponds to the leading dimension.
    int nth = MIN(32, ne00);

    if (backend_ctx->gpu_family == INTEL) {
        // Same as the initial value: at most a 32-wide wave per row.
        nth = MIN(32, ne00);
    } else if (backend_ctx->gpu_family == ADRENO) {
        nth = 64;
    } else {
        GGML_ASSERT(false && "TODO: Unknown GPU");
    }
10260
10261 cl_kernel kernel;
10262
10263 if (ne00%4 == 0) {
10264 if (use_f16) {
10265 kernel = backend_ctx->kernel_soft_max_4_f16;
10266 } else {
10267 kernel = backend_ctx->kernel_soft_max_4;
10268 }
10269 } else {
10270 if (use_f16) {
10271 kernel = backend_ctx->kernel_soft_max_f16;
10272 } else {
10273 kernel = backend_ctx->kernel_soft_max;
10274 }
10275 }
10276
10277 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
10278 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
10279 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), extra1 ? &extra1->data_device : &extra0->data_device));
10280 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
10281 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), extra2 ? &extra2->data_device : &extra0->data_device));
10282 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
10283 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
10284 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
10285 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
10286 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
10287 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
10288 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
10289 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
10290 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13));
10291 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11));
10292 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12));
10293 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb13));
10294 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb1));
10295 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb2));
10296 CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb3));
10297 CL_CHECK(clSetKernelArg(kernel, 20, sizeof(float), &scale));
10298 CL_CHECK(clSetKernelArg(kernel, 21, sizeof(float), &max_bias));
10299 CL_CHECK(clSetKernelArg(kernel, 22, sizeof(float), &m0));
10300 CL_CHECK(clSetKernelArg(kernel, 23, sizeof(float), &m1));
10301 CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &n_head_log2));
10302
10303 size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
10304 size_t local_work_size[] = {(size_t)nth, 1, 1};
10305
10306 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
10307}
10308
10309static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
10310 GGML_ASSERT(src0);
10311 GGML_ASSERT(src0->extra);
10312 GGML_ASSERT(src1);
10313 GGML_ASSERT(src1->extra);
10314 GGML_ASSERT(dst);
10315 GGML_ASSERT(dst->extra);
10316
10317 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
10318
10319 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
10320 ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
10321 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
10322
10323 cl_ulong offset0 = extra0->offset + src0->view_offs;
10324 cl_ulong offset1 = extra1->offset + src1->view_offs;
10325 cl_ulong offsetd = extrad->offset + dst->view_offs;
10326
10327 ggml_tensor * src2 = dst->src[2];
10328 ggml_tensor_extra_cl * extra2 = src2 ? (ggml_tensor_extra_cl *)src2->extra : nullptr;
10329
10330 cl_ulong offset2 = extra2 ? extra2->offset + src2->view_offs : offset0;
10331
10332 const int ne00 = src0 ? src0->ne[0] : 0;
10333 const int ne01 = src0 ? src0->ne[1] : 0;
10334 const int ne02 = src0 ? src0->ne[2] : 0;
10335 const int ne03 = src0 ? src0->ne[3] : 0;
10336
10337 const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
10338 const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
10339 const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
10340 const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
10341
10342 const int ne10 = src1 ? src1->ne[0] : 0;
10343 const int ne11 = src1 ? src1->ne[1] : 0; UNUSED(ne11);
10344 const int ne12 = src1 ? src1->ne[2] : 0; UNUSED(ne12);
10345 const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
10346
10347 const int ne0 = dst ? dst->ne[0] : 0;
10348 const int ne1 = dst ? dst->ne[1] : 0;
10349 const int ne2 = dst ? dst->ne[2] : 0;
10350 const int ne3 = dst ? dst->ne[3] : 0;
10351
10352 const cl_ulong nb0 = dst ? dst->nb[0] : 0;
10353 const cl_ulong nb1 = dst ? dst->nb[1] : 0;
10354 const cl_ulong nb2 = dst ? dst->nb[2] : 0;
10355 const cl_ulong nb3 = dst ? dst->nb[3] : 0;
10356
10357 GGML_ASSERT(ne10 % ne02 == 0);
10358 GGML_ASSERT(ne10 >= ne02);
10359
10360 int nth = MIN(64, ne00);
10361
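    // op_params layout (as int32 words): 0 = n_past, 1 = n_dims, 2 = mode,
    // 4 = n_ctx_orig; words 5..10 hold freq_base, freq_scale, ext_factor,
    // attn_factor, beta_fast, beta_slow as floats; words 11..14 hold the
    // mrope sections.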
    const int n_past     = ((int32_t *) dst->op_params)[0];
    const int n_dims     = ((int32_t *) dst->op_params)[1];
    const int mode       = ((int32_t *) dst->op_params)[2];
    const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
10366
10367 float freq_base;
10368 float freq_scale;
10369 float ext_factor;
10370 float attn_factor;
10371 float beta_fast;
10372 float beta_slow;
10373 int32_t sections[4];
10374
10375 memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
10376 memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
10377 memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
10378 memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
10379 memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
10380 memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
10381 memcpy(§ions, (int32_t *) dst->op_params + 11, sizeof(int32_t)*4);
10382
    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
10384 const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
10385 const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
10386 const int is_imrope = mode == GGML_ROPE_TYPE_IMROPE;
10387
10388 if (is_mrope) {
10389 GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
10390 }
10391
10392 if (is_vision) {
10393 GGML_ASSERT(n_dims == ne00/2);
10394 }
10395
10396 cl_kernel kernel;
10397
10398 if (is_neox) {
10399 switch (src0->type) {
10400 case GGML_TYPE_F32:
10401 kernel = backend_ctx->kernel_rope_neox_f32;
10402 break;
10403 case GGML_TYPE_F16:
10404 kernel = backend_ctx->kernel_rope_neox_f16;
10405 break;
10406 default:
10407 GGML_ASSERT(false);
        }
10409 } else if (is_mrope && !is_vision) {
10410 switch (src0->type) {
10411 case GGML_TYPE_F32:
10412 kernel = backend_ctx->kernel_rope_multi_f32;
10413 break;
10414 case GGML_TYPE_F16:
10415 kernel = backend_ctx->kernel_rope_multi_f16;
10416 break;
10417 default:
10418 GGML_ASSERT(false);
        }
10420 } else if (is_vision) {
10421 switch (src0->type) {
10422 case GGML_TYPE_F32:
10423 kernel = backend_ctx->kernel_rope_vision_f32;
10424 break;
10425 case GGML_TYPE_F16:
10426 kernel = backend_ctx->kernel_rope_vision_f16;
10427 break;
10428 default:
10429 GGML_ASSERT(false);
10430 }
10431 } else {
10432 switch (src0->type) {
10433 case GGML_TYPE_F32:
10434 kernel = backend_ctx->kernel_rope_norm_f32;
10435 break;
10436 case GGML_TYPE_F16:
10437 kernel = backend_ctx->kernel_rope_norm_f16;
10438 break;
10439 default:
10440 GGML_ASSERT(false);
        }
10442 }
10443
10444 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
10445 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
10446 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
10447 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
10448 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), extra2 ? &extra2->data_device : &extra0->data_device));
10449 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
10450 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
10451 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
10452 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
10453 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
10454 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
10455 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne03));
10456 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb00));
10457 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb01));
10458 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb02));
10459 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb03));
10460 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne0));
10461 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne1));
10462 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne2));
10463 CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne3));
10464 CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb0));
10465 CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb1));
10466 CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb2));
10467 CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_ulong), &nb3));
10468 CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &n_past));
10469 CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &n_dims));
10470 CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &n_ctx_orig));
10471 CL_CHECK(clSetKernelArg(kernel, 27, sizeof(float), &freq_base));
10472 CL_CHECK(clSetKernelArg(kernel, 28, sizeof(float), &freq_scale));
10473 CL_CHECK(clSetKernelArg(kernel, 29, sizeof(float), &ext_factor));
10474 CL_CHECK(clSetKernelArg(kernel, 30, sizeof(float), &attn_factor));
10475 CL_CHECK(clSetKernelArg(kernel, 31, sizeof(float), &beta_fast));
10476 CL_CHECK(clSetKernelArg(kernel, 32, sizeof(float), &beta_slow));
10477 // both mrope and vision kernels have sections
10478 if (is_mrope || is_vision) {
10479 CL_CHECK(clSetKernelArg(kernel, 33, sizeof(int32_t)*4, §ions));
10480 }
10481 // only mrope has is_imrope
10482 if (is_mrope && !is_vision) {
10483 CL_CHECK(clSetKernelArg(kernel, 34, sizeof(int), &is_imrope));
10484 }
10485
10486 size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
10487 size_t local_work_size[] = {(size_t)nth, 1, 1};
10488
10489 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
10490}
10491
10492static void ggml_cl_solve_tri(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
10493 GGML_ASSERT(src0);
10494 GGML_ASSERT(src0->extra);
10495 GGML_ASSERT(src1);
10496 GGML_ASSERT(src1->extra);
10497 GGML_ASSERT(dst);
10498 GGML_ASSERT(dst->extra);
10499
10500 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
10501
10502 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
10503 ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
10504 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
10505
10506 cl_ulong offset0 = extra0->offset + src0->view_offs;
10507 cl_ulong offset1 = extra1->offset + src1->view_offs;
10508 cl_ulong offsetd = extrad->offset + dst->view_offs;
10509
10510 cl_kernel kernel = backend_ctx->kernel_solve_tri_f32;
10511 GGML_ASSERT(kernel != nullptr);
10512
10513 const int n = src0->ne[0];
10514 const int k = src1->ne[0];
10515
10516 const cl_ulong nb00 = src0->nb[0];
10517 const cl_ulong nb01 = src0->nb[1];
10518 const cl_ulong nb02 = src0->nb[2];
10519 const cl_ulong nb03 = src0->nb[3];
10520
10521 const cl_ulong nb10 = src1->nb[0];
10522 const cl_ulong nb11 = src1->nb[1];
10523 const cl_ulong nb12 = src1->nb[2];
10524 const cl_ulong nb13 = src1->nb[3];
10525
10526 const cl_ulong nb0 = dst->nb[0];
10527 const cl_ulong nb1 = dst->nb[1];
10528 const cl_ulong nb2 = dst->nb[2];
10529 const cl_ulong nb3 = dst->nb[3];
10530
10531 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
10532 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
10533 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
10534 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
10535 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
10536 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
10537 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &n));
10538 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &k));
10539 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
10540 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb10));
    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb11));
    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb12));
    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb13));
    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb0));
    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb1));
    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb2));
    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb3));
10551
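    // One work-item per column of the right-hand side; each presumably runs a
    // sequential forward substitution down its column, with the batch
    // dimensions mapped to y and z.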
    size_t global_work_size[3] = {(size_t)k, (size_t)dst->ne[2], (size_t)dst->ne[3]};
10553 size_t local_work_size[] = {16, 4, 1};
10554
10555 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
10556}
10557
10558static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
10559 GGML_ASSERT(src0);
10560 GGML_ASSERT(src1);
10561 GGML_ASSERT(src1->extra);
10562 GGML_ASSERT(dst);
10563 GGML_ASSERT(dst->extra);
10564
10565 // src0 - filter, src1 - input
10566 GGML_ASSERT(src1->type == GGML_TYPE_F32);
10567 GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
10568
10569 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
10570
10571 ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
10572 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
10573
10574 cl_ulong offset1 = extra1->offset + src1->view_offs;
10575 cl_ulong offsetd = extrad->offset + dst->view_offs;
10576
10577 const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
10578 const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
10579 const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
10580 const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
10581 const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
10582 const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
10583
10584 const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
10585
10586 const cl_long IC = src1->ne[is_2D ? 2 : 1];
10587 const cl_long IH = is_2D ? src1->ne[1] : 1;
10588 const cl_long IW = src1->ne[0];
10589
10590 const cl_long KH = is_2D ? src0->ne[1] : 1;
10591 const cl_long KW = src0->ne[0];
10592
10593 const cl_long OH = is_2D ? dst->ne[2] : 1;
10594 const cl_long OW = dst->ne[1];
10595
    // nb values are byte strides; src1 is float32, so divide by 4 to get element offsets
10597 const cl_ulong delta_offset = src1->nb[is_2D ? 2 : 1]/4;
10598 const cl_long batch = src1->ne[is_2D ? 3 : 2];
10599 const cl_ulong batch_offset = src1->nb[is_2D ? 3 : 2]/4;
10600
10601 const cl_long pelements = OW*KW*KH;
10602 const cl_long CHW = IC*KH*KW;
10603
10604 cl_kernel kernel;
10605
10606 if(dst->type == GGML_TYPE_F16) {
10607 kernel = backend_ctx->kernel_im2col_f16;
10608 } else {
10609 kernel = backend_ctx->kernel_im2col_f32;
10610 }
10611
10612 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra1->data_device));
10613 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset1));
10614 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
10615 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
10616 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &batch_offset));
10617 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &delta_offset));
10618 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_long), &IW));
10619 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_long), &IH));
10620 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_long), &IC));
10621 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_long), &OW));
10622 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_long), &OH));
10623 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_long), &KW));
10624 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_long), &KH));
10625 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_long), &pelements));
10626 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_long), &CHW));
10627 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &s0));
10628 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &s1));
10629 CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &p0));
10630 CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &p1));
10631 CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &d0));
10632 CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &d1));
10633
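    // The x-dimension covers the pelements = OW*KW*KH patch values in 256-wide
    // workgroups (rounded up), y covers the OH output rows, and z covers the
    // batch*IC input channel slices.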
    const int num_blocks = CEIL_DIV(pelements, 256);
10635 size_t global_work_size[] = {(size_t)num_blocks*256, (size_t)OH, (size_t)batch*IC};
10636 size_t local_work_size[] = {256, 1, 1};
10637
10638 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
10639}
10640
10641static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
10642 GGML_ASSERT(src0);
10643 GGML_ASSERT(src0->extra);
10644 GGML_ASSERT(dst);
10645 GGML_ASSERT(dst->extra);
10646 GGML_UNUSED(src1);
10647
10648 GGML_ASSERT(src0->type == GGML_TYPE_F32);
10649 GGML_ASSERT( dst->type == GGML_TYPE_I32);
10650 GGML_ASSERT(ggml_is_contiguous(src0));
10651
10652 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
10653
10654 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
10655 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
10656
10657 cl_ulong offset0 = extra0->offset + src0->view_offs;
10658 cl_ulong offsetd = extrad->offset + dst->view_offs;
10659
10660 const int ne00 = src0->ne[0];
10661 const int nrows = ggml_nrows(src0);
10662
10663 int ne00_padded = 1;
10664 while (ne00_padded < ne00) {
10665 ne00_padded *= 2;
10666 }
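
    // Pad the row length to the next power of two: the whole row is sorted by
    // a single workgroup (presumably a bitonic-style sort, which needs a
    // power-of-two sequence), with the local-memory scratch below sized to match.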
10667
10668 int order = (enum ggml_sort_order) dst->op_params[0];
10669
10670 cl_kernel kernel = backend_ctx->kernel_argsort_f32_i32;
10671
10672 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
10673 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
10674 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
10675 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
10676 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
10677 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne00_padded));
10678 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &order));
10679 CL_CHECK(clSetKernelArg(kernel, 7, ne00_padded*sizeof(int), NULL));
10680
10681 size_t global_work_size[] = {(size_t)ne00_padded, (size_t)nrows, (size_t)1};
10682 size_t local_work_size[] = {(size_t)ne00_padded, 1, 1};
10683
10684 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
10685}
10686
10687static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
10688 GGML_ASSERT(src0);
10689 GGML_ASSERT(src0->extra);
10690 GGML_ASSERT(dst);
10691 GGML_ASSERT(dst->extra);
10692 GGML_UNUSED(src1);
10693
10694 GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
10695 GGML_ASSERT(ggml_is_contiguous(src0));
10696
10697 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
10698
10699 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
10700 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
10701
10702 cl_ulong offset0 = extra0->offset + src0->view_offs;
10703 cl_ulong offsetd = extrad->offset + dst->view_offs;
10704
10705 const int ne00 = src0->ne[0];
10706 const int ne01 = src0->ne[1];
10707 const int ne02 = src0->ne[2];
10708 const int ne03 = src0->ne[3];
10709
10710 const cl_ulong nb01 = src0->nb[1];
10711 const cl_ulong nb02 = src0->nb[2];
10712 const cl_ulong nb03 = src0->nb[3];
10713
10714 const cl_ulong nb1 = dst->nb[1];
10715 const cl_ulong nb2 = dst->nb[2];
10716 const cl_ulong nb3 = dst->nb[3];
10717
10718 cl_kernel kernel = backend_ctx->kernel_sum_rows_f32;
10719
10720 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
10721 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
10722 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
10723 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
10724 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
10725 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
10726 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
10727 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
10728 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
10729 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
10730 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
10731 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb1));
10732 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb2));
10733 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb3));
10734
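    // One work-item per src0 row (grouped 64 to a workgroup); each presumably
    // reduces its ne00 elements sequentially.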
10735 size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
10736 size_t local_work_size[] = {(size_t)64, 1, 1};
10737
10738 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
10739}
10740
10741static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
10742 GGML_ASSERT(src0);
10743 GGML_ASSERT(src0->extra);
10744 GGML_ASSERT(dst);
10745 GGML_ASSERT(dst->extra);
10746
10747 GGML_ASSERT(ggml_is_contiguous_1(src0));
10748
    if (src1) {
        GGML_ASSERT(src1->extra);
        GGML_ASSERT(ggml_are_same_shape(src0, src1));
    }
10754
10755 ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
10756
10757 cl_kernel kernel;
10758 switch (ggml_get_glu_op(dst)) {
10759 case GGML_GLU_OP_GEGLU:
10760 if (dst->type == GGML_TYPE_F32) {
10761 kernel = backend_ctx->kernel_geglu;
10762 } else {
10763 kernel = backend_ctx->kernel_geglu_f16;
10764 }
10765 break;
10766 case GGML_GLU_OP_REGLU:
10767 if (dst->type == GGML_TYPE_F32) {
10768 kernel = backend_ctx->kernel_reglu;
10769 } else {
10770 kernel = backend_ctx->kernel_reglu_f16;
10771 }
10772 break;
10773 case GGML_GLU_OP_SWIGLU:
10774 if (dst->type == GGML_TYPE_F32) {
10775 kernel = backend_ctx->kernel_swiglu;
10776 } else {
10777 kernel = backend_ctx->kernel_swiglu_f16;
10778 }
10779 break;
10780 case GGML_GLU_OP_SWIGLU_OAI:
10781 kernel = backend_ctx->kernel_swiglu_oai;
10782 break;
10783 case GGML_GLU_OP_GEGLU_ERF:
10784 if (dst->type == GGML_TYPE_F32) {
10785 kernel = backend_ctx->kernel_geglu_erf;
10786 } else {
10787 kernel = backend_ctx->kernel_geglu_erf_f16;
10788 }
10789 break;
10790 case GGML_GLU_OP_GEGLU_QUICK:
10791 if (dst->type == GGML_TYPE_F32) {
10792 kernel = backend_ctx->kernel_geglu_quick;
10793 } else {
10794 kernel = backend_ctx->kernel_geglu_quick_f16;
10795 }
10796 break;
10797 default:
10798 GGML_ABORT("Unsupported glu op");
10799 }
10800
10801 ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
10802 ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
10803
10804 ggml_tensor_extra_cl * extra1 = src1 ? (ggml_tensor_extra_cl *)src1->extra : nullptr;
10805
10806 cl_ulong offset0 = extra0->offset + src0->view_offs;
10807 cl_ulong offsetd = extrad->offset + dst->view_offs;
10808
10809 cl_ulong offset1 = extra1 ? extra1->offset + src1->view_offs : offset0;
10810
10811 const int ne0 = dst->ne[0];
10812
10813 const cl_ulong nb01 = src0->nb[1];
10814 const cl_ulong nb11 = src1 ? src1->nb[1] : nb01;
10815
10816 const cl_ulong nb1 = dst->nb[1];
10817
10818 const int swp = ggml_get_op_params_i32(dst, 1);
10819 const float alpha = ggml_get_op_params_f32(dst, 2);
10820 const float limit = ggml_get_op_params_f32(dst, 3);
10821
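    // Without a separate src1, the gate and value are the two halves of src0
    // along dim 0 (each of width ne0); the swapped flag selects which half
    // feeds the activation.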
10822 const int ne00_off = src1 ? 0 : (swp ? ne0 : 0);
10823 const int ne10_off = src1 ? 0 : (swp ? 0 : ne0);
10824
10825 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
10826 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
10827 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), src1 ? &extra1->data_device : &extra0->data_device));
10828 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
10829 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
10830 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
10831 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb01));
10832 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb11));
10833 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne0));
10834 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb1));
10835 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne00_off));
10836 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10_off));
10837
10838 if (ggml_get_glu_op(dst) == GGML_GLU_OP_SWIGLU_OAI) {
10839 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &limit));
10840 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(float), &alpha));
10841 }
10842
10843 const size_t nrows = ggml_nrows(src0);
10844 size_t nth = 512;
10845 size_t global_work_size[] = {nrows*nth, 1, 1};
10846 size_t local_work_size[] = {nth, 1, 1};
10847
10848 backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
10849}
10850
10851//------------------------------------------------------------------------------
10852// Op offloading
10853//------------------------------------------------------------------------------
10854
10855typedef void (*ggml_cl_func_t)(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
10856
10857bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor) {
10858 ggml_cl_func_t func = nullptr;
10859
10860 ggml_tensor * src0 = tensor->src[0];
10861 ggml_tensor * src1 = tensor->src[1];
10862
10863 const bool any_on_device = tensor->extra
10864 || (src0 != nullptr && src0->extra)
10865 || (src1 != nullptr && src1->extra);
10866
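    // Returning false hands the op back to the scheduler so another backend
    // (e.g. CPU) can execute it.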
10867 switch (tensor->op) {
10868 case GGML_OP_GET_ROWS:
10869 if (!any_on_device) {
10870 return false;
10871 }
10872 func = ggml_cl_get_rows;
10873 break;
10874 case GGML_OP_SET_ROWS:
10875 if (!any_on_device) {
10876 return false;
10877 }
10878 func = ggml_cl_set_rows;
10879 break;
10880 case GGML_OP_CPY:
10881 if (!any_on_device) {
10882 return false;
10883 }
10884 func = ggml_cl_cpy;
10885 break;
10886 case GGML_OP_DUP:
10887 case GGML_OP_CONT:
10888 if (!any_on_device) {
10889 return false;
10890 }
10891 func = ggml_cl_dup;
10892 break;
10893 case GGML_OP_ADD:
10894 if (!any_on_device) {
10895 return false;
10896 }
10897 func = ggml_cl_add;
10898 break;
10899 case GGML_OP_ADD_ID:
10900 if (!any_on_device) {
10901 return false;
10902 }
10903 func = ggml_cl_add_id;
10904 break;
10905 case GGML_OP_MUL:
10906 if (!any_on_device) {
10907 return false;
10908 }
10909 func = ggml_cl_mul;
10910 break;
10911 case GGML_OP_DIV:
10912 if (!any_on_device) {
10913 return false;
10914 }
10915 func = ggml_cl_div;
10916 break;
10917 case GGML_OP_SUB:
10918 if (!any_on_device) {
10919 return false;
10920 }
10921 func = ggml_cl_sub;
10922 break;
10923 case GGML_OP_SQR:
10924 if (!any_on_device) {
10925 return false;
10926 }
10927 func = ggml_cl_sqr;
10928 break;
10929 case GGML_OP_SQRT:
10930 if (!any_on_device) {
10931 return false;
10932 }
10933 func = ggml_cl_sqrt;
10934 break;
10935 case GGML_OP_MEAN:
10936 if (!any_on_device) {
10937 return false;
10938 }
10939 func = ggml_cl_mean;
10940 break;
10941 case GGML_OP_UNARY:
10942 switch (ggml_get_unary_op(tensor)) {
10943 case GGML_UNARY_OP_GELU:
10944 if (!any_on_device) {
10945 return false;
10946 }
10947 func = ggml_cl_gelu;
10948 break;
10949 case GGML_UNARY_OP_GELU_ERF:
10950 if (!any_on_device) {
10951 return false;
10952 }
10953 func = ggml_cl_gelu_erf;
10954 break;
10955 case GGML_UNARY_OP_GELU_QUICK:
10956 if (!any_on_device) {
10957 return false;
10958 }
10959 func = ggml_cl_gelu_quick;
10960 break;
10961 case GGML_UNARY_OP_SILU:
10962 if (!any_on_device) {
10963 return false;
10964 }
10965 func = ggml_cl_silu;
10966 break;
10967 case GGML_UNARY_OP_RELU:
10968 if (!any_on_device) {
10969 return false;
10970 }
10971 func = ggml_cl_relu;
10972 break;
10973 case GGML_UNARY_OP_SIGMOID:
10974 if (!any_on_device) {
10975 return false;
10976 }
10977 func = ggml_cl_sigmoid;
10978 break;
10979 case GGML_UNARY_OP_TANH:
10980 if (!any_on_device) {
10981 return false;
10982 }
10983 func = ggml_cl_tanh;
10984 break;
10985 case GGML_UNARY_OP_EXPM1:
10986 if (!any_on_device) {
10987 return false;
10988 }
10989 func = ggml_cl_expm1;
10990 break;
10991 case GGML_UNARY_OP_SOFTPLUS:
10992 if (!any_on_device) {
10993 return false;
10994 }
10995 func = ggml_cl_softplus;
10996 break;
10997 default:
10998 return false;
10999 } break;
11000 case GGML_OP_GLU:
11001 if (!any_on_device) {
11002 return false;
11003 }
11004 func = ggml_cl_glu;
11005 break;
11006 case GGML_OP_TRI:
11007 if (!any_on_device) {
11008 return false;
11009 }
11010 func = ggml_cl_tri;
11011 break;
11012 case GGML_OP_FILL:
11013 if (!any_on_device) {
11014 return false;
11015 }
11016 func = ggml_cl_fill;
11017 break;
11018 case GGML_OP_CLAMP:
11019 if (!any_on_device) {
11020 return false;
11021 }
11022 func = ggml_cl_clamp;
11023 break;
11024 case GGML_OP_NORM:
11025 if (!any_on_device) {
11026 return false;
11027 }
11028 func = ggml_cl_norm;
11029 break;
11030 case GGML_OP_RMS_NORM:
11031 if (!any_on_device) {
11032 return false;
11033 }
11034 func = ggml_cl_rms_norm;
11035 break;
11036 case GGML_OP_GROUP_NORM:
11037 if (!any_on_device) {
11038 return false;
11039 }
11040 func = ggml_cl_group_norm;
11041 break;
11042 case GGML_OP_REPEAT:
11043 if (!any_on_device) {
11044 return false;
11045 }
11046 func = ggml_cl_repeat;
11047 break;
11048 case GGML_OP_PAD:
11049 if (!any_on_device) {
11050 return false;
11051 }
11052 ggml_cl_pad(backend, tensor->src[0], tensor);
11053 return true;
11054 case GGML_OP_UPSCALE:
11055 if (!any_on_device) {
11056 return false;
11057 }
11058 ggml_cl_upscale(backend, tensor->src[0], tensor);
11059 return true;
11060 case GGML_OP_CONV_2D:
11061 if (!any_on_device) {
11062 return false;
11063 }
11064 func = ggml_cl_conv_2d;
11065 break;
11066 case GGML_OP_SSM_CONV:
11067 if (!any_on_device) {
11068 return false;
11069 }
11070 func = ggml_cl_ssm_conv;
11071 break;
11072 case GGML_OP_CONCAT:
11073 if (!any_on_device) {
11074 return false;
11075 }
11076 func = ggml_cl_concat;
11077 break;
11078 case GGML_OP_TIMESTEP_EMBEDDING:
11079 if (!any_on_device) {
11080 return false;
11081 }
11082 ggml_cl_timestep_embedding(backend, tensor->src[0], tensor);
11083 return true;
11084 case GGML_OP_MUL_MAT:
11085 if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
11086 return false;
11087 }
11088 func = ggml_cl_mul_mat;
11089 break;
11090 case GGML_OP_MUL_MAT_ID:
11091 if (!any_on_device) {
11092 return false;
11093 }
11094 func = ggml_cl_mul_mat_id;
11095 break;
11096 case GGML_OP_SCALE:
11097 if (!any_on_device) {
11098 return false;
11099 }
11100 func = ggml_cl_scale;
11101 break;
11102 case GGML_OP_RESHAPE:
11103 case GGML_OP_VIEW:
11104 case GGML_OP_PERMUTE:
11105 case GGML_OP_TRANSPOSE:
11106 if (!any_on_device) {
11107 return false;
11108 }
11109 func = ggml_cl_nop;
11110 break;
11111 case GGML_OP_DIAG_MASK_INF:
11112 if (!any_on_device) {
11113 return false;
11114 }
11115 func = ggml_cl_diag_mask_inf;
11116 break;
11117 case GGML_OP_SOFT_MAX:
11118 if (!any_on_device) {
11119 return false;
11120 }
11121 func = ggml_cl_soft_max;
11122 break;
11123 case GGML_OP_ROPE:
11124 if (!any_on_device) {
11125 return false;
11126 }
11127 func = ggml_cl_rope;
11128 break;
11129 case GGML_OP_SOLVE_TRI:
11130 if (!any_on_device) {
11131 return false;
11132 }
11133 func = ggml_cl_solve_tri;
11134 break;
11135 case GGML_OP_IM2COL:
11136 if (!any_on_device) {
11137 return false;
11138 }
11139 func = ggml_cl_im2col;
11140 break;
11141 case GGML_OP_ARGSORT:
11142 if (!any_on_device) {
11143 return false;
11144 }
11145 func = ggml_cl_argsort;
11146 break;
11147 case GGML_OP_SUM_ROWS:
11148 if (!any_on_device) {
11149 return false;
11150 }
11151 func = ggml_cl_sum_rows;
11152 break;
11153 case GGML_OP_FLASH_ATTN_EXT:
11154 if (!any_on_device) {
11155 return false;
11156 }
11157 ggml_cl_flash_attn(backend, tensor->src[0], tensor->src[1], tensor);
11158 return true;
11159 default:
11160 return false;
11161 }
11162
11163 func(backend, tensor->src[0], tensor->src[1], tensor);
11164 return true;
11165}