#define CL_TARGET_OPENCL_VERSION GGML_OPENCL_TARGET_VERSION
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS

// suppress warnings in CL headers for GCC and Clang
#pragma GCC diagnostic ignored "-Woverlength-strings"
#ifdef __clang__
#pragma GCC diagnostic ignored "-Wgnu-anonymous-struct"
#endif

#include "ggml-opencl.h"
#include "ggml-backend.h"
#include "ggml-impl.h"
#include "ggml-backend-impl.h"
#include "ggml.h"

#include <CL/cl.h>

#include <inttypes.h>
#include <string.h>

#include <cstddef>
#include <cstdint>
#include <fstream>
#include <vector>
#include <string>
#include <cmath>
#include <map>
#include <memory>
#include <charconv>
#include <mutex>
#include <algorithm>
#include <cstdlib>
#include <limits>
#include <string_view>

#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define CEIL_DIV(M, N) (((M) + (N) - 1) / (N))
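// e.g., CEIL_DIV(10, 4) == 3 (illustrative).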

#define UNUSED(x) (void)(x)

#define CL_CHECK(err)                                               \
    do {                                                            \
        cl_int err_ = (err);                                        \
        if (err_ != CL_SUCCESS) {                                   \
            GGML_LOG_ERROR("ggml_opencl: %s error %d at %s:%d\n",   \
                #err, err_, __FILE__, __LINE__);                    \
            GGML_ASSERT(0);                                         \
        }                                                           \
    } while (0)
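
// Usage note (illustrative): CL_CHECK accepts any expression that yields a
// cl_int status. For CL entry points that return an object and report status
// through an out-parameter, the comma-expression idiom used throughout this
// file applies, e.g.:
//
//   CL_CHECK(clFinish(queue));
//   CL_CHECK((mem = clCreateBuffer(ctx, flags, size, NULL, &err), err));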

//------------------------------------------------------------------------------
// OpenCL
//------------------------------------------------------------------------------

bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor);

// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
// Precompute mp (m' in the paper) and L such that division
// can be computed using a multiply (high 32b of 64b result)
// and a shift:
//
// n/d = (mulhi(n, mp) + n) >> L;
struct fastdiv_vals {
    uint32_t mp;
    uint32_t L;
    uint32_t d;
    uint32_t pad;
};
static_assert(sizeof(fastdiv_vals) == 16, "fastdiv_vals size incorrect");

static fastdiv_vals init_fastdiv_values(uint64_t d_64) {
    GGML_ASSERT(d_64 != 0);
    GGML_ASSERT(d_64 <= std::numeric_limits<uint32_t>::max());

    uint32_t d = (uint32_t)d_64;

    // compute L = ceil(log2(d));
    uint32_t L = 0;
    while (L < 32 && (uint32_t{ 1 } << L) < d) {
        L++;
    }

    uint32_t mp = (uint32_t) ((uint64_t{ 1 } << 32) * ((uint64_t{ 1 } << L) - d) / d + 1);
    // also store the divisor itself so callers can pass it along with mp and L
    return { mp, L, d, 0 };
}
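
// A minimal host-side sketch (illustrative, not used by the backend) of how
// the precomputed values are applied; the OpenCL kernels are assumed to do
// the same arithmetic with mul_hi():
//
//   static inline uint32_t fastdiv_apply(uint32_t n, const fastdiv_vals & v) {
//       uint32_t mulhi = (uint32_t) (((uint64_t) n * v.mp) >> 32);
//       return (mulhi + n) >> v.L;  // == n / v.d
//   }
//
// e.g., for d = 3: L = 2, mp = 1431655766, and fastdiv_apply(10, v) == 3.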

enum GPU_FAMILY {
    ADRENO,
    INTEL,
    UNKNOWN,
};

enum ADRENO_GPU_GEN {
    ADRENO_UNKNOWN,
    A7X,
    A8X,
    X1E,
};

enum ADRENO_CL_COMPILER_TYPE {
    E031,
    DX,
};

struct ggml_cl_version {
    cl_uint major = 0;
    cl_uint minor = 0;
};


struct ggml_cl_compiler_version {
    ADRENO_CL_COMPILER_TYPE type;
    int major = -1;
    int minor = -1;
    int patch = -1;

    bool same(ADRENO_CL_COMPILER_TYPE t, int x, int y, int z) const {
        return major == x && minor == y && patch == z && type == t;
    }
    bool newer_than(ADRENO_CL_COMPILER_TYPE t, int x, int y, int z) const {
        return major*10000 + minor*100 + patch > x*10000 + y*100 + z && type == t;
    }
    bool newer_than_or_same(ADRENO_CL_COMPILER_TYPE t, int x, int y, int z) const {
        return same(t, x, y, z) || newer_than(t, x, y, z);
    }
};
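
// Illustrative example: for a compiler version { E031, 38, 11, 0 },
// same(E031, 38, 11, 0) and newer_than_or_same(E031, 38, 11, 0) hold, while
// newer_than(E031, 38, 11, 0) does not.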

static size_t align_to(size_t value, size_t to_alignment) {
    GGML_ASSERT(to_alignment && "Invalid alignment (must be non-zero)");
    GGML_ASSERT((to_alignment & (to_alignment - 1)) == 0 && "to_alignment must be power-of-two");

    return ((value + to_alignment - 1) / to_alignment) * to_alignment;
}
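// e.g., align_to(100, 64) == 128 and align_to(128, 64) == 128 (illustrative).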


// Parses a version string of the form "XX.YY ". On error, returns a ggml_cl_version with all zeroes.
static ggml_cl_version parse_cl_version(std::string_view str) {
    size_t major_str_begin = 0;
    size_t major_str_end   = str.find(".", major_str_begin);
    if (major_str_end == std::string::npos) {
        return {};
    }

    size_t minor_str_begin = major_str_end + 1;
    size_t minor_str_end   = str.find(" ", minor_str_begin);
    if (minor_str_end == std::string::npos) {
        return {};
    }

    cl_uint version_major;
    if (std::from_chars(str.data() + major_str_begin, str.data() + major_str_end, version_major).ec != std::errc{}) {
        return {};
    }

    cl_uint version_minor;
    if (std::from_chars(str.data() + minor_str_begin, str.data() + minor_str_end, version_minor).ec != std::errc{}) {
        return {};
    }
    return { version_major, version_minor };
}
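// e.g., parse_cl_version("3.0 QUALCOMM ...") yields { 3, 0 } (illustrative).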

// Returns the OpenCL platform's version. On error, returns a ggml_cl_version with all zeroes.
static ggml_cl_version get_opencl_platform_version(cl_platform_id platform) {
    size_t param_size;
    CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, nullptr, &param_size));
    std::unique_ptr<char[]> param_storage(new char[param_size]);
    CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, param_size, param_storage.get(), nullptr));

    auto              param_value    = std::string_view(param_storage.get(), param_size);
    const std::string version_prefix = "OpenCL ";  // Suffix: "XX.YY <platform-specific-info>"
    if (param_value.find(version_prefix) != 0) {
        return {};
    }
    param_value.remove_prefix(version_prefix.length());
    return parse_cl_version(param_value);
}

// Returns the version to use for OpenCL C compilation. On error, returns a ggml_cl_version with all zeroes.
static ggml_cl_version get_opencl_c_version(ggml_cl_version platform_version, cl_device_id device) {
    size_t param_size;

#if CL_TARGET_OPENCL_VERSION >= 300
    if (platform_version.major >= 3) {
        CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, 0, nullptr, &param_size));
        if (!param_size) {
            return {};
        }

        // param_size is in bytes; allocate one element per cl_name_version entry.
        std::unique_ptr<cl_name_version[]> versions(new cl_name_version[param_size / sizeof(cl_name_version)]);
        CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, param_size, versions.get(), nullptr));
        unsigned versions_count = param_size / sizeof(cl_name_version);

        cl_version version_max = 0;
        for (unsigned i = 0; i < versions_count; i++) {
            version_max = std::max<cl_version>(versions[i].version, version_max);
        }

        return { CL_VERSION_MAJOR(version_max), CL_VERSION_MINOR(version_max) };
    }
#else
    GGML_UNUSED(platform_version);
#endif  // CL_TARGET_OPENCL_VERSION >= 300

    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, 0, nullptr, &param_size));
    if (!param_size) {
        return {};
    }

    std::unique_ptr<char[]> param_storage(new char[param_size]);
    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, param_size, param_storage.get(), nullptr));
    auto param_value = std::string_view(param_storage.get(), param_size);

    const std::string version_prefix = "OpenCL C ";  // Suffix: "XX.YY <platform-specific-info>"
    if (param_value.find(version_prefix) != 0) {
        return {};
    }
    param_value.remove_prefix(version_prefix.length());

    return parse_cl_version(param_value);
}

static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) {
    if (strstr(device_name, "730") ||
        strstr(device_name, "740") ||
        strstr(device_name, "750")) {
        return ADRENO_GPU_GEN::A7X;
    }

    if (strstr(device_name, "830") ||
        strstr(device_name, "840")) {
        return ADRENO_GPU_GEN::A8X;
    }

    if (strstr(device_name, "X1")) {
        return ADRENO_GPU_GEN::X1E;
    }

    return ADRENO_GPU_GEN::ADRENO_UNKNOWN;
}

static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *driver_version) {
    std::string driver_ver_str(driver_version);
    ADRENO_CL_COMPILER_TYPE type = ADRENO_CL_COMPILER_TYPE::E031;
    size_t compiler_ver_pos = driver_ver_str.find("E031");
    size_t compiler_ver_len = 13;
    size_t compiler_major_offset = 5;
    size_t compiler_minor_offset = 8;
    size_t compiler_patch_offset = 11;

    if (compiler_ver_pos == std::string::npos) {
        compiler_ver_pos = driver_ver_str.find("DX");
        if (compiler_ver_pos == std::string::npos) {
            return {};
        }
        type = ADRENO_CL_COMPILER_TYPE::DX;
        compiler_ver_len = 11;
        compiler_major_offset = 3;
    }

    std::string compiler_ver_str = driver_ver_str.substr(compiler_ver_pos, compiler_ver_len);
    int major = std::atoi(compiler_ver_str.substr(compiler_major_offset, 2).c_str());
    int minor = std::atoi(compiler_ver_str.substr(compiler_minor_offset, 2).c_str());
    int patch = std::atoi(compiler_ver_str.substr(compiler_patch_offset, 2).c_str());
    return { type, major, minor, patch };
}
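
// Illustrative example: a driver version string containing "E031.38.01.00"
// parses to { E031, 38, 1, 0 }; "DX"-style strings take the shorter
// 11-character path above.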

// cl buffer wrapper
struct ggml_cl_buffer {
    cl_mem buffer;
    size_t size;

    ggml_cl_buffer()
        : buffer(nullptr), size(0) {}

    ~ggml_cl_buffer() {
        if (buffer) {
            CL_CHECK(clReleaseMemObject(buffer));
        }
    }

    // Grow-only allocation: the buffer is only reallocated when the requested
    // size exceeds the current capacity.
    void allocate(cl_context context, size_t new_size) {
        if (new_size > size) {
            size = new_size;
            if (buffer) {
                CL_CHECK(clReleaseMemObject(buffer));
            }
            cl_int err;
            CL_CHECK((buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err));
        }
    }
};

// Profiling
struct ProfilingInfo {
    std::string op_name;
    std::string kernel_name;

    cl_kernel kernel;
    cl_event evt;

    cl_ulong cmd_queued;
    cl_ulong cmd_submit;
    cl_ulong cmd_start;
    cl_ulong cmd_end;
    cl_ulong overhead_start;
    cl_ulong overhead_end;
    // For the times below, see the spec for clGetEventProfilingInfo.
    // Time the kernel spent in the command queue: SUBMIT - QUEUED
    cl_ulong cmd_queued_duration_ns;
    // Time the kernel spent in submission: START - SUBMIT
    cl_ulong cmd_submit_duration_ns;
    // Kernel execution time in nanoseconds: END - START
    cl_ulong cmd_duration_ns;
    // Time for the kernel to complete: COMPLETE - END
    cl_ulong cmd_complete_duration_ns;
    // Total time to finish the kernel: COMPLETE - QUEUED
    cl_ulong cmd_total_duration_ns;
    // Global and local work sizes.
    size_t global_size[3];
    size_t local_size[3];
    // Op output size.
    size_t output_size[4];
};

static void populateProfilingInfo(
        ProfilingInfo& info, cl_event evt, cl_kernel kernel, cl_uint work_dim,
        size_t global_size[3], size_t local_size[3],
        const ggml_tensor * tensor) {
    info.op_name     = tensor->name;
    info.kernel      = kernel;
    info.evt         = evt;

    // 0 means not specified, e.g., a 2D workgroup, or NULL left for the driver to choose.
    info.local_size[0] = 0;
    info.local_size[1] = 0;
    info.local_size[2] = 0;

    info.global_size[0] = 0;
    info.global_size[1] = 0;
    info.global_size[2] = 0;

    if (local_size) {
        for (cl_uint i = 0; i < work_dim; ++i) {
            info.local_size[i] = local_size[i];
        }
    }

    for (cl_uint i = 0; i < work_dim; ++i) {
        info.global_size[i] = global_size[i];
    }

    info.output_size[0] = tensor->ne[0];
    info.output_size[1] = tensor->ne[1];
    info.output_size[2] = tensor->ne[2];
    info.output_size[3] = tensor->ne[3];
}

struct ggml_backend_opencl_context;

// backend device context
struct ggml_backend_opencl_device_context {
    cl_platform_id platform;
    std::string platform_name;

    cl_device_id   device;
    std::string    device_name;
    cl_device_type device_type;
    std::string    device_version;

    // Initialized by ggml_cl2_init().
    ggml_backend_opencl_context * backend_ctx = nullptr;

    // Initialized by ggml_backend_opencl_device_get_buffer_type()
    ggml_backend_buffer_type buffer_type;

    cl_context context = nullptr;
};

// backend context
struct ggml_backend_opencl_context {
    int ref_count;

    cl_device_id device;
    std::string device_name;

    std::string driver_version;

    GPU_FAMILY gpu_family;
    ADRENO_GPU_GEN adreno_gen;

    cl_int alignment;
    size_t max_alloc_size;
    size_t max_workgroup_size;
    bool fp16_support;
    bool has_vector_subgroup_broadcast;
    bool disable_fusion;
    ggml_cl_compiler_version adreno_cl_compiler_version;

    int adreno_wave_size;

    cl_bool non_uniform_workgroups;
    size_t  image_max_buffer_size;

    cl_context context;
    cl_command_queue queue;

    // prealloc buffers for transposing weights and activations
    ggml_cl_buffer prealloc_quant_trans;
    ggml_cl_buffer prealloc_scales_trans;
    ggml_cl_buffer prealloc_act_trans;

    // prealloc buffers for src0 and src1
    ggml_cl_buffer prealloc_src0;
    ggml_cl_buffer prealloc_src1;

    cl_program program_add;
    cl_program program_add_id;
    cl_program program_clamp;
    cl_program program_cpy;
    cl_program program_cvt;
    cl_program program_diag_mask_inf;
    cl_program program_gelu;
    cl_program program_gemv_noshuffle_general;
    cl_program program_gemv_noshuffle;
    cl_program program_get_rows;
    cl_program program_set_rows;
    cl_program program_glu;
    cl_program program_im2col_f16;
    cl_program program_im2col_f32;
    cl_program program_mul_mat_Ab_Bi_8x4;
    cl_program program_mul_mv_q4_0_f32;
    cl_program program_mul_mv_q4_0_f32_v;
    cl_program program_mul_mv_q4_0_f32_8x_flat;
    cl_program program_mul_mv_q4_0_f32_1d_8x_flat;
    cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
    cl_program program_mul_mv_q6_K;
    cl_program program_mul_mv_q8_0_f32, program_mul_mv_q8_0_f32_flat;
    cl_program program_mul_mv_mxfp4_f32;
    cl_program program_mul_mv_mxfp4_f32_flat;
    cl_program program_mul_mv_f16_f16;
    cl_program program_mul_mv_f16_f32_1row;
    cl_program program_mul_mv_f16_f32_l4;
    cl_program program_mul_mv_f16_f32;
    cl_program program_mul_mv_f32_f32;
    cl_program program_mul;
    cl_program program_mul_mat_f16_f32_tiled;
    cl_program program_mul_mm_f16_f32_kqv;
    cl_program program_mul_mm_f16_f32_kq;
    cl_program program_div;
    cl_program program_sub;
    cl_program program_norm;
    cl_program program_relu;
    cl_program program_rms_norm;
    cl_program program_group_norm;
    cl_program program_rope;
    cl_program program_silu;
    cl_program program_sigmoid;
    cl_program program_softmax_f32;
    cl_program program_softmax_f16;
    cl_program program_softmax_4_f32;
    cl_program program_softmax_4_f16;
    cl_program program_argsort_f32_i32;
    cl_program program_sum_rows_f32;
    cl_program program_pad;
    cl_program program_upscale;
    cl_program program_conv_2d_f16;
    cl_program program_conv_2d_f32;
    cl_program program_conv_2d_f16_f32;
    cl_program program_tsembd;
    cl_program program_gemv_moe_mxfp4_f32, program_gemm_moe_mxfp4_f32;
    cl_program program_mul_mv_id_q4_0_f32_8x_flat;
    cl_program program_mul_mv_id_q8_0_f32, program_mul_mv_id_q8_0_f32_flat;
    cl_program program_mul_mv_id_mxfp4_f32;
    cl_program program_mul_mv_id_mxfp4_f32_flat;
    cl_program program_mul_mm_f32_f32_l4_lm;
    cl_program program_mul_mm_f16_f32_l4_lm;
    cl_program program_mul_mm_q8_0_f32_l4_lm;

    cl_kernel kernel_add, kernel_add_row, kernel_add_f16, kernel_add_row_f16;
    cl_kernel kernel_mul, kernel_mul_row, kernel_mul_f16, kernel_mul_row_f16;
    cl_kernel kernel_div, kernel_div_row, kernel_div_f16, kernel_div_row_f16;
    cl_kernel kernel_sub, kernel_sub_row, kernel_sub_f16, kernel_sub_row_f16;
    cl_kernel kernel_add_id;
    cl_kernel kernel_scale_f32, kernel_scale_f32_4;
    cl_kernel kernel_sqr_cont_f32, kernel_sqr_cont_f32_4, kernel_sqr_cont_f16, kernel_sqr_cont_f16_4;
    cl_kernel kernel_sqrt_cont_f32, kernel_sqrt_cont_f32_4, kernel_sqrt_cont_f16, kernel_sqrt_cont_f16_4;
    cl_kernel kernel_mean_f32;
    cl_kernel kernel_silu, kernel_silu_4;
    cl_kernel kernel_gelu, kernel_gelu_4;
    cl_kernel kernel_gelu_erf, kernel_gelu_erf_4;
    cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
    cl_kernel kernel_relu;
    cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16;
    cl_kernel kernel_tri;
    cl_kernel kernel_fill;
    cl_kernel kernel_clamp;
    cl_kernel kernel_geglu, kernel_reglu, kernel_swiglu, kernel_swiglu_oai, kernel_geglu_erf, kernel_geglu_quick,
              kernel_geglu_f16, kernel_reglu_f16, kernel_swiglu_f16, kernel_geglu_erf_f16, kernel_geglu_quick_f16;
    cl_kernel kernel_norm, kernel_norm_mul_add;
    cl_kernel kernel_rms_norm, kernel_rms_norm_mul;
    cl_kernel kernel_group_norm, kernel_group_norm_mul_add;
    cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
    cl_kernel kernel_soft_max, kernel_soft_max_4;
    cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
    std::map<std::pair<int, int>, cl_kernel> kernels_flash_attn_f16;
    std::map<std::pair<int, int>, cl_kernel> kernels_flash_attn_f16_q1;
    std::map<std::pair<int, int>, cl_kernel> kernels_flash_attn_f32;
    std::map<std::pair<int, int>, cl_kernel> kernels_flash_attn_f32_q1;
    std::map<std::pair<int, int>, cl_kernel> kernels_flash_attn_f32_f16;
    std::map<std::pair<int, int>, cl_kernel> kernels_flash_attn_f32_f16_q1;
    std::map<std::pair<int, int>, int>       kernels_flash_attn_bm;
    std::map<std::pair<int, int>, int>       kernels_flash_attn_bn;
    cl_kernel kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0;
    cl_kernel kernel_set_rows_f32_i64, kernel_set_rows_f32_i32, kernel_set_rows_f16_i64, kernel_set_rows_f16_i32;
    cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
    cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
    cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32;
    cl_kernel kernel_mul_mat_f32_f32;
    cl_kernel kernel_mul_mat_f16_f16;
    cl_kernel kernel_mul_mat_f16_f32_1row;
    cl_kernel kernel_mul_mat_f16_f32;
    cl_kernel kernel_mul_mat_f16_f32_l4;
    cl_kernel kernel_mul_mat_f16_f32_tiled;
    cl_kernel kernel_mul_mm_f16_f32_kqv;
    cl_kernel kernel_mul_mm_f16_f32_kq;
    cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
    cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
    cl_kernel kernel_convert_block_mxfp4, kernel_convert_block_mxfp4_trans, kernel_restore_block_mxfp4, kernel_restore_block_mxfp4_trans;
    cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0, kernel_restore_block_q8_0_trans;
    cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
    cl_kernel kernel_convert_block_q4_0_noshuffle;
    cl_kernel kernel_restore_block_q4_0_noshuffle;
    cl_kernel kernel_convert_block_q6_K, kernel_restore_block_q6_K;
    cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
    cl_kernel kernel_mul_mv_q4_K_f32;
    cl_kernel kernel_mul_mv_q6_K_f32;
    cl_kernel kernel_mul_mv_q6_K_f32_flat;
    cl_kernel kernel_mul_mv_mxfp4_f32, kernel_mul_mv_mxfp4_f32_flat;
    cl_kernel kernel_mul_mv_q8_0_f32, kernel_mul_mv_q8_0_f32_flat;
    cl_kernel kernel_solve_tri_f32;
    cl_kernel kernel_im2col_f32, kernel_im2col_f16;
    cl_kernel kernel_argsort_f32_i32;
    cl_kernel kernel_sum_rows_f32;
    cl_kernel kernel_repeat_f32;
    cl_kernel kernel_pad;
    cl_kernel kernel_tanh_f32, kernel_tanh_f32_4, kernel_tanh_f32_nc;
    cl_kernel kernel_tanh_f16, kernel_tanh_f16_4, kernel_tanh_f16_nc;
    cl_kernel kernel_expm1_f32_nd;
    cl_kernel kernel_expm1_f16_nd;
    cl_kernel kernel_softplus_f32_nd;
    cl_kernel kernel_softplus_f16_nd;
    cl_kernel kernel_upscale;
    cl_kernel kernel_upscale_bilinear;
    cl_kernel kernel_concat_f32;
    cl_kernel kernel_conv_2d_f16;
    cl_kernel kernel_conv_2d_f32;
    cl_kernel kernel_conv_2d_f16_f32;
    cl_kernel kernel_ssm_conv_f32_f32, kernel_ssm_conv_f32_f32_4;
    cl_kernel kernel_timestep_embedding;
    cl_kernel kernel_gemv_moe_mxfp4_f32, kernel_gemm_moe_mxfp4_f32;
    cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
    cl_kernel kernel_mul_mv_id_q8_0_f32, kernel_mul_mv_id_q8_0_f32_flat;
    cl_kernel kernel_mul_mv_id_mxfp4_f32;
    cl_kernel kernel_mul_mv_id_mxfp4_f32_flat;
    cl_kernel kernel_mul_mm_f32_f32_l4_lm;
    cl_kernel kernel_mul_mm_f16_f32_l4_lm;
    cl_kernel kernel_mul_mm_q8_0_f32_l4_lm;
    cl_kernel kernel_mul_mm_q6_k_f32_l4_lm;

    std::vector<ProfilingInfo> profiling_info;

    void write_profiling_info() {
        FILE * fperf = fopen("cl_profiling.csv", "w");
        if (!fperf) {
            GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
            return;
        }

        // Populate profiling info
        for (ProfilingInfo & info : profiling_info) {
            cl_ulong cmd_queued;
            cl_ulong cmd_submit;
            cl_ulong cmd_start;
            cl_ulong cmd_end;
            cl_ulong cmd_complete;

            CL_CHECK(clWaitForEvents(1, &info.evt));
            CL_CHECK(clGetEventProfilingInfo(
                info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
            CL_CHECK(clGetEventProfilingInfo(
                info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
            CL_CHECK(clGetEventProfilingInfo(
                info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
            CL_CHECK(clGetEventProfilingInfo(
                info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
            CL_CHECK(clGetEventProfilingInfo(
                info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
            CL_CHECK(clReleaseEvent(info.evt));

            char kernel_name[512];
            CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
                sizeof(kernel_name), kernel_name, NULL));
            info.kernel_name = kernel_name;

            info.cmd_queued = cmd_queued;
            info.cmd_submit = cmd_submit;
            info.cmd_start  = cmd_start;
            info.cmd_end    = cmd_end;

            info.cmd_queued_duration_ns     = cmd_submit    - cmd_queued;
            info.cmd_submit_duration_ns     = cmd_start     - cmd_submit;
            info.cmd_duration_ns            = cmd_end       - cmd_start;
            info.cmd_complete_duration_ns   = cmd_complete  - cmd_end;
            info.cmd_total_duration_ns      = cmd_complete  - cmd_queued;
        }

        // Dump a csv
        fprintf(fperf, "op name, kernel name, exec duration (ms), global size, local size, output size\n");
        for (const ProfilingInfo & info : profiling_info) {
            fprintf(fperf, "%s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
                info.op_name.c_str(), info.kernel_name.c_str(),
                info.cmd_duration_ns/1.e6f,
                info.global_size[0], info.global_size[1], info.global_size[2],
                info.local_size[0], info.local_size[1], info.local_size[2],
                info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
        }
        fclose(fperf);

        // Dump a simple chrome trace
        FILE* ftrace = fopen("cl_trace.json", "w");
        if (!ftrace) {
            GGML_LOG_ERROR("Failed to open cl_trace.json\n");
            return;
        }

        fprintf(ftrace, "[\n");
        for (const ProfilingInfo & info : profiling_info) {
            fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Host\"},\n",
                info.kernel_name.c_str(), info.cmd_queued/1000);
            fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Host\"},\n",
                info.kernel_name.c_str(), info.cmd_submit/1000);

            fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Device\"},\n",
                info.kernel_name.c_str(), info.cmd_start/1000);
            fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Device\"},\n",
                info.kernel_name.c_str(), info.cmd_end/1000);
        }
        fclose(ftrace);
    }

    size_t get_kernel_workgroup_size(cl_kernel kernel) const {
        size_t workgroup_size = 0;
        size_t ret_size = 0;
        CL_CHECK(
            clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
                sizeof(size_t), &workgroup_size, &ret_size));
        GGML_ASSERT(sizeof(size_t) == ret_size);
        return workgroup_size;
    }

    void enqueue_ndrange_kernel(cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) {
#ifdef GGML_OPENCL_PROFILING
        cl_event evt;
        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, &evt));

        profiling_info.emplace_back();
        populateProfilingInfo(profiling_info.back(), evt, kernel, work_dim, global_work_size, local_work_size, tensor);
#else
        GGML_UNUSED(tensor);
        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL));
#endif
    }
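
    // Illustrative call pattern (hypothetical sizes; dst is the op's output tensor):
    //
    //   size_t global_work_size[] = { 64, 16, 1 };
    //   size_t local_work_size[]  = { 64,  1, 1 };
    //   backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);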

#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
    // Transpose kernels
    cl_program program_transpose;

    cl_kernel kernel_transpose_32;
    cl_kernel kernel_transpose_32_16;
    cl_kernel kernel_transpose_16;
    cl_kernel kernel_transpose_16_buf;
    cl_kernel kernel_transpose_16_4x1;

    // Gemm and Gemv related programs, kernels, etc
    cl_program program_CL_gemm;
    cl_program program_CL_gemv_general;
    cl_program program_CL_gemv_4096_1_11008;
    cl_program program_CL_gemv_4096_1_4096;
    cl_program program_CL_gemv_11008_1_4096;
    cl_program program_CL_gemv_32000_1_4096;
    cl_kernel CL_mul_mat_Ab_Bi_8x4;
    cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general;
    cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008;
    cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096;
    cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096;
    cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096;
    cl_kernel kernel_mul_mm_q8_0_f32_8x4;
    cl_kernel CL_mul_mat_vec_q8_0_f32;
#endif // GGML_OPENCL_USE_ADRENO_KERNELS

    void free() {
        ref_count--;
        if (ref_count == 0) {
#ifdef GGML_OPENCL_PROFILING
            write_profiling_info();
            profiling_info.clear();
#endif
        }
    }
};

// All registered devices with a default device in the front.
static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;

inline std::string read_file(const std::string &path) {
    std::ifstream ifs(path);
    if (!ifs) {
        return "";
    }
    std::string text;
    ifs.seekg(0, std::ios::end);
    text.resize(ifs.tellg());
    ifs.seekg(0, std::ios::beg);
    ifs.read(&text[0], text.size());
    return text;
}

static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer, const std::string &compile_opts) {
    cl_program p;
    char *program_log;
    size_t program_size;
    size_t log_size;
    int err;

    program_size = strlen(program_buffer);

    p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err);
    if (err < 0) {
        GGML_LOG_ERROR("OpenCL error creating program\n");
        exit(1);
    }

    err = clBuildProgram(p, 0, NULL, compile_opts.c_str(), NULL, NULL);
    if (err < 0) {
        clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
        program_log = (char*) malloc(log_size + 1);
        program_log[log_size] = '\0';
        clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL);
        GGML_LOG_ERROR("ggml_opencl: kernel compile error:\n\n%s\n", program_log);
        free(program_log);
        exit(1);
    }

    return p;
}

static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_version opencl_c_version) {
    cl_int err;

    // compiler options for general kernels
    auto opencl_c_std =
        std::string("CL") + std::to_string(opencl_c_version.major) + "." + std::to_string(opencl_c_version.minor);
    std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
                               " -cl-mad-enable -cl-unsafe-math-optimizations"
                               " -cl-finite-math-only -cl-fast-relaxed-math";

    GGML_LOG_INFO("ggml_opencl: loading OpenCL kernels");

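    // Each block below follows the same pattern: obtain the kernel source
    // (embedded as a string literal via a generated *.cl.h header when
    // GGML_OPENCL_EMBED_KERNELS is defined, otherwise read from the
    // corresponding .cl file at runtime), build the program, then create the
    // individual kernel objects from it.
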
    // add
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "add.cl.h"
        };
#else
        const std::string kernel_src = read_file("add.cl");
#endif
        backend_ctx->program_add =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_add         = clCreateKernel(backend_ctx->program_add, "kernel_add", &err), err));
        CL_CHECK((backend_ctx->kernel_add_row     = clCreateKernel(backend_ctx->program_add, "kernel_add_row", &err), err));
        CL_CHECK((backend_ctx->kernel_add_f16     = clCreateKernel(backend_ctx->program_add, "kernel_add_f16", &err), err));
        CL_CHECK((backend_ctx->kernel_add_row_f16 = clCreateKernel(backend_ctx->program_add, "kernel_add_row_f16", &err), err));
        GGML_LOG_CONT(".");
    }

    // add_id
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "add_id.cl.h"
        };
#else
        const std::string kernel_src = read_file("add_id.cl");
#endif
        backend_ctx->program_add_id =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_add_id = clCreateKernel(backend_ctx->program_add_id, "kernel_add_id", &err), err));
        GGML_LOG_CONT(".");
    }

    // tri
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "tri.cl.h"
        };
#else
        const std::string kernel_src = read_file("tri.cl");
#endif
        cl_program prog =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_tri = clCreateKernel(prog, "kernel_tri_f32", &err), err));
        GGML_LOG_CONT(".");

        CL_CHECK(clReleaseProgram(prog));
    }

    // fill
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "fill.cl.h"
        };
#else
        const std::string kernel_src = read_file("fill.cl");
#endif
        cl_program prog =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_fill = clCreateKernel(prog, "kernel_fill_f32", &err), err));
        GGML_LOG_CONT(".");

        CL_CHECK(clReleaseProgram(prog));
    }

    // clamp
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "clamp.cl.h"
        };
#else
        const std::string kernel_src = read_file("clamp.cl");
#endif
        backend_ctx->program_clamp =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_clamp = clCreateKernel(backend_ctx->program_clamp, "kernel_clamp", &err), err));
        GGML_LOG_CONT(".");
    }

    // cpy
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "cpy.cl.h"
        };
#else
        const std::string kernel_src = read_file("cpy.cl");
#endif
        backend_ctx->program_cpy =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_cpy_f16_f16 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f16_f16", &err), err));
        CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f16_f32", &err), err));
        CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f32_f16", &err), err));
        CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f32_f32", &err), err));
        GGML_LOG_CONT(".");
    }

    // cvt
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "cvt.cl.h"
        };
#else
        const std::string kernel_src = read_file("cvt.cl");
#endif
        backend_ctx->program_cvt =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_noshuffle", &err), err));
        CL_CHECK((backend_ctx->kernel_restore_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0_noshuffle", &err), err));
        CL_CHECK((backend_ctx->kernel_convert_block_q4_0  = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
        CL_CHECK((backend_ctx->kernel_restore_block_q4_0  = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
        CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err));
        CL_CHECK((backend_ctx->kernel_convert_block_mxfp4_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4_trans", &err), err));
        CL_CHECK((backend_ctx->kernel_restore_block_mxfp4_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_mxfp4_trans", &err), err));
        CL_CHECK((backend_ctx->kernel_restore_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_mxfp4", &err), err));
        CL_CHECK((backend_ctx->kernel_convert_block_q8_0  = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q8_0", &err), err));
        CL_CHECK((backend_ctx->kernel_restore_block_q8_0  = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0", &err), err));
        CL_CHECK((backend_ctx->kernel_restore_block_q8_0_trans  = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0_trans", &err), err));
        CL_CHECK((backend_ctx->kernel_convert_block_q6_K  = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_K", &err), err));
        CL_CHECK((backend_ctx->kernel_restore_block_q6_K  = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q6_K", &err), err));
        GGML_LOG_CONT(".");
    }

    // diag_mask_inf
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "diag_mask_inf.cl.h"
        };
#else
        const std::string kernel_src = read_file("diag_mask_inf.cl");
#endif
        backend_ctx->program_diag_mask_inf =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_diag_mask_inf_8 = clCreateKernel(backend_ctx->program_diag_mask_inf, "kernel_diag_mask_inf_8", &err), err));
        CL_CHECK((backend_ctx->kernel_diag_mask_inf   = clCreateKernel(backend_ctx->program_diag_mask_inf, "kernel_diag_mask_inf", &err), err));
        GGML_LOG_CONT(".");
    }

    // gelu
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "gelu.cl.h"
        };
#else
        const std::string kernel_src = read_file("gelu.cl");
#endif
        backend_ctx->program_gelu =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_gelu         = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu", &err), err));
        CL_CHECK((backend_ctx->kernel_gelu_4       = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_4", &err), err));
        CL_CHECK((backend_ctx->kernel_gelu_erf     = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_erf", &err), err));
        CL_CHECK((backend_ctx->kernel_gelu_erf_4   = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_erf_4", &err), err));
        CL_CHECK((backend_ctx->kernel_gelu_quick   = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_quick", &err), err));
        CL_CHECK((backend_ctx->kernel_gelu_quick_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_quick_4", &err), err));
        GGML_LOG_CONT(".");
    }

    // glu
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "glu.cl.h"
        };
#else
        const std::string kernel_src = read_file("glu.cl");
#endif
        backend_ctx->program_glu =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_geglu           = clCreateKernel(backend_ctx->program_glu, "kernel_geglu", &err), err));
        CL_CHECK((backend_ctx->kernel_reglu           = clCreateKernel(backend_ctx->program_glu, "kernel_reglu", &err), err));
        CL_CHECK((backend_ctx->kernel_swiglu          = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu", &err), err));
        CL_CHECK((backend_ctx->kernel_swiglu_oai      = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu_oai", &err), err));
        CL_CHECK((backend_ctx->kernel_geglu_erf       = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_erf", &err), err));
        CL_CHECK((backend_ctx->kernel_geglu_quick     = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_quick", &err), err));
        CL_CHECK((backend_ctx->kernel_geglu_f16       = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_f16", &err), err));
        CL_CHECK((backend_ctx->kernel_reglu_f16       = clCreateKernel(backend_ctx->program_glu, "kernel_reglu_f16", &err), err));
        CL_CHECK((backend_ctx->kernel_swiglu_f16      = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu_f16", &err), err));
        CL_CHECK((backend_ctx->kernel_geglu_erf_f16   = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_erf_f16", &err), err));
        CL_CHECK((backend_ctx->kernel_geglu_quick_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_quick_f16", &err), err));
        GGML_LOG_CONT(".");
    }

    // get_rows
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "get_rows.cl.h"
        };
#else
        const std::string kernel_src = read_file("get_rows.cl");
#endif
        backend_ctx->program_get_rows =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_get_rows_f32  = clCreateKernel(backend_ctx->program_get_rows, "kernel_get_rows_f32", &err), err));
        CL_CHECK((backend_ctx->kernel_get_rows_f16  = clCreateKernel(backend_ctx->program_get_rows, "kernel_get_rows_f16", &err), err));
        CL_CHECK((backend_ctx->kernel_get_rows_q4_0 = clCreateKernel(backend_ctx->program_get_rows, "kernel_get_rows_q4_0", &err), err));
        GGML_LOG_CONT(".");
    }

    // solve_tri_f32
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "solve_tri.cl.h"
        };
#else
        const std::string kernel_src = read_file("solve_tri.cl");
#endif
        cl_program prog =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_solve_tri_f32 = clCreateKernel(prog, "kernel_solve_tri_f32", &err), err));
        GGML_LOG_CONT(".");
        CL_CHECK(clReleaseProgram(prog));
    }

    // im2col_f32
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "im2col_f32.cl.h"
        };
#else
        const std::string kernel_src = read_file("im2col_f32.cl");
#endif
        backend_ctx->program_im2col_f32 =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_im2col_f32 = clCreateKernel(backend_ctx->program_im2col_f32, "kernel_im2col_f32", &err), err));
        GGML_LOG_CONT(".");
    }

    // im2col_f16
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "im2col_f16.cl.h"
        };
#else
        const std::string kernel_src = read_file("im2col_f16.cl");
#endif
        backend_ctx->program_im2col_f16 =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_im2col_f16 = clCreateKernel(backend_ctx->program_im2col_f16, "kernel_im2col_f16", &err), err));
        GGML_LOG_CONT(".");
    }

    // mul_mv_q4_0_f32
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "mul_mv_q4_0_f32.cl.h"
        };
#else
        const std::string kernel_src = read_file("mul_mv_q4_0_f32.cl");
#endif
        backend_ctx->program_mul_mv_q4_0_f32 =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32 = clCreateKernel(backend_ctx->program_mul_mv_q4_0_f32, "kernel_mul_mat_q4_0_f32", &err), err));
        GGML_LOG_CONT(".");
    }

    // mul_mv_q4_0_f32_v
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "mul_mv_q4_0_f32_v.cl.h"
        };
#else
        const std::string kernel_src = read_file("mul_mv_q4_0_f32_v.cl");
#endif
        backend_ctx->program_mul_mv_q4_0_f32_v =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_v = clCreateKernel(backend_ctx->program_mul_mv_q4_0_f32_v, "kernel_mul_mat_q4_0_f32_v", &err), err));
        GGML_LOG_CONT(".");
    }

    // mul_mv_q4_0_f32_8x_flat
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "mul_mv_q4_0_f32_8x_flat.cl.h"
        };
#else
        const std::string kernel_src = read_file("mul_mv_q4_0_f32_8x_flat.cl");
#endif
        backend_ctx->program_mul_mv_q4_0_f32_8x_flat =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_q4_0_f32_8x_flat, "kernel_mul_mat_q4_0_f32_8x_flat", &err), err));
        GGML_LOG_CONT(".");
    }

    // mul_mv_q4_0_f32_1d_8x_flat
    // This kernel does not compile on Adreno CL compiler 38.01. Skip it for
    // those compiler versions since it is not used for Adreno anyway.
    if (backend_ctx->gpu_family != ADRENO ||
        backend_ctx->adreno_cl_compiler_version.newer_than_or_same(E031, 38, 11, 0) ||
        backend_ctx->adreno_cl_compiler_version.type == DX) {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "mul_mv_q4_0_f32_1d_8x_flat.cl.h"
        };
#else
        const std::string kernel_src = read_file("mul_mv_q4_0_f32_1d_8x_flat.cl");
#endif
        backend_ctx->program_mul_mv_q4_0_f32_1d_8x_flat =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_q4_0_f32_1d_8x_flat, "kernel_mul_mat_q4_0_f32_1d_8x_flat", &err), err));
        GGML_LOG_CONT(".");
    }

    // mul_mv_q4_0_f32_1d_16x_flat
    // This kernel does not compile on Adreno CL compiler 38.01. Skip it for
    // those compiler versions since it is not used for Adreno anyway.
    if (backend_ctx->gpu_family != ADRENO ||
        backend_ctx->adreno_cl_compiler_version.newer_than_or_same(E031, 38, 11, 0) ||
        backend_ctx->adreno_cl_compiler_version.type == DX) {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "mul_mv_q4_0_f32_1d_16x_flat.cl.h"
        };
#else
        const std::string kernel_src = read_file("mul_mv_q4_0_f32_1d_16x_flat.cl");
#endif
        backend_ctx->program_mul_mv_q4_0_f32_1d_16x_flat =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat = clCreateKernel(backend_ctx->program_mul_mv_q4_0_f32_1d_16x_flat, "kernel_mul_mat_q4_0_f32_1d_16x_flat", &err), err));
        GGML_LOG_CONT(".");
    }

    // mul_mv_q4_k_f32
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "mul_mv_q4_k_f32.cl.h"
        };
#else
        const std::string kernel_src = read_file("mul_mv_q4_k_f32.cl");
#endif
        cl_program prog =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_mul_mv_q4_K_f32 = clCreateKernel(prog, "kernel_mul_mv_q4_K_f32", &err), err));
        CL_CHECK(clReleaseProgram(prog));
        GGML_LOG_CONT(".");
    }

    // mul_mv_q6_k_f32
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "mul_mv_q6_k_f32.cl.h"
        };
#else
        const std::string kernel_src = read_file("mul_mv_q6_k_f32.cl");
#endif
        backend_ctx->program_mul_mv_q6_K =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_mul_mv_q6_K_f32 = clCreateKernel(backend_ctx->program_mul_mv_q6_K, "kernel_mul_mv_q6_K_f32", &err), err));
        GGML_LOG_CONT(".");
    }

    // mul_mv_q6_k_f32_flat
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "mul_mv_q6_k_f32_flat.cl.h"
        };
#else
        const std::string kernel_src = read_file("mul_mv_q6_k_f32_flat.cl");
#endif
        cl_program prog =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_mul_mv_q6_K_f32_flat = clCreateKernel(prog, "kernel_mul_mv_q6_K_f32_flat", &err), err));
        CL_CHECK(clReleaseProgram(prog));
        GGML_LOG_CONT(".");
    }

    // mul_mv_q8_0_f32
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "mul_mv_q8_0_f32.cl.h"
        };
#else
        const std::string kernel_src = read_file("mul_mv_q8_0_f32.cl");
#endif
        backend_ctx->program_mul_mv_q8_0_f32 =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_mul_mv_q8_0_f32 = clCreateKernel(backend_ctx->program_mul_mv_q8_0_f32, "kernel_mul_mv_q8_0_f32", &err), err));
        GGML_LOG_CONT(".");
    }

    // mul_mv_q8_0_f32_flat
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "mul_mv_q8_0_f32_flat.cl.h"
        };
#else
        const std::string kernel_src = read_file("mul_mv_q8_0_f32_flat.cl");
#endif
        backend_ctx->program_mul_mv_q8_0_f32_flat =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_mul_mv_q8_0_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_q8_0_f32_flat, "kernel_mul_mv_q8_0_f32_flat", &err), err));
        GGML_LOG_CONT(".");
    }

    // mul_mv_mxfp4_f32
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "mul_mv_mxfp4_f32.cl.h"
        };
#else
        const std::string kernel_src = read_file("mul_mv_mxfp4_f32.cl");
#endif
        backend_ctx->program_mul_mv_mxfp4_f32 =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_mul_mv_mxfp4_f32 = clCreateKernel(backend_ctx->program_mul_mv_mxfp4_f32, "kernel_mul_mv_mxfp4_f32", &err), err));
        GGML_LOG_CONT(".");
    }

    // mul_mv_mxfp4_f32_flat
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "mul_mv_mxfp4_f32_flat.cl.h"
        };
#else
        const std::string kernel_src = read_file("mul_mv_mxfp4_f32_flat.cl");
#endif
        backend_ctx->program_mul_mv_mxfp4_f32_flat =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_mul_mv_mxfp4_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_mxfp4_f32_flat, "kernel_mul_mv_mxfp4_f32_flat", &err), err));
        GGML_LOG_CONT(".");
    }

    // mul_mv_f16_f16
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "mul_mv_f16_f16.cl.h"
        };
#else
        const std::string kernel_src = read_file("mul_mv_f16_f16.cl");
#endif
        backend_ctx->program_mul_mv_f16_f16 =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_mul_mat_f16_f16 = clCreateKernel(backend_ctx->program_mul_mv_f16_f16, "kernel_mul_mat_f16_f16", &err), err));
        GGML_LOG_CONT(".");
    }

    // mul_mv_f16_f32_1row
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "mul_mv_f16_f32_1row.cl.h"
        };
#else
        const std::string kernel_src = read_file("mul_mv_f16_f32_1row.cl");
#endif
        backend_ctx->program_mul_mv_f16_f32_1row =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_1row = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_1row, "kernel_mul_mat_f16_f32_1row", &err), err));
        GGML_LOG_CONT(".");
    }

    // mul_mv_f16_f32_l4
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "mul_mv_f16_f32_l4.cl.h"
        };
#else
        const std::string kernel_src = read_file("mul_mv_f16_f32_l4.cl");
#endif
        backend_ctx->program_mul_mv_f16_f32_l4 =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4   = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4", &err), err));
        GGML_LOG_CONT(".");
    }

    // mul_mv_f16_f32
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "mul_mv_f16_f32.cl.h"
        };
#else
        const std::string kernel_src = read_file("mul_mv_f16_f32.cl");
#endif
        backend_ctx->program_mul_mv_f16_f32 =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32 = clCreateKernel(backend_ctx->program_mul_mv_f16_f32, "kernel_mul_mat_f16_f32", &err), err));
        GGML_LOG_CONT(".");
    }

    // mul_mv_f32_f32
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "mul_mv_f32_f32.cl.h"
        };
#else
        const std::string kernel_src = read_file("mul_mv_f32_f32.cl");
#endif
        backend_ctx->program_mul_mv_f32_f32 =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_mul_mat_f32_f32 = clCreateKernel(backend_ctx->program_mul_mv_f32_f32, "kernel_mul_mat_f32_f32", &err), err));
        GGML_LOG_CONT(".");
    }

    // mul_mat_f16_f32_tiled
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "mul_mat_f16_f32.cl.h"
        };
#else
        const std::string kernel_src = read_file("mul_mat_f16_f32.cl");
#endif
        backend_ctx->program_mul_mat_f16_f32_tiled =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_tiled = clCreateKernel(backend_ctx->program_mul_mat_f16_f32_tiled, "mul_mat_f16_f32", &err), err));
        GGML_LOG_CONT(".");
    }

    // mul_mm_f32_f32_l4_lm
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "mul_mm_f32_f32_l4_lm.cl.h"
        };
#else
        const std::string kernel_src = read_file("mul_mm_f32_f32_l4_lm.cl");
#endif
        backend_ctx->program_mul_mm_f32_f32_l4_lm =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_mul_mm_f32_f32_l4_lm = clCreateKernel(backend_ctx->program_mul_mm_f32_f32_l4_lm, "kernel_mul_mm_f32_f32_l4_lm", &err), err));
        GGML_LOG_CONT(".");
    }

    // mul_mm_f16_f32_l4_lm
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "mul_mm_f16_f32_l4_lm.cl.h"
        };
#else
        const std::string kernel_src = read_file("mul_mm_f16_f32_l4_lm.cl");
#endif
        backend_ctx->program_mul_mm_f16_f32_l4_lm =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);

        CL_CHECK((backend_ctx->kernel_mul_mm_f16_f32_l4_lm = clCreateKernel(backend_ctx->program_mul_mm_f16_f32_l4_lm, "kernel_mul_mm_f16_f32_l4_lm", &err), err));
 1361        GGML_LOG_CONT(".");
 1362    }
 1363
 1364    // mul_mm_q8_0_f32_l4_lm
 1365    {
 1366#ifdef GGML_OPENCL_EMBED_KERNELS
 1367        const std::string kernel_src {
 1368            #include "mul_mm_q8_0_f32_l4_lm.cl.h"
 1369        };
 1370#else
 1371        const std::string kernel_src = read_file("mul_mm_q8_0_f32_l4_lm.cl");
 1372#endif
 1373        backend_ctx->program_mul_mm_q8_0_f32_l4_lm =
 1374            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 1375
 1376        CL_CHECK((backend_ctx->kernel_mul_mm_q8_0_f32_l4_lm = clCreateKernel(backend_ctx->program_mul_mm_q8_0_f32_l4_lm, "kernel_mul_mm_q8_0_f32_l4_lm", &err), err));
 1377        GGML_LOG_CONT(".");
 1378    }
 1379
 1380    // mul_mm_q6_k_f32_l4_lm
 1381    {
 1382#ifdef GGML_OPENCL_EMBED_KERNELS
 1383        const std::string kernel_src {
 1384            #include "mul_mm_q6_k_f32_l4_lm.cl.h"
 1385        };
 1386#else
 1387        const std::string kernel_src = read_file("mul_mm_q6_k_f32_l4_lm.cl");
 1388#endif
 1389        cl_program prog =
 1390            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 1391
 1392        CL_CHECK((backend_ctx->kernel_mul_mm_q6_k_f32_l4_lm = clCreateKernel(prog, "kernel_mul_mm_q6_k_f32_l4_lm", &err), err));
 1393        CL_CHECK(clReleaseProgram(prog));
 1394        GGML_LOG_CONT(".");
 1395    }
 1396
 1397    // mul_mm_f16_f32_kq_kqv
 1398    {
 1399#ifdef GGML_OPENCL_EMBED_KERNELS
 1400        const std::string kernel_src {
 1401            #include "mul_mm_f16_f32_kq_kqv.cl.h"
 1402        };
 1403#else
 1404        const std::string kernel_src = read_file("mul_mm_f16_f32_kq_kqv.cl");
 1405#endif
 1406        backend_ctx->program_mul_mm_f16_f32_kqv =
            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts + " -DKQV ");
 1408        backend_ctx->program_mul_mm_f16_f32_kq =
 1409            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 1410
 1411        CL_CHECK((backend_ctx->kernel_mul_mm_f16_f32_kqv = clCreateKernel(backend_ctx->program_mul_mm_f16_f32_kqv, "mul_mm_f16_f32_kqv", &err), err));
 1412        CL_CHECK((backend_ctx->kernel_mul_mm_f16_f32_kq = clCreateKernel(backend_ctx->program_mul_mm_f16_f32_kq, "mul_mm_f16_f32_kq", &err), err));
 1413        GGML_LOG_CONT(".");
 1414    }
 1415
 1416    // mul
 1417    {
 1418#ifdef GGML_OPENCL_EMBED_KERNELS
 1419        const std::string kernel_src {
 1420            #include "mul.cl.h"
 1421        };
 1422#else
 1423        const std::string kernel_src = read_file("mul.cl");
 1424#endif
 1425        backend_ctx->program_mul =
 1426            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 1427
 1428        CL_CHECK((backend_ctx->kernel_mul         = clCreateKernel(backend_ctx->program_mul, "kernel_mul", &err), err));
 1429        CL_CHECK((backend_ctx->kernel_mul_row     = clCreateKernel(backend_ctx->program_mul, "kernel_mul_row", &err), err));
 1430        CL_CHECK((backend_ctx->kernel_mul_f16     = clCreateKernel(backend_ctx->program_mul, "kernel_mul_f16", &err), err));
 1431        CL_CHECK((backend_ctx->kernel_mul_row_f16 = clCreateKernel(backend_ctx->program_mul, "kernel_mul_row_f16", &err), err));
 1432        GGML_LOG_CONT(".");
 1433    }
 1434
 1435    // norm
 1436    {
 1437#ifdef GGML_OPENCL_EMBED_KERNELS
 1438        const std::string kernel_src {
 1439            #include "norm.cl.h"
 1440        };
 1441#else
 1442        const std::string kernel_src = read_file("norm.cl");
 1443#endif
 1444        backend_ctx->program_norm =
 1445            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 1446
 1447        CL_CHECK((backend_ctx->kernel_norm         = clCreateKernel(backend_ctx->program_norm, "kernel_norm", &err), err));
 1448        CL_CHECK((backend_ctx->kernel_norm_mul_add = clCreateKernel(backend_ctx->program_norm, "kernel_norm_mul_add", &err), err));
 1449        GGML_LOG_CONT(".");
 1450    }
 1451
 1452    // relu
 1453    {
 1454#ifdef GGML_OPENCL_EMBED_KERNELS
 1455        const std::string kernel_src {
 1456            #include "relu.cl.h"
 1457        };
 1458#else
 1459        const std::string kernel_src = read_file("relu.cl");
 1460#endif
 1461        backend_ctx->program_relu =
 1462            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 1463
 1464        CL_CHECK((backend_ctx->kernel_relu = clCreateKernel(backend_ctx->program_relu, "kernel_relu", &err), err));
 1465        GGML_LOG_CONT(".");
 1466    }
 1467
 1468    // rms_norm
 1469    {
 1470#ifdef GGML_OPENCL_EMBED_KERNELS
 1471        const std::string kernel_src {
 1472            #include "rms_norm.cl.h"
 1473        };
 1474#else
 1475        const std::string kernel_src = read_file("rms_norm.cl");
 1476#endif
 1477        backend_ctx->program_rms_norm =
 1478            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 1479
 1480        CL_CHECK((backend_ctx->kernel_rms_norm     = clCreateKernel(backend_ctx->program_rms_norm, "kernel_rms_norm", &err), err));
 1481        CL_CHECK((backend_ctx->kernel_rms_norm_mul = clCreateKernel(backend_ctx->program_rms_norm, "kernel_rms_norm_mul", &err), err));
 1482        GGML_LOG_CONT(".");
 1483    }
 1484
 1485    // rope
 1486    {
 1487#ifdef GGML_OPENCL_EMBED_KERNELS
 1488        const std::string kernel_src {
 1489            #include "rope.cl.h"
 1490        };
 1491#else
 1492        const std::string kernel_src = read_file("rope.cl");
 1493#endif
 1494        backend_ctx->program_rope =
 1495            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 1496
 1497        CL_CHECK((backend_ctx->kernel_rope_norm_f32   = clCreateKernel(backend_ctx->program_rope, "kernel_rope_norm_f32", &err), err));
 1498        CL_CHECK((backend_ctx->kernel_rope_norm_f16   = clCreateKernel(backend_ctx->program_rope, "kernel_rope_norm_f16", &err), err));
 1499        CL_CHECK((backend_ctx->kernel_rope_neox_f32   = clCreateKernel(backend_ctx->program_rope, "kernel_rope_neox_f32", &err), err));
 1500        CL_CHECK((backend_ctx->kernel_rope_neox_f16   = clCreateKernel(backend_ctx->program_rope, "kernel_rope_neox_f16", &err), err));
 1501        CL_CHECK((backend_ctx->kernel_rope_multi_f32  = clCreateKernel(backend_ctx->program_rope, "kernel_rope_multi_f32", &err), err));
 1502        CL_CHECK((backend_ctx->kernel_rope_multi_f16  = clCreateKernel(backend_ctx->program_rope, "kernel_rope_multi_f16", &err), err));
 1503        CL_CHECK((backend_ctx->kernel_rope_vision_f32 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_vision_f32", &err), err));
 1504        CL_CHECK((backend_ctx->kernel_rope_vision_f16 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_vision_f16", &err), err));
 1505        GGML_LOG_CONT(".");
 1506    }
 1507
 1508    // scale
 1509    {
 1510#ifdef GGML_OPENCL_EMBED_KERNELS
 1511        const std::string kernel_src {
 1512            #include "scale.cl.h"
 1513        };
 1514#else
 1515        const std::string kernel_src = read_file("scale.cl");
 1516#endif
 1517        cl_program prog =
 1518            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 1519
 1520        CL_CHECK((backend_ctx->kernel_scale_f32   = clCreateKernel(prog, "kernel_scale_f32", &err), err));
 1521        CL_CHECK((backend_ctx->kernel_scale_f32_4 = clCreateKernel(prog, "kernel_scale_f32_4", &err), err));
 1522        CL_CHECK(clReleaseProgram(prog));
 1523        GGML_LOG_CONT(".");
 1524    }
 1525
 1526    // silu
 1527    {
 1528#ifdef GGML_OPENCL_EMBED_KERNELS
 1529        const std::string kernel_src {
 1530            #include "silu.cl.h"
 1531        };
 1532#else
 1533        const std::string kernel_src = read_file("silu.cl");
 1534#endif
 1535        backend_ctx->program_silu =
 1536            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 1537
 1538        CL_CHECK((backend_ctx->kernel_silu   = clCreateKernel(backend_ctx->program_silu, "kernel_silu", &err), err));
 1539        CL_CHECK((backend_ctx->kernel_silu_4 = clCreateKernel(backend_ctx->program_silu, "kernel_silu_4", &err), err));
 1540        GGML_LOG_CONT(".");
 1541    }
 1542
 1543    // softmax_f32
 1544    {
 1545#ifdef GGML_OPENCL_EMBED_KERNELS
 1546        const std::string kernel_src {
 1547            #include "softmax_f32.cl.h"
 1548        };
 1549#else
 1550        const std::string kernel_src = read_file("softmax_f32.cl");
 1551#endif
 1552        backend_ctx->program_softmax_f32 =
 1553            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 1554
 1555        CL_CHECK((backend_ctx->kernel_soft_max = clCreateKernel(backend_ctx->program_softmax_f32, "kernel_soft_max", &err), err));
 1556        GGML_LOG_CONT(".");
 1557    }
 1558
 1559    // softmax_f16
 1560    {
 1561#ifdef GGML_OPENCL_EMBED_KERNELS
 1562        const std::string kernel_src {
 1563            #include "softmax_f16.cl.h"
 1564        };
 1565#else
 1566        const std::string kernel_src = read_file("softmax_f16.cl");
 1567#endif
 1568        backend_ctx->program_softmax_f16 =
 1569            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 1570
 1571        CL_CHECK((backend_ctx->kernel_soft_max_f16 = clCreateKernel(backend_ctx->program_softmax_f16, "kernel_soft_max_f16", &err), err));
 1572        GGML_LOG_CONT(".");
 1573    }
 1574
 1575    // softmax_4_f32
 1576    {
 1577#ifdef GGML_OPENCL_EMBED_KERNELS
 1578        const std::string kernel_src {
 1579            #include "softmax_4_f32.cl.h"
 1580        };
 1581#else
 1582        const std::string kernel_src = read_file("softmax_4_f32.cl");
 1583#endif
 1584        backend_ctx->program_softmax_4_f32 =
 1585            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 1586
 1587        CL_CHECK((backend_ctx->kernel_soft_max_4 = clCreateKernel(backend_ctx->program_softmax_4_f32, "kernel_soft_max_4", &err), err));
 1588        GGML_LOG_CONT(".");
 1589    }
 1590
 1591    // softmax_4_f16
 1592    {
 1593#ifdef GGML_OPENCL_EMBED_KERNELS
 1594        const std::string kernel_src {
 1595            #include "softmax_4_f16.cl.h"
 1596        };
 1597#else
 1598        const std::string kernel_src = read_file("softmax_4_f16.cl");
 1599#endif
 1600        backend_ctx->program_softmax_4_f16 =
 1601            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 1602
 1603        CL_CHECK((backend_ctx->kernel_soft_max_4_f16 = clCreateKernel(backend_ctx->program_softmax_4_f16, "kernel_soft_max_4_f16", &err), err));
 1604        GGML_LOG_CONT(".");
 1605    }
 1606
 1607    // flash_attn
 1608    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src_f16 {
            #include "flash_attn_f16.cl.h"
        };
        const std::string kernel_src_f32 {
            #include "flash_attn_f32.cl.h"
        };
        const std::string kernel_src_f32_f16 {
            #include "flash_attn_f32_f16.cl.h"
        };
#else
        const std::string kernel_src_f16 = read_file("flash_attn_f16.cl");
        const std::string kernel_src_f32 = read_file("flash_attn_f32.cl");
        const std::string kernel_src_f32_f16 = read_file("flash_attn_f32_f16.cl");
#endif
 1624
 1625        if (!kernel_src_f16.empty() && !kernel_src_f32.empty() && !kernel_src_f32_f16.empty()) {
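            // dk/dv are the K/V head sizes baked in via -D DK/-D DV; bm/bn are the
            // work-group tile sizes passed as BLOCK_M/BLOCK_N (see OPTS below)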
 1626            const struct { int dk; int dv; int bm; int bn; } fa_dims[] = {
 1627                { 40,  40, 32, 32}, { 64,  64, 64, 64}, { 80,  80, 64, 32}, { 96,  96, 64, 32},
 1628                {112, 112, 32, 32}, {128, 128, 32, 32}, {192, 128, 16, 16},
 1629                {192, 192, 16, 16}, {256, 256, 16, 16},
 1630            };
 1631
 1632            for (size_t i = 0; i < sizeof(fa_dims)/sizeof(fa_dims[0]); ++i) {
 1633                const int dk = fa_dims[i].dk;
 1634                const int dv = fa_dims[i].dv;
 1635                const int bm = fa_dims[i].bm;
 1636                const int bn = fa_dims[i].bn;
 1637                std::string OPTS = compile_opts +
 1638                    " -D DK=" + std::to_string(dk) +
 1639                    " -D DV=" + std::to_string(dv) +
 1640                    " -D BLOCK_M=" + std::to_string(bm) +
 1641                    " -D BLOCK_N=" + std::to_string(bn);
 1642
 1643                cl_program prog_f16 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f16.c_str(), OPTS);
 1644                cl_kernel k_f16, k_f16_q1;
 1645                CL_CHECK((k_f16 = clCreateKernel(prog_f16, "flash_attn_f16", &err), err));
 1646                CL_CHECK((k_f16_q1 = clCreateKernel(prog_f16, "flash_attn_f16_q1", &err), err));
 1647                backend_ctx->kernels_flash_attn_f16[{dk, dv}] = k_f16;
 1648                backend_ctx->kernels_flash_attn_f16_q1[{dk, dv}] = k_f16_q1;
 1649                CL_CHECK(clReleaseProgram(prog_f16));
 1650
 1651                cl_program prog_f32 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f32.c_str(), OPTS);
 1652                cl_kernel k_f32, k_f32_q1;
 1653                CL_CHECK((k_f32 = clCreateKernel(prog_f32, "flash_attn_f32", &err), err));
 1654                CL_CHECK((k_f32_q1 = clCreateKernel(prog_f32, "flash_attn_f32_q1", &err), err));
 1655                backend_ctx->kernels_flash_attn_f32[{dk, dv}] = k_f32;
 1656                backend_ctx->kernels_flash_attn_f32_q1[{dk, dv}] = k_f32_q1;
 1657                CL_CHECK(clReleaseProgram(prog_f32));
 1658
 1659                cl_program prog_f32_f16 = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f32_f16.c_str(), OPTS);
 1660                cl_kernel k_f32_f16, k_f32_f16_q1;
 1661                CL_CHECK((k_f32_f16 = clCreateKernel(prog_f32_f16, "flash_attn_f32_f16", &err), err));
 1662                CL_CHECK((k_f32_f16_q1 = clCreateKernel(prog_f32_f16, "flash_attn_f32_f16_q1", &err), err));
 1663                backend_ctx->kernels_flash_attn_f32_f16[{dk, dv}] = k_f32_f16;
 1664                backend_ctx->kernels_flash_attn_f32_f16_q1[{dk, dv}] = k_f32_f16_q1;
 1665                CL_CHECK(clReleaseProgram(prog_f32_f16));
 1666
 1667                backend_ctx->kernels_flash_attn_bm[{dk, dv}] = bm;
 1668                backend_ctx->kernels_flash_attn_bn[{dk, dv}] = bn;
 1669            }
 1670            GGML_LOG_CONT(".");
 1671        }
 1672    }
 1673
 1674    // argsort
 1675    {
 1676#ifdef GGML_OPENCL_EMBED_KERNELS
 1677        const std::string kernel_src {
 1678            #include "argsort.cl.h"
 1679        };
 1680#else
 1681        const std::string kernel_src = read_file("argsort.cl");
 1682#endif
 1683        backend_ctx->program_argsort_f32_i32 =
 1684            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 1685
 1686        CL_CHECK((backend_ctx->kernel_argsort_f32_i32 = clCreateKernel(backend_ctx->program_argsort_f32_i32, "kernel_argsort_f32_i32", &err), err));
 1687        GGML_LOG_CONT(".");
 1688    }
 1689
 1690    // div
 1691    {
 1692#ifdef GGML_OPENCL_EMBED_KERNELS
 1693        const std::string kernel_src {
 1694            #include "div.cl.h"
 1695        };
 1696#else
 1697        const std::string kernel_src = read_file("div.cl");
 1698#endif
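        // div builds with its own options (shadowing the outer compile_opts) so that
        // -cl-finite-math-only can be enabled for these kernels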
        std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
                                   " -cl-mad-enable -cl-finite-math-only ";
 1701
 1702        backend_ctx->program_div =
 1703            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 1704
 1705        CL_CHECK((backend_ctx->kernel_div         = clCreateKernel(backend_ctx->program_div, "kernel_div", &err), err));
 1706        CL_CHECK((backend_ctx->kernel_div_row     = clCreateKernel(backend_ctx->program_div, "kernel_div_row", &err), err));
 1707        CL_CHECK((backend_ctx->kernel_div_f16     = clCreateKernel(backend_ctx->program_div, "kernel_div_f16", &err), err));
 1708        CL_CHECK((backend_ctx->kernel_div_row_f16 = clCreateKernel(backend_ctx->program_div, "kernel_div_row_f16", &err), err));
 1709        GGML_LOG_CONT(".");
 1710    }
 1711
 1712    // sqr
 1713    {
 1714#ifdef GGML_OPENCL_EMBED_KERNELS
 1715        const std::string kernel_src {
 1716            #include "sqr.cl.h"
 1717        };
 1718#else
 1719        const std::string kernel_src = read_file("sqr.cl");
 1720#endif
 1721        cl_program prog =
 1722            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 1723
 1724        CL_CHECK((backend_ctx->kernel_sqr_cont_f32     = clCreateKernel(prog, "kernel_sqr_cont_f32", &err), err));
 1725        CL_CHECK((backend_ctx->kernel_sqr_cont_f32_4   = clCreateKernel(prog, "kernel_sqr_cont_f32_4", &err), err));
 1726        CL_CHECK((backend_ctx->kernel_sqr_cont_f16     = clCreateKernel(prog, "kernel_sqr_cont_f16", &err), err));
 1727        CL_CHECK((backend_ctx->kernel_sqr_cont_f16_4   = clCreateKernel(prog, "kernel_sqr_cont_f16_4", &err), err));
 1728
 1729        CL_CHECK(clReleaseProgram(prog));
 1730        GGML_LOG_CONT(".");
 1731    }
 1732
 1733    // sqrt
 1734    {
 1735#ifdef GGML_OPENCL_EMBED_KERNELS
 1736        const std::string kernel_src {
 1737            #include "sqrt.cl.h"
 1738        };
 1739#else
 1740        const std::string kernel_src = read_file("sqrt.cl");
 1741#endif
 1742        cl_program prog =
 1743            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 1744
 1745        CL_CHECK((backend_ctx->kernel_sqrt_cont_f32     = clCreateKernel(prog, "kernel_sqrt_cont_f32", &err), err));
 1746        CL_CHECK((backend_ctx->kernel_sqrt_cont_f32_4   = clCreateKernel(prog, "kernel_sqrt_cont_f32_4", &err), err));
 1747        CL_CHECK((backend_ctx->kernel_sqrt_cont_f16     = clCreateKernel(prog, "kernel_sqrt_cont_f16", &err), err));
 1748        CL_CHECK((backend_ctx->kernel_sqrt_cont_f16_4   = clCreateKernel(prog, "kernel_sqrt_cont_f16_4", &err), err));
 1749
 1750        CL_CHECK(clReleaseProgram(prog));
 1751        GGML_LOG_CONT(".");
 1752    }
 1753
 1754    // mean
 1755    {
 1756#ifdef GGML_OPENCL_EMBED_KERNELS
 1757        const std::string kernel_src {
 1758            #include "mean.cl.h"
 1759        };
 1760#else
 1761        const std::string kernel_src = read_file("mean.cl");
 1762#endif
 1763        cl_program prog =
 1764            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 1765
 1766        CL_CHECK((backend_ctx->kernel_mean_f32 = clCreateKernel(prog, "kernel_mean_f32", &err), err));
 1767
 1768        CL_CHECK(clReleaseProgram(prog));
 1769        GGML_LOG_CONT(".");
 1770    }
 1771
 1772    // sub
 1773    {
 1774#ifdef GGML_OPENCL_EMBED_KERNELS
 1775        const std::string kernel_src {
 1776            #include "sub.cl.h"
 1777        };
 1778#else
 1779        const std::string kernel_src = read_file("sub.cl");
 1780#endif
 1781        backend_ctx->program_sub =
 1782            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 1783
 1784        CL_CHECK((backend_ctx->kernel_sub         = clCreateKernel(backend_ctx->program_sub, "kernel_sub", &err), err));
 1785        CL_CHECK((backend_ctx->kernel_sub_row     = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row", &err), err));
 1786        CL_CHECK((backend_ctx->kernel_sub_f16     = clCreateKernel(backend_ctx->program_sub, "kernel_sub_f16", &err), err));
 1787        CL_CHECK((backend_ctx->kernel_sub_row_f16 = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row_f16", &err), err));
 1788        GGML_LOG_CONT(".");
 1789    }
 1790
 1791    // sum_rows
 1792    {
 1793#ifdef GGML_OPENCL_EMBED_KERNELS
 1794        const std::string kernel_src {
 1795            #include "sum_rows.cl.h"
 1796        };
 1797#else
 1798        const std::string kernel_src = read_file("sum_rows.cl");
 1799#endif
 1800        backend_ctx->program_sum_rows_f32 =
 1801            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 1802
 1803        CL_CHECK((backend_ctx->kernel_sum_rows_f32 = clCreateKernel(backend_ctx->program_sum_rows_f32, "kernel_sum_rows_f32", &err), err));
 1804        GGML_LOG_CONT(".");
 1805    }
 1806
 1807    // sigmoid
 1808    {
 1809#ifdef GGML_OPENCL_EMBED_KERNELS
 1810        const std::string kernel_src {
 1811            #include "sigmoid.cl.h"
 1812        };
 1813#else
 1814        const std::string kernel_src = read_file("sigmoid.cl");
 1815#endif
 1816        backend_ctx->program_sigmoid =
 1817            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 1818
 1819        CL_CHECK((backend_ctx->kernel_sigmoid_f32 = clCreateKernel(backend_ctx->program_sigmoid, "kernel_sigmoid_f32", &err), err));
 1820        CL_CHECK((backend_ctx->kernel_sigmoid_f16 = clCreateKernel(backend_ctx->program_sigmoid, "kernel_sigmoid_f16", &err), err));
 1821        GGML_LOG_CONT(".");
 1822    }
 1823
 1824    // group_norm
 1825    {
 1826#ifdef GGML_OPENCL_EMBED_KERNELS
 1827        const std::string kernel_src {
 1828            #include "group_norm.cl.h"
 1829        };
 1830#else
 1831        const std::string kernel_src = read_file("group_norm.cl");
 1832#endif
 1833        backend_ctx->program_group_norm =
 1834            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 1835
 1836        CL_CHECK((backend_ctx->kernel_group_norm         = clCreateKernel(backend_ctx->program_group_norm, "kernel_group_norm", &err), err));
 1837        CL_CHECK((backend_ctx->kernel_group_norm_mul_add = clCreateKernel(backend_ctx->program_group_norm, "kernel_group_norm_mul_add", &err), err));
 1838        GGML_LOG_CONT(".");
 1839    }
 1840
 1841    // repeat
 1842    {
 1843#ifdef GGML_OPENCL_EMBED_KERNELS
 1844        const std::string kernel_src {
 1845            #include "repeat.cl.h"
 1846        };
 1847#else
 1848        const std::string kernel_src = read_file("repeat.cl");
 1849#endif
 1850        cl_program prog =
 1851            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 1852        CL_CHECK((backend_ctx->kernel_repeat_f32 = clCreateKernel(prog, "kernel_repeat_f32", &err), err));
 1853        CL_CHECK(clReleaseProgram(prog));
 1854        GGML_LOG_CONT(".");
 1855    }
 1856
 1857    // pad
 1858    {
 1859#ifdef GGML_OPENCL_EMBED_KERNELS
 1860        const std::string kernel_src {
 1861            #include "pad.cl.h"
 1862        };
 1863#else
 1864        const std::string kernel_src = read_file("pad.cl");
 1865#endif
 1866        if (!kernel_src.empty()) {
 1867            backend_ctx->program_pad =
 1868                build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 1869            CL_CHECK((backend_ctx->kernel_pad = clCreateKernel(backend_ctx->program_pad, "kernel_pad", &err), err));
 1870            GGML_LOG_CONT(".");
 1871        } else {
 1872            GGML_LOG_WARN("ggml_opencl: pad kernel source not found or empty. Pad operations will not be available.\n");
 1873            backend_ctx->program_pad = nullptr;
 1874            backend_ctx->kernel_pad = nullptr;
 1875        }
 1876    }
 1877
 1878    // tanh
 1879    {
 1880#ifdef GGML_OPENCL_EMBED_KERNELS
 1881        const std::string kernel_src {
 1882            #include "tanh.cl.h"
 1883        };
 1884#else
 1885        const std::string kernel_src = read_file("tanh.cl");
 1886#endif
 1887        cl_program prog =
 1888            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 1889        CL_CHECK((backend_ctx->kernel_tanh_f32    = clCreateKernel(prog, "kernel_tanh_f32", &err), err));
 1890        CL_CHECK((backend_ctx->kernel_tanh_f32_4  = clCreateKernel(prog, "kernel_tanh_f32_4", &err), err));
 1891        CL_CHECK((backend_ctx->kernel_tanh_f32_nc = clCreateKernel(prog, "kernel_tanh_f32_nc", &err), err));
 1892        CL_CHECK((backend_ctx->kernel_tanh_f16    = clCreateKernel(prog, "kernel_tanh_f16", &err), err));
 1893        CL_CHECK((backend_ctx->kernel_tanh_f16_4  = clCreateKernel(prog, "kernel_tanh_f16_4", &err), err));
 1894        CL_CHECK((backend_ctx->kernel_tanh_f16_nc = clCreateKernel(prog, "kernel_tanh_f16_nc", &err), err));
 1895        CL_CHECK(clReleaseProgram(prog));
 1896        GGML_LOG_CONT(".");
 1897    }
 1898
 1899    // expm1
 1900    {
 1901#ifdef GGML_OPENCL_EMBED_KERNELS
 1902        const std::string kernel_src {
 1903            #include "expm1.cl.h"
 1904        };
 1905#else
 1906        const std::string kernel_src = read_file("expm1.cl");
 1907#endif
        if (!kernel_src.empty()) {
            cl_program prog =
                build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
            CL_CHECK((backend_ctx->kernel_expm1_f32_nd = clCreateKernel(prog, "kernel_expm1_f32_nd", &err), err));
            CL_CHECK((backend_ctx->kernel_expm1_f16_nd = clCreateKernel(prog, "kernel_expm1_f16_nd", &err), err));
            // release the program only when it was actually built;
            // clReleaseProgram(nullptr) returns CL_INVALID_PROGRAM and would trip CL_CHECK
            CL_CHECK(clReleaseProgram(prog));
            GGML_LOG_CONT(".");
        } else {
            GGML_LOG_WARN("ggml_opencl: expm1 kernel source not found or empty. Expm1 operation will not be available.\n");
            backend_ctx->kernel_expm1_f32_nd = nullptr;
            backend_ctx->kernel_expm1_f16_nd = nullptr;
        }
 1922    }
 1923
 1924    // softplus
 1925    {
 1926#ifdef GGML_OPENCL_EMBED_KERNELS
 1927        const std::string kernel_src {
 1928            #include "softplus.cl.h"
 1929        };
 1930#else
 1931        const std::string kernel_src = read_file("softplus.cl");
 1932#endif
        if (!kernel_src.empty()) {
            cl_program prog =
                build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
            CL_CHECK((backend_ctx->kernel_softplus_f32_nd = clCreateKernel(prog, "kernel_softplus_f32_nd", &err), err));
            CL_CHECK((backend_ctx->kernel_softplus_f16_nd = clCreateKernel(prog, "kernel_softplus_f16_nd", &err), err));
            // as in expm1 above, only release a program that was actually built
            CL_CHECK(clReleaseProgram(prog));
            GGML_LOG_CONT(".");
        } else {
            GGML_LOG_WARN("ggml_opencl: softplus kernel source not found or empty. Softplus operation will not be available.\n");
            backend_ctx->kernel_softplus_f32_nd = nullptr;
            backend_ctx->kernel_softplus_f16_nd = nullptr;
        }
 1947    }
 1948
 1949    // upscale
 1950    {
 1951#ifdef GGML_OPENCL_EMBED_KERNELS
 1952        const std::string kernel_src {
 1953            #include "upscale.cl.h"
 1954        };
 1955#else
 1956        const std::string kernel_src = read_file("upscale.cl");
 1957#endif
 1958        if (!kernel_src.empty()) {
 1959            backend_ctx->program_upscale =
 1960                build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
            CL_CHECK((backend_ctx->kernel_upscale = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale", &err), err));
            // the bilinear variant is optional, so probe for it without going through CL_CHECK;
            // the program is known to be valid here, or the CL_CHECK above would have asserted
            cl_int err_bilinear;
            backend_ctx->kernel_upscale_bilinear = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale_bilinear", &err_bilinear);
            if (err_bilinear != CL_SUCCESS) {
                GGML_LOG_WARN("ggml_opencl: kernel_upscale_bilinear not found in upscale.cl. Bilinear upscale will not be available. Error: %d\n", err_bilinear);
                backend_ctx->kernel_upscale_bilinear = nullptr;
            }
 1972            GGML_LOG_CONT(".");
 1973        } else {
 1974            GGML_LOG_WARN("ggml_opencl: upscale kernel source not found or empty. Upscale operations will not be available.\n");
 1975            backend_ctx->program_upscale = nullptr;
 1976            backend_ctx->kernel_upscale = nullptr;
 1977            backend_ctx->kernel_upscale_bilinear = nullptr;
 1978        }
 1979    }
 1980
 1981    // concat
 1982    {
 1983#ifdef GGML_OPENCL_EMBED_KERNELS
 1984        const std::string kernel_src {
 1985            #include "concat.cl.h"
 1986        };
 1987#else
 1988        const std::string kernel_src = read_file("concat.cl");
 1989#endif
 1990        cl_program prog =
 1991            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 1992        CL_CHECK((backend_ctx->kernel_concat_f32 = clCreateKernel(prog, "kernel_concat_f32", &err), err));
 1993        CL_CHECK(clReleaseProgram(prog));
 1994        GGML_LOG_CONT(".");
 1995    }
 1996
 1997    // timestep_embedding
 1998    {
 1999#ifdef GGML_OPENCL_EMBED_KERNELS
 2000        const std::string kernel_src {
 2001            #include "tsembd.cl.h"
 2002        };
#else
        const std::string kernel_src = read_file("tsembd.cl");
 2006#endif
 2007        if (!kernel_src.empty()) {
 2008            backend_ctx->program_tsembd =
 2009                build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 2010            CL_CHECK((backend_ctx->kernel_timestep_embedding = clCreateKernel(backend_ctx->program_tsembd, "kernel_timestep_embedding", &err), err));
 2011            GGML_LOG_CONT(".");
 2012        } else {
 2013            GGML_LOG_WARN("ggml_opencl: timestep_embedding kernel source not found or empty. This op will not be available.\n");
 2014            backend_ctx->program_tsembd = nullptr;
 2015            backend_ctx->kernel_timestep_embedding = nullptr;
 2016        }
 2017    }
 2018
 2019    // set_rows
 2020    {
 2021#ifdef GGML_OPENCL_EMBED_KERNELS
 2022        const std::string kernel_src {
 2023            #include "set_rows.cl.h"
 2024        };
 2025#else
 2026        const std::string kernel_src = read_file("set_rows.cl");
 2027#endif
 2028        backend_ctx->program_set_rows =
 2029            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 2030
 2031        CL_CHECK((backend_ctx->kernel_set_rows_f32_i64 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f32_i64", &err), err));
 2032        CL_CHECK((backend_ctx->kernel_set_rows_f32_i32 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f32_i32", &err), err));
 2033        CL_CHECK((backend_ctx->kernel_set_rows_f16_i64 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f16_i64", &err), err));
 2034        CL_CHECK((backend_ctx->kernel_set_rows_f16_i32 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f16_i32", &err), err));
 2035        GGML_LOG_CONT(".");
 2036    }
 2037
    // conv2d
    {
#ifdef GGML_OPENCL_EMBED_KERNELS
        const std::string kernel_src {
            #include "conv2d.cl.h"
        };
        const std::string kernel_src_f16_f32 {
            #include "conv2d_f16_f32.cl.h"
        };
#else
        const std::string kernel_src = read_file("conv2d.cl");
        const std::string kernel_src_f16_f32 = read_file("conv2d_f16_f32.cl");
#endif
        if (!kernel_src.empty()) {
            backend_ctx->program_conv_2d_f16 =
                build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts + " -DUSE_FP16=1");
            CL_CHECK((backend_ctx->kernel_conv_2d_f16 = clCreateKernel(backend_ctx->program_conv_2d_f16, "kernel_conv_2d", &err), err));
            GGML_LOG_CONT(".");
            backend_ctx->program_conv_2d_f32 =
                build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
            CL_CHECK((backend_ctx->kernel_conv_2d_f32 = clCreateKernel(backend_ctx->program_conv_2d_f32, "kernel_conv_2d", &err), err));
            GGML_LOG_CONT(".");
        } else {
            GGML_LOG_WARN("ggml_opencl: conv2d kernel source not found or empty. This op will not be available.\n");
            backend_ctx->program_conv_2d_f16 = nullptr;
            backend_ctx->kernel_conv_2d_f16 = nullptr;
            backend_ctx->program_conv_2d_f32 = nullptr;
            backend_ctx->kernel_conv_2d_f32 = nullptr;
        }
        if (!kernel_src_f16_f32.empty()) {
            backend_ctx->program_conv_2d_f16_f32 =
                build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_f16_f32.c_str(), compile_opts);
            CL_CHECK((backend_ctx->kernel_conv_2d_f16_f32 = clCreateKernel(backend_ctx->program_conv_2d_f16_f32, "kernel_conv_2d", &err), err));
            GGML_LOG_CONT(".");
        } else {
            GGML_LOG_WARN("ggml_opencl: conv2d_f16_f32 kernel source not found or empty. This op will not be available.\n");
            backend_ctx->program_conv_2d_f16_f32 = nullptr;
            backend_ctx->kernel_conv_2d_f16_f32 = nullptr;
        }
    }
 2078
 2079    // ssm_conv
 2080    {
 2081#ifdef GGML_OPENCL_EMBED_KERNELS
 2082        const std::string kernel_src {
 2083            #include "ssm_conv.cl.h"
 2084        };
 2085#else
 2086        const std::string kernel_src = read_file("ssm_conv.cl");
 2087#endif
 2088        cl_program prog =
 2089            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 2090
 2091        CL_CHECK((backend_ctx->kernel_ssm_conv_f32_f32   = clCreateKernel(prog, "kernel_ssm_conv_f32_f32", &err), err));
 2092        CL_CHECK((backend_ctx->kernel_ssm_conv_f32_f32_4 = clCreateKernel(prog, "kernel_ssm_conv_f32_f32_4", &err), err));
 2093        CL_CHECK(clReleaseProgram(prog));
 2094        GGML_LOG_CONT(".");
 2095    }
 2096
 2097    // mul_mv_id_q4_0_f32_8x_flat
 2098    {
 2099#ifdef GGML_OPENCL_EMBED_KERNELS
 2100        const std::string kernel_src {
 2101            #include "mul_mv_id_q4_0_f32_8x_flat.cl.h"
 2102        };
 2103#else
 2104        const std::string kernel_src = read_file("mul_mv_id_q4_0_f32_8x_flat.cl");
 2105#endif
 2106        backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat =
 2107            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 2108
 2109        CL_CHECK((backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat, "kernel_mul_mv_id_q4_0_f32_8x_flat", &err), err));
 2110        GGML_LOG_CONT(".");
 2111    }
 2112
 2113    // mul_mv_id_q8_0_f32
 2114    {
 2115#ifdef GGML_OPENCL_EMBED_KERNELS
 2116        const std::string kernel_src {
 2117            #include "mul_mv_id_q8_0_f32.cl.h"
 2118        };
 2119#else
 2120        const std::string kernel_src = read_file("mul_mv_id_q8_0_f32.cl");
 2121#endif
 2122        backend_ctx->program_mul_mv_id_q8_0_f32 =
 2123            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 2124
 2125        CL_CHECK((backend_ctx->kernel_mul_mv_id_q8_0_f32 = clCreateKernel(backend_ctx->program_mul_mv_id_q8_0_f32, "kernel_mul_mv_id_q8_0_f32", &err), err));
 2126        GGML_LOG_CONT(".");
 2127    }
 2128
 2129    // mul_mv_id_q8_0_f32_flat
 2130    {
 2131#ifdef GGML_OPENCL_EMBED_KERNELS
 2132        const std::string kernel_src {
 2133            #include "mul_mv_id_q8_0_f32_flat.cl.h"
 2134        };
 2135#else
 2136        const std::string kernel_src = read_file("mul_mv_id_q8_0_f32_flat.cl");
 2137#endif
 2138        backend_ctx->program_mul_mv_id_q8_0_f32_flat =
 2139            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 2140
 2141        CL_CHECK((backend_ctx->kernel_mul_mv_id_q8_0_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_id_q8_0_f32_flat, "kernel_mul_mv_id_q8_0_f32_flat", &err), err));
 2142        GGML_LOG_CONT(".");
 2143    }
 2144
 2145    // mul_mv_id_mxfp4_f32
 2146    {
 2147#ifdef GGML_OPENCL_EMBED_KERNELS
 2148        const std::string kernel_src {
 2149            #include "mul_mv_id_mxfp4_f32.cl.h"
 2150        };
 2151#else
 2152        const std::string kernel_src = read_file("mul_mv_id_mxfp4_f32.cl");
 2153#endif
 2154        backend_ctx->program_mul_mv_id_mxfp4_f32 =
 2155            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 2156
 2157        CL_CHECK((backend_ctx->kernel_mul_mv_id_mxfp4_f32 = clCreateKernel(backend_ctx->program_mul_mv_id_mxfp4_f32, "kernel_mul_mv_id_mxfp4_f32", &err), err));
 2158        GGML_LOG_CONT(".");
 2159    }
 2160
 2161    // mul_mv_id_mxfp4_f32_flat
 2162    {
 2163#ifdef GGML_OPENCL_EMBED_KERNELS
 2164        const std::string kernel_src {
 2165            #include "mul_mv_id_mxfp4_f32_flat.cl.h"
 2166        };
 2167#else
 2168        const std::string kernel_src = read_file("mul_mv_id_mxfp4_f32_flat.cl");
 2169#endif
 2170        backend_ctx->program_mul_mv_id_mxfp4_f32_flat =
 2171            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 2172
 2173        CL_CHECK((backend_ctx->kernel_mul_mv_id_mxfp4_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_id_mxfp4_f32_flat, "kernel_mul_mv_id_mxfp4_f32_flat", &err), err));
 2174        GGML_LOG_CONT(".");
 2175    }
 2176
 2177    // Adreno kernels
 2178#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
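    // The programs below are specific to Qualcomm Adreno GPUs; the gemv variants
    // bake the device wave size in via -DSIMDGROUP_WIDTH.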
 2179    // transpose
 2180    {
 2181#ifdef GGML_OPENCL_EMBED_KERNELS
 2182        const std::string kernel_src {
 2183            #include "transpose.cl.h"
 2184        };
 2185#else
 2186        const std::string kernel_src = read_file("transpose.cl");
 2187#endif
 2188        backend_ctx->program_transpose =
 2189            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
 2190
 2191        CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32_16", &err), err));
 2192        CL_CHECK((backend_ctx->kernel_transpose_32    = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32", &err), err));
 2193        CL_CHECK((backend_ctx->kernel_transpose_16    = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16", &err), err));
 2194        CL_CHECK((backend_ctx->kernel_transpose_16_buf = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_buf", &err), err));
 2195        CL_CHECK((backend_ctx->kernel_transpose_16_4x1 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16_4x1", &err), err));
 2196        GGML_LOG_CONT(".");
 2197    }
 2198
 2199    // gemv_noshuffle_general
 2200    {
 2201        std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
 2202                                       " -cl-mad-enable "
 2203                                       " -DSIMDGROUP_WIDTH=" +
 2204                                       std::to_string(backend_ctx->adreno_wave_size);
 2205        if (backend_ctx->has_vector_subgroup_broadcast) {
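            // note: "VECTOR_SUB_GROUP_BROADCAT" (sic) must match the spelling the
            // .cl kernel sources test for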
 2206            CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
 2207        }
 2208
 2209#ifdef GGML_OPENCL_EMBED_KERNELS
 2210        const std::string kernel_src_CL_gemv_general {
 2211            #include "gemv_noshuffle_general.cl.h"
 2212        };
 2213#else
 2214        const std::string kernel_src_CL_gemv_general = read_file("gemv_noshuffle_general.cl");
 2215#endif
 2216
 2217        backend_ctx->program_CL_gemv_general = build_program_from_source(
 2218            backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv_general.c_str(), CL_gemv_compile_opts);
 2219
 2220        CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err));
 2221        GGML_LOG_CONT(".");
 2222    }
 2223
 2224    // gemv_noshuffle
 2225    {
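        // Each specialization below bakes the A-matrix strides into the program at
        // build time (-DLINE_STRIDE_A/-DBLOCK_STRIDE_A), one program per weight shape.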
 2226        // Gemv 2048, 16384
 2227        std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
 2228            " -cl-mad-enable "
 2229            " -DLINE_STRIDE_A=2048 "
 2230            " -DBLOCK_STRIDE_A=16384 "
 2231            " -DSIMDGROUP_WIDTH=" +
 2232            std::to_string(backend_ctx->adreno_wave_size);
 2233        if (backend_ctx->has_vector_subgroup_broadcast) {
 2234            CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
 2235        }
 2236
 2237#ifdef GGML_OPENCL_EMBED_KERNELS
 2238        const std::string kernel_src_CL_gemv {
 2239            #include "gemv_noshuffle.cl.h"
 2240        };
 2241#else
 2242        const std::string kernel_src_CL_gemv = read_file("gemv_noshuffle.cl");
 2243#endif
 2244
 2245        backend_ctx->program_CL_gemv_4096_1_4096 = build_program_from_source(
 2246            backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
 2247        CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err));
 2248        GGML_LOG_CONT(".");
 2249
 2250        // Gemv 2048, 16384
 2251        CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
 2252            " -cl-mad-enable "
 2253            " -DLINE_STRIDE_A=2048 "
 2254            " -DBLOCK_STRIDE_A=16384 "
 2255            " -DSIMDGROUP_WIDTH=" +
 2256            std::to_string(backend_ctx->adreno_wave_size);
 2257        if (backend_ctx->has_vector_subgroup_broadcast) {
 2258            CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
 2259        }
 2260
 2261        backend_ctx->program_CL_gemv_4096_1_11008 = build_program_from_source(
 2262            backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
 2263        CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err));
 2264        GGML_LOG_CONT(".");
 2265
 2266        // Gemv 5504, 44032
 2267        CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
 2268            " -cl-mad-enable "
 2269            " -DLINE_STRIDE_A=5504 "
 2270            " -DBLOCK_STRIDE_A=44032 "
 2271            " -DSIMDGROUP_WIDTH=" +
 2272            std::to_string(backend_ctx->adreno_wave_size);
 2273        if (backend_ctx->has_vector_subgroup_broadcast) {
 2274            CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
 2275        }
 2276
 2277        backend_ctx->program_CL_gemv_11008_1_4096 = build_program_from_source(
 2278            backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
 2279        CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err));
 2280        GGML_LOG_CONT(".");
 2281
 2282        // Gemv 16000, 128000
 2283        CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
 2284            " -cl-mad-enable "
 2285            " -DLINE_STRIDE_A=16000 "
 2286            " -DBLOCK_STRIDE_A=128000 "
 2287            " -DSIMDGROUP_WIDTH=" +
 2288            std::to_string(backend_ctx->adreno_wave_size);
 2289
 2290        if (backend_ctx->has_vector_subgroup_broadcast) {
 2291            CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
 2292        }
 2293
 2294        backend_ctx->program_CL_gemv_32000_1_4096 = build_program_from_source(
 2295            backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
 2296        CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_32000_1_4096, "kernel_gemv_noshuffle", &err), err));
 2297        GGML_LOG_CONT(".");
 2298    }
 2299
 2300    // mul_mat_Ab_Bi_8x4
 2301    {
 2302#ifdef GGML_OPENCL_EMBED_KERNELS
 2303        const std::string kernel_src_CL_gemm {
 2304            #include "mul_mat_Ab_Bi_8x4.cl.h"
 2305        };
 2306#else
 2307        const std::string kernel_src_CL_gemm = read_file("mul_mat_Ab_Bi_8x4.cl");
 2308#endif
 2309        backend_ctx->program_CL_gemm = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_CL_gemm.c_str(), compile_opts);
 2310        CL_CHECK((backend_ctx->CL_mul_mat_Ab_Bi_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mat_Ab_Bi_8x4", &err), err));
 2311        GGML_LOG_CONT(".");
 2312    }
 2313
 2314    // mul_mm_q8_0_f32_8x4
 2315    {
 2316#ifdef GGML_OPENCL_EMBED_KERNELS
 2317        const std::string kernel_src_q8_8x4_gemm {
 2318            #include "mul_mm_q8_0_f32_8x4.cl.h"
        };
 2320#else
 2321        const std::string kernel_src_q8_8x4_gemm = read_file("mul_mm_q8_0_f32_8x4.cl");
 2322#endif
        // build into a local program so the handle stored in program_CL_gemm by
        // mul_mat_Ab_Bi_8x4 above is not overwritten (and leaked)
        cl_program prog = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_q8_8x4_gemm.c_str(), compile_opts);
        CL_CHECK((backend_ctx->kernel_mul_mm_q8_0_f32_8x4 = clCreateKernel(prog, "kernel_mul_mm_q8_0_f32_8x4", &err), err));
        CL_CHECK(clReleaseProgram(prog));
 2325        GGML_LOG_CONT(".");
 2326    }
 2327
 2328    // gemv_noshuffle_general_q8_0_f32
 2329    {
 2330        std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
 2331                                       " -cl-mad-enable "
 2332                                       " -DSIMDGROUP_WIDTH=" +
 2333                                       std::to_string(backend_ctx->adreno_wave_size);
 2334        if (backend_ctx->has_vector_subgroup_broadcast) {
 2335            CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
 2336        }
 2337
 2338#ifdef GGML_OPENCL_EMBED_KERNELS
 2339        const std::string kernel_src_CL_gemv_general {
 2340            #include "gemv_noshuffle_general_q8_0_f32.cl.h"
 2341        };
 2342#else
 2343        const std::string kernel_src_CL_gemv_general = read_file("gemv_noshuffle_general_q8_0_f32.cl");
 2344#endif
 2345
 2346        cl_program prog = build_program_from_source(
 2347            backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv_general.c_str(), CL_gemv_compile_opts);
 2348
 2349        CL_CHECK((backend_ctx->CL_mul_mat_vec_q8_0_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle", &err), err));
 2350        CL_CHECK(clReleaseProgram(prog));
 2351        GGML_LOG_CONT(".");
 2352    }
 2353
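    // options shared by the MoE mxfp4 gemv/gemm programs below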
 2354    std::string CL_moe_compile_opts = std::string("-cl-std=") + opencl_c_std +
 2355            " -cl-mad-enable "
 2356            " -cl-fast-relaxed-math";
 2357
 2358    // gemv_moe_mxfp4_f32
 2359    {
 2360#ifdef GGML_OPENCL_EMBED_KERNELS
 2361        const std::string kernel_src {
 2362            #include "gemv_moe_mxfp4_f32.cl.h"
 2363        };
 2364#else
 2365        const std::string kernel_src = read_file("gemv_moe_mxfp4_f32.cl");
 2366#endif
 2367        backend_ctx->program_gemv_moe_mxfp4_f32 =
 2368            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
 2369
 2370        CL_CHECK((backend_ctx->kernel_gemv_moe_mxfp4_f32 = clCreateKernel(backend_ctx->program_gemv_moe_mxfp4_f32, "kernel_gemv_moe_mxfp4_f32", &err), err));
 2371        GGML_LOG_CONT(".");
 2372    }
 2373
 2374    // gemm_moe_mxfp4_f32
 2375    {
 2376#ifdef GGML_OPENCL_EMBED_KERNELS
 2377        const std::string kernel_src {
 2378            #include "gemm_moe_mxfp4_f32.cl.h"
 2379        };
 2380#else
 2381        const std::string kernel_src = read_file("gemm_moe_mxfp4_f32.cl");
 2382#endif
 2383        backend_ctx->program_gemm_moe_mxfp4_f32 =
 2384            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
 2385
 2386        CL_CHECK((backend_ctx->kernel_gemm_moe_mxfp4_f32 = clCreateKernel(backend_ctx->program_gemm_moe_mxfp4_f32, "kernel_gemm_moe_mxfp4_f32", &err), err));
 2387        GGML_LOG_CONT(".");
 2388    }
 2389#endif // GGML_OPENCL_USE_ADRENO_KERNELS
 2390    GGML_LOG_CONT("\n");
 2391}
 2392
 2397static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev);
 2398
 2399namespace /* anonymous */ {
 2400extern struct ggml_backend_device_i ggml_backend_opencl_device_i;
 2401}
 2402
 2403// Look for available and suitable devices.
 2404static std::vector<ggml_backend_device> ggml_opencl_probe_devices(ggml_backend_reg * reg) {
 2405    std::vector<ggml_backend_device> found_devices;
 2406
 2407#ifdef GGML_OPENCL_PROFILING
 2408    GGML_LOG_INFO("ggml_opencl: OpenCL profiling enabled\n");
 2409#endif
 2410
 2411    struct cl_device;
 2412    struct cl_platform {
 2413        cl_platform_id id;
 2414        unsigned number;
 2415        char name[128];
 2416        char vendor[128];
 2417        struct cl_device * devices;
 2418        unsigned n_devices;
 2419        struct cl_device * default_device;
 2420    };
 2421
 2422    struct cl_device {
 2423        struct cl_platform * platform;
 2424        cl_device_id id;
 2425        unsigned number;
 2426        cl_device_type type;
 2427        char name[128];
 2428        char version[128];
 2429    };
 2430
 2431    enum { NPLAT = 16, NDEV = 16 };
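    // fixed-size buffers for the platform/device scan below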
 2432
 2433    struct cl_platform platforms[NPLAT];
 2434    unsigned n_platforms = 0;
 2435    struct cl_device devices[NDEV];
 2436    unsigned n_devices = 0;
 2437    struct cl_device * default_device = NULL;
 2438    unsigned           default_platform_number = 0;
 2439
 2440    cl_platform_id platform_ids[NPLAT];
 2441    if (clGetPlatformIDs(NPLAT, platform_ids, &n_platforms) != CL_SUCCESS) {
        GGML_LOG_ERROR("ggml_opencl: platform IDs not available.\n");
 2443        return found_devices;
 2444    }
 2445
 2446    for (unsigned i = 0; i < n_platforms; i++) {
 2447        struct cl_platform * p = &platforms[i];
 2448        p->number = i;
 2449        p->id = platform_ids[i];
 2450        CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_NAME, sizeof(p->name), &p->name, NULL));
 2451        CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_VENDOR, sizeof(p->vendor), &p->vendor, NULL));
 2452
 2453        cl_device_id device_ids[NDEV];
 2454        cl_int clGetDeviceIDsError = clGetDeviceIDs(p->id, CL_DEVICE_TYPE_ALL, NDEV, device_ids, &p->n_devices);
 2455        if (clGetDeviceIDsError == CL_DEVICE_NOT_FOUND) {
 2456            p->n_devices = 0;
 2457        } else {
 2458            CL_CHECK(clGetDeviceIDsError);
 2459        }
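        // each platform's device list is a window into the flat devices[] array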
 2460        p->devices = p->n_devices > 0 ? &devices[n_devices] : NULL;
 2461        p->default_device = NULL;
 2462
 2463        for (unsigned j = 0; j < p->n_devices; j++) {
 2464            struct cl_device * d = &devices[n_devices];
 2465            d->number = n_devices++;
 2466            d->id = device_ids[j];
 2467            d->platform = p;
 2468            CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL));
 2469            CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL));
 2470            CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_VERSION, sizeof(d->version), &d->version, NULL));
 2471
 2472            if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) {
 2473                p->default_device = d;
 2474            }
 2475        }
 2476
 2477        if (default_device == NULL && p->default_device != NULL) {
 2478            default_device          = p->default_device;
 2479            default_platform_number = i;
 2480        }
 2481    }
 2482
 2483    if (n_devices == 0) {
        GGML_LOG_ERROR("ggml_opencl: could not find any OpenCL devices.\n");
 2485        return found_devices;
 2486    }
 2487
 2488    char *      user_platform_string = getenv("GGML_OPENCL_PLATFORM");
 2489    char *      user_device_string   = getenv("GGML_OPENCL_DEVICE");
 2490    int         user_platform_number = -1;
 2491    int         user_device_number   = -1;
 2492    cl_device * candidate_devices    = nullptr;
 2493    unsigned    n_candidate_devices  = 0;
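    // Either variable accepts a numeric index or a name substring (matched
    // below with strstr). Illustrative shell usage, not from this file:
    //   GGML_OPENCL_PLATFORM=0 GGML_OPENCL_DEVICE=1 ./app   # select by index
    //   GGML_OPENCL_PLATFORM=Intel ./app                    # select by substring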
 2494
 2495    unsigned n;
 2496    if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) {
 2497        user_platform_number = (int)n;
 2498    }
 2499    if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < n_devices) {
 2500        user_device_number = (int)n;
 2501    }
 2502    if (user_platform_number != -1 && user_device_number != -1) {
 2503        cl_platform* platform = &platforms[user_platform_number];
 2504        if ((unsigned)user_device_number >= platform->n_devices) {
 2505            GGML_LOG_ERROR("ggml_opencl: invalid device number %d\n", user_device_number);
 2506            exit(1);
 2507        }
 2508        default_device      = &platform->devices[user_device_number];
 2509        candidate_devices   = platform->devices;
 2510        n_candidate_devices = platform->n_devices;
 2511    } else {
 2512        // Choose a platform by matching a substring.
 2513        if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) {
 2514            for (unsigned i = 0; i < n_platforms; i++) {
 2515                struct cl_platform * p = &platforms[i];
 2516                if (strstr(p->name, user_platform_string) != NULL ||
 2517                    strstr(p->vendor, user_platform_string) != NULL) {
 2518                    user_platform_number = (int)i;
 2519                    break;
 2520                }
 2521            }
 2522            if (user_platform_number == -1) {
 2523                GGML_LOG_ERROR("ggml_opencl: no platform matching '%s' was found.\n", user_platform_string);
 2524                exit(1);
 2525            }
 2526        }
 2527
 2528        int                  platform_idx = user_platform_number != -1 ? user_platform_number : default_platform_number;
 2529        struct cl_platform * p            = &platforms[platform_idx];
 2530        candidate_devices                 = p->devices;
 2531        n_candidate_devices               = p->n_devices;
 2532        default_device                    = p->default_device;
 2533        if (n_candidate_devices == 0) {
 2534            GGML_LOG_ERROR("ggml_opencl: selected platform '%s' does not have any devices.\n", p->name);
 2535            exit(1);
 2536        }
 2537
 2538        if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) {
 2539            for (unsigned i = 0; i < n_candidate_devices; i++) {
 2540                struct cl_device * d = &candidate_devices[i];
 2541                if (strstr(d->name, user_device_string) != NULL) {
 2542                    user_device_number = d->number;
 2543                    break;
 2544                }
 2545            }
 2546            if (user_device_number == -1) {
 2547                GGML_LOG_ERROR("ggml_opencl: no device matching '%s' was found.\n", user_device_string);
 2548                exit(1);
 2549            }
 2550        }
 2551        if (user_device_number != -1) {
 2552            candidate_devices   = &devices[user_device_number];
 2553            n_candidate_devices = 1;
 2554            default_device      = &candidate_devices[0];
 2555        }
 2556
 2557        GGML_ASSERT(n_candidate_devices > 0);
 2558
 2559        if (default_device == NULL) {
 2560            default_device = &candidate_devices[0];
 2561        }
 2562    }
 2563
 2564    GGML_ASSERT(n_candidate_devices != 0 && candidate_devices);
 2565
 2566    // Put the default device in front.
 2567    for (unsigned i = 1; i < n_candidate_devices; i++) {
 2568        if (&candidate_devices[i] == default_device) {
 2569            std::swap(candidate_devices[0], candidate_devices[i]);
 2570            default_device = &candidate_devices[0];
 2571            break;
 2572        }
 2573    }
 2574
 2575    GGML_LOG_INFO("ggml_opencl: selected platform: '%s'\n", default_device->platform->name);
 2576
 2577    std::vector<cl_device_id> device_ids;
 2578    for (auto dev = candidate_devices, dev_end = candidate_devices + n_candidate_devices; dev != dev_end; dev++) {
 2579        device_ids.push_back(dev->id);
 2580    }
 2581
 2582    cl_int                err;
 2583    cl_context            shared_context;
 2584    cl_context_properties properties[] = { (intptr_t) CL_CONTEXT_PLATFORM, (intptr_t) default_device->platform->id, 0 };
 2585
 2586    CL_CHECK(
 2587        (shared_context = clCreateContext(properties, device_ids.size(), device_ids.data(), NULL, NULL, &err), err));
 2588
 2589    for (auto dev = candidate_devices, dev_end = candidate_devices + n_candidate_devices; dev != dev_end; dev++) {
 2590        GGML_LOG_INFO("\nggml_opencl: device: '%s (%s)'\n", dev->name, dev->version);
 2591
 2592        auto dev_ctx = std::unique_ptr<ggml_backend_opencl_device_context>(new ggml_backend_opencl_device_context{
 2593            /*.platform         =*/dev->platform->id,
            /*.platform_name    =*/dev->platform->name,
 2595            /*.device           =*/dev->id,
 2596            /*.device_name      =*/dev->name,
 2597            /*.device_type      =*/dev->type,
 2598            /*.device_version   =*/dev->version,
 2599            /*.backend_ctx      =*/nullptr,
 2600            /*.buffer_type      =*/{},
 2601            /*.context          =*/shared_context,
 2602        });
 2603
 2604        found_devices.push_back(ggml_backend_device{
 2605            /* .iface   = */ ggml_backend_opencl_device_i,
 2606            /* .reg     = */ reg,
 2607            /* .context = */ dev_ctx.get(),
 2608        });
 2609
 2610        if (!ggml_cl2_init(&found_devices.back())) {
 2611            found_devices.pop_back();
 2612            GGML_LOG_INFO("ggml_opencl: drop unsupported device.\n");
 2613            continue;
 2614        }
 2615
 2616        dev_ctx.release();
 2617    }
 2618
    if (!found_devices.empty()) {
 2620        auto * dev_ctx = static_cast<ggml_backend_opencl_device_context *>(found_devices.front().context);
 2621        GGML_LOG_INFO("ggml_opencl: default device: '%s (%s)'\n", dev_ctx->device_name.c_str(),
 2622                      dev_ctx->device_version.c_str());
 2623
 2624        if (dev_ctx->device_type != CL_DEVICE_TYPE_GPU) {
 2625            GGML_LOG_WARN("ggml_opencl: warning, the default device is not a GPU: '%s'.\n",
 2626                          dev_ctx->device_name.c_str());
 2627        }
 2628    }
 2629
 2630    return found_devices;
 2631}
 2632
 2633// Initialize device if it is supported (returns nullptr if it is not).
 2634static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
 2635    GGML_ASSERT(dev);
 2636    GGML_ASSERT(dev->context);
 2637
 2638    ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) dev->context;
 2639    GGML_ASSERT(dev_ctx->platform);
 2640    GGML_ASSERT(dev_ctx->device);
 2641
 2642    if (dev_ctx->backend_ctx) {
 2643        return dev_ctx->backend_ctx;
 2644    }
 2645
 2646    auto backend_ctx        = std::make_unique<ggml_backend_opencl_context>();
 2647    backend_ctx->device     = dev_ctx->device;
 2648    backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
 2649
    // ref_count gets increased in ggml_backend_opencl_device_init.
    // This function is also used to retrieve the backend context, so we don't
    // want to increase ref_count on every call; we only increase it when the
    // associated device is initialized.
 2654    backend_ctx->ref_count  = 0;
 2655
 2656    if (strstr(dev_ctx->device_name.c_str(), "Adreno") ||
 2657        strstr(dev_ctx->device_name.c_str(), "Qualcomm") ||
 2658        strstr(dev_ctx->device_version.c_str(), "Adreno")) {
 2659        backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
        // Usually the device version string contains the detailed device name.
 2661        backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_version.c_str());
 2662        if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::ADRENO_UNKNOWN) {
 2663            backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_name.c_str());
 2664        }
 2665
 2666        // Use wave size of 64 for all Adreno GPUs.
 2667        backend_ctx->adreno_wave_size = 64;
 2668    } else if (strstr(dev_ctx->device_name.c_str(), "Intel")) {
 2669        backend_ctx->gpu_family = GPU_FAMILY::INTEL;
 2670    } else {
 2671        GGML_LOG_ERROR("Unsupported GPU: %s\n", dev_ctx->device_name.c_str());
 2672        backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
 2673        return nullptr;
 2674    }
 2675
 2676#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
 2677    if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) {
 2678        GGML_LOG_ERROR("ggml_opencl: Adreno-specific kernels should not be enabled for non-Adreno GPUs; "
 2679            "run on an Adreno GPU or recompile with CMake option `-DGGML_OPENCL_USE_ADRENO_KERNELS=OFF`\n");
 2680        return nullptr;
 2681    }
 2682#endif
 2683
 2684    // Populate backend device name
 2685    backend_ctx->device_name = dev_ctx->device_name;
 2686
 2687    // A local ref of cl_device_id for convenience
 2688    cl_device_id device = backend_ctx->device;
 2689
 2690    ggml_cl_version platform_version = get_opencl_platform_version(dev_ctx->platform);
 2691
    // Check the device's OpenCL C version; OpenCL 2.0 or above is required.
 2693    ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device);
 2694    if (opencl_c_version.major < 2) {
 2695        GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n");
 2696        return nullptr;
 2697    }
 2698
 2699    // Check driver version
 2700    size_t driver_version_str_size;
 2701    clGetDeviceInfo(device, CL_DRIVER_VERSION, 0, NULL, &driver_version_str_size);
 2702    char *driver_version = (char *)alloca(driver_version_str_size + 1);
 2703    clGetDeviceInfo(device, CL_DRIVER_VERSION, driver_version_str_size, driver_version, NULL);
 2704    driver_version[driver_version_str_size] = '\0';
 2705    GGML_LOG_INFO("ggml_opencl: OpenCL driver: %s\n", driver_version);
 2706    backend_ctx->driver_version = driver_version;
 2707
 2708    backend_ctx->adreno_cl_compiler_version = get_adreno_cl_compiler_version(driver_version);
 2709    backend_ctx->has_vector_subgroup_broadcast =
 2710        (backend_ctx->adreno_cl_compiler_version.type == E031 && backend_ctx->adreno_cl_compiler_version.major >= 47) ||
 2711        (backend_ctx->adreno_cl_compiler_version.type == DX   && backend_ctx->adreno_cl_compiler_version.major >= 17);
 2712    GGML_LOG_INFO("ggml_opencl: vector subgroup broadcast support: %s\n",
 2713        backend_ctx->has_vector_subgroup_broadcast ? "true" : "false");
 2714
 2715    size_t ext_str_size;
 2716    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
 2717    char *ext_buffer = (char *)alloca(ext_str_size + 1);
 2718    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
 2719    ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
 2720    // Check if ext_buffer contains cl_khr_fp16
 2721    backend_ctx->fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
 2722    GGML_LOG_INFO("ggml_opencl: device FP16 support: %s\n", backend_ctx->fp16_support ? "true" : "false");
 2723
 2724    // fp16 is required
 2725    if (!backend_ctx->fp16_support) {
 2726        GGML_LOG_ERROR("ggml_opencl: device does not support FP16\n");
 2727        return nullptr;
 2728    }
 2729
    // If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
    // optional in OpenCL 3.0 (cl_khr_subgroups is mandatory in OpenCL 2.x).
 2732    if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
 2733        strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
 2734        GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
 2735            "(note that subgroups is an optional feature in OpenCL 3.0)\n");
 2736        return nullptr;
 2737    }
 2738
 2739    cl_uint base_align_in_bits;
 2740    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &base_align_in_bits, NULL));
 2741    GGML_ASSERT(base_align_in_bits % 8u == 0);
 2742    backend_ctx->alignment = base_align_in_bits / 8u;
 2743    GGML_LOG_INFO("ggml_opencl: mem base addr align: %u\n", backend_ctx->alignment);
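    // e.g. a device reporting CL_DEVICE_MEM_BASE_ADDR_ALIGN = 1024 (bits)
    // yields an alignment of 128 bytes.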
 2744
 2745    clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
 2746    GGML_LOG_INFO("ggml_opencl: max mem alloc size: %zu MB\n", backend_ctx->max_alloc_size/1024/1024);
 2747
 2748    clGetDeviceInfo(device, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, sizeof(size_t), &backend_ctx->image_max_buffer_size, NULL);
    GGML_LOG_INFO("ggml_opencl: device max image buffer size (pixels): %zu\n", backend_ctx->image_max_buffer_size);
 2750
 2751    clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &backend_ctx->max_workgroup_size, NULL);
    GGML_LOG_INFO("ggml_opencl: device max workgroup size: %zu\n", backend_ctx->max_workgroup_size);
 2753
 2754    // Check SVM.
 2755    cl_device_svm_capabilities svm_caps;
 2756    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_SVM_CAPABILITIES, sizeof(cl_device_svm_capabilities), &svm_caps, 0));
 2757    GGML_LOG_INFO("ggml_opencl: SVM coarse grain buffer support: %s\n",
 2758        svm_caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER ? "true" : "false");
 2759    GGML_LOG_INFO("ggml_opencl: SVM fine grain buffer support: %s\n",
 2760        svm_caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER ? "true" : "false");
 2761    GGML_LOG_INFO("ggml_opencl: SVM fine grain system support: %s\n",
 2762        svm_caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM ? "true" : "false");
 2763    GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n",
 2764        svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false");
 2765
 2766    if (opencl_c_version.major >= 3) {
 2767        // Assume it is not available for 3.0, since it is optional in 3.0.
 2768        // If compiling against 3.0, then we can query.
 2769        backend_ctx->non_uniform_workgroups = false;
 2770#if CL_TARGET_OPENCL_VERSION >= 300
 2771        CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool),
 2772                                 &backend_ctx->non_uniform_workgroups, 0));
 2773#endif
 2774    } else {
 2775        GGML_ASSERT(opencl_c_version.major == 2);
        // Non-uniform workgroup sizes are a mandatory feature in OpenCL 2.x.
 2777        backend_ctx->non_uniform_workgroups = true;
 2778    }
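    // Non-uniform work-groups example: with a global size of 1000 and a local
    // size of 256, the last work-group has only 232 work-items; without this
    // feature, the global size must be a multiple of the local size.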
 2779
 2780    // Print out configurations
 2781#ifdef GGML_OPENCL_SOA_Q
 2782    GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n");
 2783#endif // GGML_OPENCL_SOA_Q
 2784
 2785#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
 2786    GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n");
 2787#endif // GGML_OPENCL_USE_ADRENO_KERNELS
 2788
 2789    cl_int err;
 2790
 2791    // A local ref of cl_context for convenience
 2792    cl_context context = backend_ctx->context = dev_ctx->context;
 2793
 2798    cl_command_queue_properties command_queue_props = 0;
 2799#ifdef GGML_OPENCL_PROFILING
 2800    command_queue_props |= CL_QUEUE_PROFILING_ENABLE;
 2801#endif
 2802    CL_CHECK((backend_ctx->queue = clCreateCommandQueue(context, device, command_queue_props, &err), err));
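    // When GGML_OPENCL_PROFILING is enabled, per-kernel timings can be read
    // back from kernel events via clGetEventProfilingInfo. A minimal sketch
    // (evt is a hypothetical event returned by clEnqueueNDRangeKernel):
    //   cl_ulong t0, t1;
    //   CL_CHECK(clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, sizeof(t0), &t0, NULL));
    //   CL_CHECK(clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END,   sizeof(t1), &t1, NULL));
    //   // t1 - t0 is the kernel execution time in nanoseconds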
 2803
 2804    // Load kernels
 2805    load_cl_kernels(backend_ctx.get(), opencl_c_version);
 2806
 2807#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
 2808    // Allocate intermediate buffers and images
 2809    size_t required_A_q_d_bytes = 311164928;
 2810    size_t required_A_s_d_bytes = 38895616;
 2811    size_t required_B_d_bytes = 45088768;
 2812
 2813    // Ensure buffer sizes do not exceed the maximum allocation size
 2814    size_t max_A_q_d_bytes = MIN(required_A_q_d_bytes, backend_ctx->max_alloc_size);
 2815    size_t max_A_s_d_bytes = MIN(required_A_s_d_bytes, backend_ctx->max_alloc_size);
 2816    size_t max_B_d_bytes   = MIN(required_B_d_bytes, backend_ctx->max_alloc_size);
 2817    if (required_A_q_d_bytes > backend_ctx->max_alloc_size) {
 2818        GGML_LOG_WARN("ggml_opencl: A_q_d buffer size reduced from %zu to %zu due to device limitations.\n",
 2819                      required_A_q_d_bytes, max_A_q_d_bytes);
 2820    }
 2821    if (required_A_s_d_bytes > backend_ctx->max_alloc_size) {
 2822        GGML_LOG_WARN("ggml_opencl: A_s_d buffer size reduced from %zu to %zu due to device limitations.\n",
 2823                      required_A_s_d_bytes, max_A_s_d_bytes);
 2824    }
 2825    if (required_B_d_bytes > backend_ctx->max_alloc_size) {
 2826        GGML_LOG_WARN("ggml_opencl: B_d buffer size reduced from %zu to %zu due to device limitations.\n",
 2827                      required_B_d_bytes, max_B_d_bytes);
 2828    }
 2829
 2830    backend_ctx->prealloc_quant_trans.allocate(context, max_A_q_d_bytes);
 2831    backend_ctx->prealloc_scales_trans.allocate(context, max_A_s_d_bytes);
 2832    backend_ctx->prealloc_act_trans.allocate(context, max_B_d_bytes);
 2833#endif // GGML_OPENCL_USE_ADRENO_KERNELS
 2834
 2835    backend_ctx->disable_fusion = getenv("GGML_OPENCL_DISABLE_FUSION") != nullptr;
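    // e.g. GGML_OPENCL_DISABLE_FUSION=1 (any value; only the presence of the
    // variable is checked) falls back to the unfused kernels in graph_compute.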
 2836
 2837    dev_ctx->backend_ctx = backend_ctx.release();
 2838    return dev_ctx->backend_ctx;
 2839}
 2840
 2841static void ggml_cl2_free(ggml_backend_t backend) {
 2842    ggml_backend_opencl_context * ctx = (ggml_backend_opencl_context *) backend->context;
 2843    ctx->free();
 2844
 2845    // The CL context is shared by all backends, release it if all backends have been released
 2846    bool should_release_opencl = true;
 2847    for (auto device : g_ggml_backend_opencl_devices) {
 2848        ggml_backend_opencl_device_context * ctx_dev = (ggml_backend_opencl_device_context *) device.context;
 2849        if (ctx_dev->backend_ctx->ref_count > 0) {
 2850            should_release_opencl = false;
 2851        }
 2852    }
 2853
 2854    if (should_release_opencl) {
 2855        CL_CHECK(clReleaseContext(ctx->context));
 2856    }
 2857}
 2858
 2859//------------------------------------------------------------------------------
 2860// Tensor extra management
 2861//------------------------------------------------------------------------------
 2862struct ggml_tensor_extra_cl {
 2863    // The buffer object that holds the data.
 2864    cl_mem data_device;
    // The offset into the buffer object. This is primarily for scratch buffers
    // and view operations.
 2867    // NB: this offset no longer includes view offset (view_offs). Whenever this
 2868    // offset is used, view_offs should be considered.
 2869    cl_ulong offset;
 2870    // The actual size of the cl_mem object. This is needed when returning the
 2871    // block to the pool.
 2872    size_t actual_size;
 2873
 2874    void reset() {
 2875        data_device = nullptr;
 2876        offset = 0;
 2877        actual_size = 0;
 2878    }
 2879};
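
// Illustrative: since `offset` no longer includes the view offset, the
// effective device offset of a (possibly view) tensor is computed as
//   cl_ulong off = extra->offset + tensor->view_offs;
// wherever the extra is consumed.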
 2880
 2881// Additional tensor extra structs for quantized tensors.
 2882// These tensors are loaded from files and should not be allocated in scratch --
 2883// they should always be allocated from the pool. Hence, they do not have an
// `offset`, which would indicate their location in the scratch buffer.
 2885struct ggml_tensor_extra_cl_q4_0 {
 2886    // Quantized values.
 2887    cl_mem q = nullptr;
 2888    // Quantized values in image1d_buffer_t.
 2889    cl_mem q_img = nullptr;
 2890    // Scales.
 2891    cl_mem d = nullptr;
 2892    // Scales in image1d_buffer_t.
 2893    cl_mem d_img = nullptr;
 2894    // Size of quantized values.
 2895    size_t size_q = 0;
 2896    // Size of scales.
 2897    size_t size_d = 0;
 2898
 2899    ~ggml_tensor_extra_cl_q4_0() {
 2900        reset();
 2901    }
 2902
 2903    void reset() {
        // q and d are subbuffers into the bigger buffer allocated in ggml_backend_buffer.
        // They must be released so that the parent buffer can be released in
        // turn, avoiding a memory leak.
 2907        if (q != nullptr) {
 2908            CL_CHECK(clReleaseMemObject(q));
 2909            q = nullptr;
 2910        }
 2911        if (d != nullptr) {
 2912            CL_CHECK(clReleaseMemObject(d));
 2913            d = nullptr;
 2914        }
 2915        // Currently, q_img and d_img are only initialized when SMALL_ALLOC is
 2916        // enabled. They point to the images in ggml_backend_opencl_buffer_context.
 2917        // So, there is no need to release them here.
        // TODO: initialize them for the non-SMALL_ALLOC path, or remove them.
 2919        q_img = nullptr;
 2920        d_img = nullptr;
 2921        size_q = 0;
 2922        size_d = 0;
 2923    }
 2924};
 2925
 2926struct ggml_tensor_extra_cl_mxfp4 {
 2927    // Quantized values.
 2928    cl_mem q = nullptr;
 2929    // Quantized values in image1d_buffer_t.
 2930    cl_mem q_img = nullptr;
 2931    // Scales in E8M0.
 2932    cl_mem e = nullptr;
 2933    // Scales in image1d_buffer_t.
 2934    cl_mem e_img = nullptr;
 2935    // Size of quantized values.
 2936    size_t size_q = 0;
 2937    // Size of scales.
 2938    size_t size_e = 0;
 2939
 2940    ~ggml_tensor_extra_cl_mxfp4() {
 2941        reset();
 2942    }
 2943
 2944    void reset() {
        // q and e are subbuffers into the bigger buffer allocated in ggml_backend_buffer.
        // They must be released so that the parent buffer can be released in
        // turn, avoiding a memory leak.
 2948        if (q != nullptr) {
 2949            CL_CHECK(clReleaseMemObject(q));
 2950            q = nullptr;
 2951        }
 2952        if (e != nullptr) {
 2953            CL_CHECK(clReleaseMemObject(e));
 2954            e = nullptr;
 2955        }
        if (q_img != nullptr) {
            CL_CHECK(clReleaseMemObject(q_img));
            q_img = nullptr;
 2959        }
        // Currently, q_img and e_img are not used. They can be image1d_buffer_t
        // objects that wrap around q and e to utilize the image access path.
 2962        q_img = nullptr;
 2963        e_img = nullptr;
 2964        size_q = 0;
 2965        size_e = 0;
 2966    }
 2967};
 2968
 2969struct ggml_tensor_extra_cl_q8_0 {
 2970    cl_mem q = nullptr;
 2971    cl_mem q_img = nullptr;
 2972
 2973    cl_mem d = nullptr;
 2974    cl_mem d_img = nullptr;
 2975
 2976    size_t size_q = 0;
 2977    size_t size_d = 0;
 2978
 2979    ~ggml_tensor_extra_cl_q8_0() {
 2980        reset();
 2981    }
 2982
 2983    void reset() {
        // q and d are subbuffers into the bigger buffer allocated in ggml_backend_buffer.
        // They must be released so that the parent buffer can be released in
        // turn, avoiding a memory leak.
 2987        if (q != nullptr) {
 2988            CL_CHECK(clReleaseMemObject(q));
 2989            q = nullptr;
 2990        }
 2991        if (d != nullptr) {
 2992            CL_CHECK(clReleaseMemObject(d));
 2993            d = nullptr;
 2994        }
        // Currently, q_img and d_img are not used. They can be image1d_buffer_t
        // objects that wrap around q and d to utilize the image access path.
 2997        q_img = nullptr;
 2998        d_img = nullptr;
 2999        size_q = 0;
 3000        size_d = 0;
 3001    }
 3002};
 3003
 3004struct ggml_tensor_extra_cl_q6_K {
 3005    // Lower 4 bits of quantized weights.
 3006    cl_mem ql = nullptr;
 3007    // Upper 2 bits of quantized weights.
 3008    cl_mem qh = nullptr;
 3009    // Scales for each block.
 3010    cl_mem s  = nullptr;
 3011    // Scales for each super block.
 3012    cl_mem d  = nullptr;
 3013
 3014    size_t size_ql = 0;
 3015    size_t size_qh = 0;
 3016    size_t size_s  = 0;
 3017    size_t size_d  = 0;
 3018
 3019    ~ggml_tensor_extra_cl_q6_K() {
 3020        reset();
 3021    }
 3022
 3023    void reset() {
 3024        if (ql != nullptr) {
 3025            CL_CHECK(clReleaseMemObject(ql));
 3026            ql = nullptr;
 3027        }
 3028        if (qh != nullptr) {
 3029            CL_CHECK(clReleaseMemObject(qh));
 3030            qh = nullptr;
 3031        }
 3032        if (s != nullptr) {
 3033            CL_CHECK(clReleaseMemObject(s));
 3034            s = nullptr;
 3035        }
 3036        if (d != nullptr) {
 3037            CL_CHECK(clReleaseMemObject(d));
 3038            d = nullptr;
 3039        }
 3040
 3041        size_ql = 0;
 3042        size_qh = 0;
 3043        size_s  = 0;
 3044        size_d  = 0;
 3045    }
 3046};
 3047
 3048//------------------------------------------------------------------------------
 3049// Backend API
 3050//------------------------------------------------------------------------------
 3051
 3052//
 3053// backend
 3054//
 3055static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
 3056    return "OpenCL";
 3057
 3058    UNUSED(backend);
 3059}
 3060
 3061static void ggml_backend_opencl_free(ggml_backend_t backend) {
 3062    ggml_cl2_free(backend);
 3063}
 3064
 3065static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
 3066    GGML_UNUSED(backend);
 3067    GGML_UNUSED(tensor);
 3068    GGML_UNUSED(data);
 3069    GGML_UNUSED(offset);
 3070    GGML_UNUSED(size);
 3071}
 3072
 3073static void ggml_backend_opencl_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
 3074    GGML_UNUSED(backend);
 3075    GGML_UNUSED(tensor);
 3076    GGML_UNUSED(data);
 3077    GGML_UNUSED(offset);
 3078    GGML_UNUSED(size);
 3079}
 3080
 3081static bool ggml_backend_opencl_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
 3082    GGML_UNUSED(backend);
 3083    GGML_UNUSED(src);
 3084    GGML_UNUSED(dst);
 3085    return false;
 3086}
 3087
 3088static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
 3089    auto * backend_ctx = static_cast<ggml_backend_opencl_context *>(backend->context);
 3090
 3091    cl_event evt;
 3092    CL_CHECK(clEnqueueBarrierWithWaitList(backend_ctx->queue, 0, nullptr, &evt));
 3093    CL_CHECK(clWaitForEvents(1, &evt));
 3094    CL_CHECK(clReleaseEvent(evt));
 3095}
 3096
// Synchronizes the device of 'backend_ctx' with the others so that commands
// enqueued to it won't start until commands on the other devices have
// completed.
 3100static void sync_with_other_backends(ggml_backend_opencl_context * backend_ctx) {
    if (g_ggml_backend_opencl_devices.size() < 2) {
        return; // No other devices to synchronize with.
    }
 3103
 3104    std::vector<cl_event> events;
 3105    events.reserve(g_ggml_backend_opencl_devices.size());
 3106
 3107    for (ggml_backend_device & backend_dev : g_ggml_backend_opencl_devices) {
 3108        auto * other_backend_ctx = ggml_cl2_init(&backend_dev);
 3109        if (backend_ctx != other_backend_ctx) {
 3110            cl_event ev;
 3111            CL_CHECK(clEnqueueMarkerWithWaitList(other_backend_ctx->queue, 0, nullptr, &ev));
 3112            CL_CHECK(clFlush(other_backend_ctx->queue));
 3113            events.push_back(ev);
 3114        }
 3115    }
 3116
 3117    CL_CHECK(clEnqueueBarrierWithWaitList(backend_ctx->queue, events.size(), events.data(), nullptr));
 3118    for (auto ev : events) {
 3119        CL_CHECK(clReleaseEvent(ev));
 3120    }
 3121}
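
// Rough timeline of the marker/barrier pattern above, with two other command
// queues Q1 and Q2 and this backend's queue Qb:
//   Q1: ...work... M1 (marker)
//   Q2: ...work... M2 (marker)
//   Qb: B (barrier waiting on M1, M2) -> commands enqueued after B
// Anything enqueued to Qb after the barrier starts only once M1 and M2 fire.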
 3122
 3123static void sync_with_other_backends(ggml_backend_t backend) {
 3124    auto * backend_ctx = static_cast<ggml_backend_opencl_context *>(backend->context);
 3125    sync_with_other_backends(backend_ctx);
 3126}
 3127
 3128static bool ggml_opencl_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list<enum ggml_op> ops) {
 3129    if (!ggml_can_fuse(cgraph, node_idx, ops)) {
 3130        return false;
 3131    }
 3132
 3133    if (ops.size() == 2 && ops.begin()[0] == GGML_OP_RMS_NORM && ops.begin()[1] == GGML_OP_MUL) {
 3134        const ggml_tensor *rms_norm = cgraph->nodes[node_idx];
 3135        const ggml_tensor *mul      = cgraph->nodes[node_idx+1];
 3136
 3137        GGML_ASSERT(rms_norm->src[0]->type == GGML_TYPE_F32);
 3138        GGML_ASSERT(rms_norm->type == GGML_TYPE_F32);
 3139
 3140        // rms_norm only supports f32
 3141        if (mul->src[0]->type != GGML_TYPE_F32 ||
 3142            mul->src[1]->type != GGML_TYPE_F32 ||
 3143            mul->type != GGML_TYPE_F32) {
 3144            return false;
 3145        }
 3146
 3147        // if rms_norm is the B operand, then we don't handle broadcast
 3148        if (rms_norm == mul->src[1] &&
 3149            !ggml_are_same_shape(mul->src[0], rms_norm)) {
 3150            return false;
 3151        }
 3152
 3153        // rms_norm assumes contiguous rows
 3154        if (!ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1])) {
 3155            return false;
 3156        }
 3157    } else if (ops.size() == 3 && ops.begin()[0] == GGML_OP_NORM && ops.begin()[1] == GGML_OP_MUL && ops.begin()[2] == GGML_OP_ADD) {
 3158        const ggml_tensor *norm = cgraph->nodes[node_idx];
 3159        const ggml_tensor *mul  = cgraph->nodes[node_idx+1];
 3160        const ggml_tensor *add  = cgraph->nodes[node_idx+2];
 3161        const ggml_tensor *w    = mul->src[0] == norm ? mul->src[1] : mul->src[0];
 3162        const ggml_tensor *b    = add->src[0] == mul  ? add->src[1] : add->src[0];
 3163
 3164        // norm fusion only supports F32
 3165        if (norm->src[0]->type != GGML_TYPE_F32 || w->type != GGML_TYPE_F32 || b->type != GGML_TYPE_F32) {
 3166            return false;
 3167        }
 3168
 3169        if (norm->src[0]->ne[0] % 4 != 0) {
 3170            return false;
 3171        }
 3172
 3173        if (!ggml_is_contiguous(norm->src[0]) || !ggml_is_contiguous(w) || !ggml_is_contiguous(b)) {
 3174            return false;
 3175        }
 3176    } else if (ops.size() == 3 && ops.begin()[0] == GGML_OP_GROUP_NORM && ops.begin()[1] == GGML_OP_MUL && ops.begin()[2] == GGML_OP_ADD) {
 3177        const ggml_tensor *gn = cgraph->nodes[node_idx];
 3178        const ggml_tensor *mul = cgraph->nodes[node_idx+1];
 3179        const ggml_tensor *add = cgraph->nodes[node_idx+2];
 3180        const ggml_tensor *w   = mul->src[0] == gn ? mul->src[1] : mul->src[0];
 3181        const ggml_tensor *b   = add->src[0] == mul ? add->src[1] : add->src[0];
 3182
 3183        if (gn->src[0]->type != GGML_TYPE_F32 || w->type != GGML_TYPE_F32 || b->type != GGML_TYPE_F32) {
 3184            return false;
 3185        }
 3186
 3187        if (!ggml_is_contiguous(gn->src[0]) || !ggml_is_contiguous(w) || !ggml_is_contiguous(b)) {
 3188            return false;
 3189        }
 3190    }
 3191
 3192    return true;
 3193}
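
// Example of the RMS_NORM+MUL fused pattern checked above: a graph fragment
// built as
//   t = ggml_rms_norm(ctx, x, eps);
//   y = ggml_mul(ctx, t, w);
// is dispatched as a single fused kernel (see graph_compute below) when all
// types are F32, rows are contiguous, and no broadcast is needed with
// rms_norm as the B operand.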
 3194
 3195static void ggml_opencl_op_rms_norm_fused(ggml_backend_t backend, ggml_tensor * rms_norm_tensor, ggml_tensor * mul_tensor);
 3196static void ggml_opencl_op_norm_fused(ggml_backend_t backend, ggml_tensor * norm_tensor, ggml_tensor * mul_tensor, ggml_tensor * add_tensor);
 3197static void ggml_opencl_op_group_norm_fused(ggml_backend_t backend, ggml_tensor * gn_tensor, ggml_tensor * mul_tensor, ggml_tensor * add_tensor);
 3198
 3199static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
 3200    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 3201
 3202    for (int i = 0; i < cgraph->n_nodes; i++) {
 3203        ggml_tensor * node = cgraph->nodes[i];
 3204
 3205        // NOTE: this may oversynchronize by synchronizing with
 3206        //       backends/devices which don't compute 'cgraph's
 3207        //       dependencies.
 3208        sync_with_other_backends(backend);
 3209
 3210        if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
 3211            continue;
 3212        }
 3213
 3214        if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
 3215            continue;
 3216        }
 3217
 3218        if (!backend_ctx->disable_fusion && ggml_opencl_can_fuse(cgraph, i, { GGML_OP_NORM, GGML_OP_MUL, GGML_OP_ADD })) {
 3219            ggml_opencl_op_norm_fused(backend, node, cgraph->nodes[i+1], cgraph->nodes[i+2]);
 3220            i += 2;
 3221            continue;
 3222        }
 3223        if (!backend_ctx->disable_fusion && ggml_opencl_can_fuse(cgraph, i, { GGML_OP_GROUP_NORM, GGML_OP_MUL, GGML_OP_ADD })) {
 3224            ggml_opencl_op_group_norm_fused(backend, node, cgraph->nodes[i+1], cgraph->nodes[i+2]);
 3225            i += 2;
 3226            continue;
 3227        }
 3228        if (!backend_ctx->disable_fusion && ggml_opencl_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
 3229            ggml_opencl_op_rms_norm_fused(backend, node, cgraph->nodes[i+1]);
 3230            i++;
 3231            continue;
 3232        }
 3233
 3234        bool ok = ggml_cl_compute_forward(backend, node);
 3235        if (!ok) {
 3236            GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
 3237        }
 3238        GGML_ASSERT(ok);
 3239    }
 3240
 3241    return GGML_STATUS_SUCCESS;
 3242}
 3243
 3244static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
 3245    ggml_backend_opencl_device_context * dev_ctx     = (ggml_backend_opencl_device_context *)dev->context;
 3246    ggml_backend_opencl_context *        backend_ctx = dev_ctx->backend_ctx;
 3247
 3248    switch (op->op) {
 3249        case GGML_OP_NONE:
 3250            return true;
 3251        case GGML_OP_GET_ROWS:
 3252            switch (op->src[0]->type) {
 3253                case GGML_TYPE_F32:
 3254                case GGML_TYPE_F16:
 3255                    return true;
 3256                case GGML_TYPE_Q4_0:
 3257#ifdef GGML_OPENCL_SOA_Q
 3258                    // We do not support flattened Q4_0 (and possibly other Q's)
 3259                    return false;
 3260#else // GGML_OPENCL_SOA_Q
 3261                    return true;
 3262#endif // GGML_OPENCL_SOA_Q
 3263                default:
 3264                    return false;
 3265            }
 3266        case GGML_OP_SET_ROWS:
 3267            {
 3268                // TODO: add support
 3269                // ref: https://github.com/ggml-org/llama.cpp/pull/14274
 3270#pragma message("TODO: implement BF16, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
 3271                if (op->src[0]->type != GGML_TYPE_F32) {
 3272                    return false;
 3273                }
 3274                switch (op->type) {
 3275                    case GGML_TYPE_F16:
 3276                    case GGML_TYPE_F32:
 3277                        return (op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32);
 3278                    default:
 3279                        return false;
 3280                }
 3281            }
 3282        case GGML_OP_CPY:
 3283        case GGML_OP_DUP:
 3284        case GGML_OP_CONT:
 3285            switch (op->src[0]->type) {
 3286                case GGML_TYPE_F32:
 3287                    switch (op->type) {
 3288                        case GGML_TYPE_F16:
 3289                        case GGML_TYPE_F32:
 3290                            return true;
 3291                        default:
 3292                            return false;
 3293                    }
 3294                case GGML_TYPE_F16:
 3295                    switch (op->type) {
 3296                        case GGML_TYPE_F16:
 3297                        case GGML_TYPE_F32:
 3298                            return true;
 3299                        default:
 3300                            return false;
 3301                    }
 3302                default:
 3303                    return false;
 3304            }
 3305        case GGML_OP_SCALE:
 3306            return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
 3307        case GGML_OP_ADD:
 3308            if (op->type == GGML_TYPE_F16) {
 3309                const bool src0_ok = op->src[0]->type == GGML_TYPE_F16 || op->src[0]->type == GGML_TYPE_F32;
 3310                const bool src1_ok = op->src[1]->type == GGML_TYPE_F16 || op->src[1]->type == GGML_TYPE_F32;
 3311                if (src0_ok && src1_ok) {
 3312                    return true;
 3313                }
            }
            // fall through to the generic same-type check shared with MUL/DIV/SUB
 3315        case GGML_OP_MUL:
 3316        case GGML_OP_DIV:
 3317        case GGML_OP_SUB:
 3318            return (op->src[0]->type == op->src[1]->type) &&
 3319                   (op->src[0]->type == op->type) &&
 3320                   (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16);
 3321        case GGML_OP_ADD_ID:
 3322            return op->src[0]->type == GGML_TYPE_F32;
 3323        case GGML_OP_SQR:
 3324        case GGML_OP_SQRT:
 3325            return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
 3326                    ggml_is_contiguous(op->src[0]);
 3327        case GGML_OP_UNARY:
 3328            switch (ggml_get_unary_op(op)) {
 3329                case GGML_UNARY_OP_GELU:
 3330                case GGML_UNARY_OP_SILU:
 3331                case GGML_UNARY_OP_RELU:
 3332                case GGML_UNARY_OP_GELU_ERF:
 3333                case GGML_UNARY_OP_GELU_QUICK:
 3334                   return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
 3335                case GGML_UNARY_OP_SIGMOID:
 3336                    return ggml_is_contiguous(op->src[0]);
 3337                case GGML_UNARY_OP_TANH:
 3338                   return op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16;
 3339                case GGML_UNARY_OP_EXPM1:
 3340                   return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
 3341                          (op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
 3342                case GGML_UNARY_OP_SOFTPLUS:
 3343                   return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
 3344                          (op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
 3345                default:
 3346                    return false;
 3347            }
 3348        case GGML_OP_GLU:
 3349            switch (ggml_get_glu_op(op)) {
 3350                case GGML_GLU_OP_GEGLU:
 3351                case GGML_GLU_OP_REGLU:
 3352                case GGML_GLU_OP_SWIGLU:
 3353                case GGML_GLU_OP_SWIGLU_OAI:
 3354                case GGML_GLU_OP_GEGLU_ERF:
 3355                case GGML_GLU_OP_GEGLU_QUICK:
 3356                    return ggml_is_contiguous_1(op->src[0]) && (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16);
 3357                default:
 3358                    return false;
 3359            }
 3360        case GGML_OP_TRI:
 3361            return op->type == GGML_TYPE_F32 && ggml_is_contiguous(op);
 3362        case GGML_OP_FILL:
 3363            return op->type == GGML_TYPE_F32 && ggml_is_contiguous(op);
 3364        case GGML_OP_CLAMP:
 3365            return op->src[0]->type == GGML_TYPE_F32;
 3366        case GGML_OP_SOFT_MAX:
 3367        case GGML_OP_NORM:
 3368            return true;
 3369        case GGML_OP_RMS_NORM:
 3370            return op->ne[0] % 4 == 0 && ggml_is_contiguous_rows(op->src[0]);
 3371        case GGML_OP_REPEAT:
 3372            return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; // Assuming F32 for now, can be expanded
 3373        case GGML_OP_PAD:
 3374            // TODO: add circular padding support for opencl, see https://github.com/ggml-org/llama.cpp/pull/16985
 3375            if (ggml_get_op_params_i32(op, 8) != 0) {
 3376                return false;
 3377            }
 3378            return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
 3379        case GGML_OP_UPSCALE: {
 3380            ggml_scale_mode mode = (ggml_scale_mode)(ggml_get_op_params_i32(op, 0) & 0xFF);
            const bool antialias = (ggml_get_op_params_i32(op, 0) & GGML_SCALE_FLAG_ANTIALIAS) != 0;
 3382            return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 &&
 3383                   (mode == GGML_SCALE_MODE_NEAREST || mode == GGML_SCALE_MODE_BILINEAR) && !antialias;
 3384        }
 3385        case GGML_OP_CONV_2D:
 3386            return (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16) ||
 3387                   (op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
 3388                   (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32);
 3389        case GGML_OP_SSM_CONV:
 3390            return (op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32);
 3391        case GGML_OP_CONCAT:
 3392            return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
 3393        case GGML_OP_TIMESTEP_EMBEDDING:
 3394            return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
 3395        case GGML_OP_GROUP_NORM:
 3396            return ggml_is_contiguous(op->src[0]);
 3397        case GGML_OP_MUL_MAT:
 3398            if (op->src[0]->type == GGML_TYPE_F16) {
 3399                return true;
 3400            } else if (op->src[0]->type == GGML_TYPE_F32) {
 3401                return op->src[1]->type == GGML_TYPE_F32;
 3402            } else if (op->src[0]->type == GGML_TYPE_Q4_0 || op->src[0]->type == GGML_TYPE_MXFP4 ||
 3403                       op->src[0]->type == GGML_TYPE_Q4_K ||
 3404                       op->src[0]->type == GGML_TYPE_Q6_K) {
 3405                return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
 3406            } else if (op->src[0]->type == GGML_TYPE_Q8_0) {
 3407                return op->src[1]->type == GGML_TYPE_F32;
 3408            }
 3409            return false;
 3410        case GGML_OP_MUL_MAT_ID:
 3411            if (op->src[0]->type == GGML_TYPE_Q4_0 ||
 3412                op->src[0]->type == GGML_TYPE_Q8_0 ||
 3413                op->src[0]->type == GGML_TYPE_MXFP4) {
 3414                if (op->src[1]->type == GGML_TYPE_F32) {
 3415                    return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
 3416                }
 3417            }
 3418            return false;
 3419        case GGML_OP_RESHAPE:
 3420        case GGML_OP_VIEW:
 3421        case GGML_OP_PERMUTE:
 3422        case GGML_OP_TRANSPOSE:
 3423            return true;
 3424        case GGML_OP_DIAG_MASK_INF:
 3425            return op->ne[3] == 1;
 3426        case GGML_OP_ROPE: {
 3427            const int mode = ((const int32_t *) op->op_params)[2];
 3428            const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
 3429            const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
 3430            if (is_mrope && !is_vision) {
 3431                if (op->src[0]->type == GGML_TYPE_F32 ||
 3432                    op->src[0]->type == GGML_TYPE_F16) {
 3433                    return true;
 3434                }
 3435                return false;
 3436            }
 3437            if (is_vision) {
 3438                if (op->src[0]->type == GGML_TYPE_F32 ||
 3439                    op->src[0]->type == GGML_TYPE_F16) {
 3440                    return true;
 3441                }
 3442                return false;
 3443            }
 3444            return true;
 3445        }
 3446        case GGML_OP_SOLVE_TRI:
 3447            return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
 3448        case GGML_OP_IM2COL:
 3449            return true;
 3450        case GGML_OP_ARGSORT: {
 3451            cl_kernel kernel = backend_ctx->kernel_argsort_f32_i32;
 3452            int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
 3453
 3454            int cols = 1;
 3455            while (cols < op->ne[0]) {
 3456                cols *= 2;
 3457            }
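            // cols is now the smallest power of two >= ne[0]; e.g. ne[0] = 1500
            // pads to cols = 2048. It must fit in one workgroup, presumably
            // because the sort kernel processes one padded row per workgroup.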
 3458
 3459            return cols <= max_workgroup_size && op->src[0]->type == GGML_TYPE_F32;
 3460        }
 3461        case GGML_OP_SUM_ROWS:
 3462        case GGML_OP_MEAN:
 3463            return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
 3464        case GGML_OP_FLASH_ATTN_EXT:
 3465            {
 3466                const ggml_tensor * q = op->src[0];
 3467                const ggml_tensor * k = op->src[1];
 3468                const ggml_tensor * v = op->src[2];
 3469
 3470                const int dk = q->ne[0];
 3471                const int dv = v->ne[0];
 3472
 3473                const struct { int dk; int dv; } supported_dims[] = {
 3474                    { 40,  40}, { 64,  64}, { 80,  80}, { 96,  96},
 3475                    {112, 112}, {128, 128}, {192, 128},
 3476                    {192, 192}, {256, 256},
 3477                };
 3478
 3479                bool dims_supported = false;
 3480                for (size_t i = 0; i < sizeof(supported_dims)/sizeof(supported_dims[0]); ++i) {
 3481                    if (supported_dims[i].dk == dk && supported_dims[i].dv == dv) {
 3482                        dims_supported = true;
 3483                        break;
 3484                    }
 3485                }
 3486                if (!dims_supported) {
 3487                    return false;
 3488                }
 3489
 3490                const bool is_f32_f32 = q->type == GGML_TYPE_F32 && k->type == GGML_TYPE_F32 &&
 3491                                        v->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
 3492                const bool is_f16_f16 = q->type == GGML_TYPE_F16 && k->type == GGML_TYPE_F16 &&
 3493                                        v->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16;
 3494                const bool is_f32_f16 = q->type == GGML_TYPE_F32 && k->type == GGML_TYPE_F16 &&
 3495                                        v->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F32;
 3496
 3497                return is_f32_f32 || is_f16_f16 || is_f32_f16;
 3498            }
 3499        default:
 3500            return false;
 3501    }
 3502}
 3503
 3504// Forward declaration - implementation appears later in the file.
 3505static const char * ggml_backend_opencl_buffer_type_get_name(ggml_backend_buffer_type_t buffer_type);
 3506
 3507static ggml_guid_t ggml_backend_opencl_guid() {
 3508    static ggml_guid guid = { 0xde, 0xe0, 0x70, 0xa2, 0x73, 0x4e, 0x4d, 0xbc, 0xb0, 0xc7, 0x4f, 0xd4, 0x6d, 0x4e, 0x90, 0xfe };
 3509    return &guid;
 3510}
 3511
 3512static ggml_backend_i ggml_backend_opencl_i = {
 3513    /* .get_name                = */ ggml_backend_opencl_name,
 3514    /* .free                    = */ ggml_backend_opencl_free,
 3515    /* .set_tensor_async        = */ NULL,  /* ggml_backend_opencl_set_tensor_async */
 3516    /* .get_tensor_async        = */ NULL,  /* ggml_backend_opencl_get_tensor_async */
 3517    /* .cpy_tensor_async        = */ NULL,  /* ggml_backend_opencl_cpy_tensor_async */
 3518    /* .synchronize             = */ ggml_backend_opencl_synchronize,
 3519    /* .graph_plan_create       = */ NULL,
 3520    /* .graph_plan_free         = */ NULL,
 3521    /* .graph_plan_update       = */ NULL,
 3522    /* .graph_plan_compute      = */ NULL,
 3523    /* .graph_compute           = */ ggml_backend_opencl_graph_compute,
 3524    /* .event_record            = */ NULL,
 3525    /* .event_wait              = */ NULL,
 3526    /* .graph_optimize          = */ NULL,
 3527};
 3528
 3529ggml_backend_t ggml_backend_opencl_init(void) {
 3530    ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_opencl_reg(), 0);
 3531    ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev);
 3532
 3533    ggml_backend_t backend = new ggml_backend {
 3534        /* .guid    = */ ggml_backend_opencl_guid(),
 3535        /* .iface   = */ ggml_backend_opencl_i,
 3536        /* .device  = */ dev,
 3537        /* .context = */ backend_ctx
 3538    };
 3539
 3540    return backend;
 3541}
 3542
 3543bool ggml_backend_is_opencl(ggml_backend_t backend) {
 3544    return backend && backend->iface.get_name == ggml_backend_opencl_name;
 3545}
 3546
 3547//
 3548// buffer
 3549//
 3550struct ggml_backend_opencl_buffer_context {
 3551    // A buffer context can hold multiple cl_mem objects. This is for flattening
 3552    // quantized weights and should be used with GGML_OPENCL_SMALL_ALLOC where
 3553    // each tensor is allocated a separate buffer. When flattening is enabled
 3554    // with small allocation, each tensor is backed by two cl_mem objects (for
 3555    // quants and scales) packed into a backend_opencl_buffer.
 3556    ggml_backend_opencl_buffer_context(cl_mem buf)
 3557        : name("OpenCL") {
 3558        buffer.push_back(buf);
 3559    }
 3560
 3561    ~ggml_backend_opencl_buffer_context() {
 3562        for (cl_mem buf : buffer) {
 3563            CL_CHECK(clReleaseMemObject(buf));
 3564        }
 3565        for (cl_mem im : img) {
 3566            CL_CHECK(clReleaseMemObject(im));
 3567        }
 3568
 3569        // Delete all extras to trigger their destructors
 3570        for (ggml_tensor_extra_cl * e : temp_tensor_extras) {
 3571            delete e;
 3572        }
 3573        for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) {
 3574            delete e;
 3575        }
 3576        for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0) {
 3577            delete e;
 3578        }
 3579        for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0_in_use) {
 3580            delete e;
 3581        }
 3582        for (ggml_tensor_extra_cl_mxfp4 * e : temp_tensor_extras_mxfp4) {
 3583            delete e;
 3584        }
 3585        for (ggml_tensor_extra_cl_mxfp4 * e : temp_tensor_extras_mxfp4_in_use) {
 3586            delete e;
 3587        }
 3588        for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0) {
 3589            delete e;
 3590        }
 3591        for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
 3592            delete e;
 3593        }
 3594        for (ggml_tensor_extra_cl_q6_K * e : temp_tensor_extras_q6_K) {
 3595            delete e;
 3596        }
 3597        for (ggml_tensor_extra_cl_q6_K * e : temp_tensor_extras_q6_K_in_use) {
 3598            delete e;
 3599        }
 3600    }
 3601
 3602    ggml_tensor_extra_cl * ggml_opencl_alloc_temp_tensor_extra() {
 3603        ggml_tensor_extra_cl * extra;
 3604        if (temp_tensor_extras.empty()) {
 3605            extra = new ggml_tensor_extra_cl();
 3606        } else {
 3607            extra = temp_tensor_extras.back();
 3608            temp_tensor_extras.pop_back();
 3609        }
 3610
 3611        temp_tensor_extras_in_use.push_back(extra);
 3612
 3613        extra->reset();
 3614        return extra;
 3615    }
 3616
 3617    ggml_tensor_extra_cl_q4_0 * ggml_opencl_alloc_temp_tensor_extra_q4_0() {
 3618        ggml_tensor_extra_cl_q4_0 * extra;
 3619        if (temp_tensor_extras_q4_0.empty()) {
 3620            extra = new ggml_tensor_extra_cl_q4_0();
 3621        } else {
 3622            extra = temp_tensor_extras_q4_0.back();
 3623            temp_tensor_extras_q4_0.pop_back();
 3624        }
 3625
 3626        temp_tensor_extras_q4_0_in_use.push_back(extra);
 3627
 3628        extra->reset();
 3629        return extra;
 3630    }
 3631
 3632    ggml_tensor_extra_cl_mxfp4 * ggml_opencl_alloc_temp_tensor_extra_mxfp4() {
 3633        ggml_tensor_extra_cl_mxfp4 * extra;
 3634        if (temp_tensor_extras_mxfp4.empty()) {
 3635            extra = new ggml_tensor_extra_cl_mxfp4();
 3636        } else {
 3637            extra = temp_tensor_extras_mxfp4.back();
 3638            temp_tensor_extras_mxfp4.pop_back();
 3639        }
 3640
 3641        temp_tensor_extras_mxfp4_in_use.push_back(extra);
 3642
 3643        extra->reset();
 3644        return extra;
 3645    }
 3646
 3647    ggml_tensor_extra_cl_q8_0 * ggml_opencl_alloc_temp_tensor_extra_q8_0() {
 3648        ggml_tensor_extra_cl_q8_0 * extra;
 3649        if (temp_tensor_extras_q8_0.empty()) {
 3650            extra = new ggml_tensor_extra_cl_q8_0();
 3651        } else {
 3652            extra = temp_tensor_extras_q8_0.back();
 3653            temp_tensor_extras_q8_0.pop_back();
 3654        }
 3655
 3656        temp_tensor_extras_q8_0_in_use.push_back(extra);
 3657
 3658        extra->reset();
 3659        return extra;
 3660    }
 3661
 3662    ggml_tensor_extra_cl_q6_K * ggml_opencl_alloc_temp_tensor_extra_q6_K() {
 3663        ggml_tensor_extra_cl_q6_K * extra;
 3664        if (temp_tensor_extras_q6_K.empty()) {
 3665            extra = new ggml_tensor_extra_cl_q6_K();
 3666        } else {
 3667            extra = temp_tensor_extras_q6_K.back();
 3668            temp_tensor_extras_q6_K.pop_back();
 3669        }
 3670
 3671        temp_tensor_extras_q6_K_in_use.push_back(extra);
 3672
 3673        extra->reset();
 3674        return extra;
 3675    }
 3676
 3677    void reset() {
 3678        for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) {
 3679            temp_tensor_extras.push_back(e);
 3680        }
 3681        temp_tensor_extras_in_use.clear();
 3682
 3683        for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0_in_use) {
 3684            temp_tensor_extras_q4_0.push_back(e);
 3685        }
 3686        temp_tensor_extras_q4_0_in_use.clear();
 3687
 3688        for (ggml_tensor_extra_cl_mxfp4 * e : temp_tensor_extras_mxfp4_in_use) {
 3689            temp_tensor_extras_mxfp4.push_back(e);
 3690        }
 3691        temp_tensor_extras_mxfp4_in_use.clear();
 3692
 3693        for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
 3694            temp_tensor_extras_q8_0.push_back(e);
 3695        }
 3696        temp_tensor_extras_q8_0_in_use.clear();
 3697
 3698        for (ggml_tensor_extra_cl_q6_K * e : temp_tensor_extras_q6_K_in_use) {
 3699            temp_tensor_extras_q6_K.push_back(e);
 3700        }
 3701        temp_tensor_extras_q6_K_in_use.clear();
 3702    }
 3703
 3704    // Pools for extras. Available extras are in `temp_tensor_extras`. Extras
 3705    // being used are in `temp_tensor_extras_in_use`. At the first run, new
 3706    // extras get created and put in `in_use`. When the buffer is reset via
 3707    // the `reset` callback, all extras in `in_use` get moved to available extras
 3708    // for reuse.
    std::vector<ggml_tensor_extra_cl *> temp_tensor_extras;
    std::vector<ggml_tensor_extra_cl *> temp_tensor_extras_in_use;
    std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0;
    std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0_in_use;
    std::vector<ggml_tensor_extra_cl_mxfp4 *> temp_tensor_extras_mxfp4;
    std::vector<ggml_tensor_extra_cl_mxfp4 *> temp_tensor_extras_mxfp4_in_use;
    std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0;
    std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0_in_use;
    std::vector<ggml_tensor_extra_cl_q6_K *> temp_tensor_extras_q6_K;
    std::vector<ggml_tensor_extra_cl_q6_K *> temp_tensor_extras_q6_K_in_use;

    // The buffer_context is initially created by ggml_backend_buft_alloc_buffer
    // before any tensor is initialized (at the beginning of alloc_tensor_range).
    // Hence, there is always a buffer object in this vector. When each tensor is
    // being initialized, this original buffer object will be released if both
    // flattening and small allocation are enabled, and additional buffer
    // objects will be created in init_tensor to represent flattened quantized
    // weights.
    std::vector<cl_mem> buffer;
    // These are image1d_buffer_t objects that wrap around the quants and scales.
    // For Q4_0 quantization, there should be two of them - one for quants and
    // one for scales. They should be populated only when flattening and small
    // allocation are enabled.
    std::vector<cl_mem> img;
    std::string name;
};

static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
    delete ctx;
}

static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
    ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer->buft->device);
    return (void *) (uintptr_t) backend_ctx->alignment;
}
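
// Note on the base pointer above: it is not a dereferenceable host address.
// The allocator only uses it to assign tensor->data values, and init_tensor
// below recovers the byte offset into the backing cl_mem by subtracting this
// base again. Returning `alignment` (rather than 0) keeps tensor->data
// non-null and alignment-preserving. Illustrative arithmetic, assuming
// alignment == 1024:
//   tensor->data = (char *) base + 4096
//   offset       = (char *) tensor->data - (char *) base   // == 4096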

static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
    ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;

    ggml_cl2_init(buffer->buft->device);

    if (tensor->view_src != nullptr) {
        GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);

        ggml_tensor_extra_cl * view_extra = (ggml_tensor_extra_cl *) tensor->view_src->extra;
        GGML_ASSERT(view_extra && "view_extra is nullptr?");

        // Reuse extra of the parent tensor. The offset of this view tensor
        // becomes `extra->offset + view_offs` and needs to be calculated when
        // it is used. This change is needed because of the change to
        // ggml_alloc.c in https://github.com/ggml-org/llama.cpp/pull/7640.
        // `buffer` passed in here will always be `tensor->buffer`. It is OK
        // to allocate extras from the same buffer context for ordinary
        // intermediate tensors. But for views into kv cache tensors, doing so
        // would mess up the extras used by kv cache.
        // Before #7640, `buffer` is for intermediate tensors, which is always
        // different from that of kv cache tensors.
        //
        // NB: now extra->offset no longer accounts for view_offs.
        // NB: this should not apply to weight tensors (for end-to-end runs, but
        //     may apply for test-backend-ops).
        // FIXME: if any unexpected results are seen, double check the offset -
        // there could be other places that need fix.
        tensor->extra = view_extra;
    } else {
        {
            size_t offset = (char *) tensor->data - (char *) ggml_backend_opencl_buffer_get_base(buffer);

            ggml_tensor_extra_cl * extra = ctx->ggml_opencl_alloc_temp_tensor_extra();
            extra->offset = offset;
            extra->data_device = ctx->buffer[0];
            extra->actual_size = ggml_nbytes(tensor);

            tensor->extra = extra;
        }
    }
    return GGML_STATUS_SUCCESS;
}

// The optimized gemm and gemv kernels are used for large matrices without batch.
// tensor is the quantized weights matrix.
inline bool use_adreno_kernels(const ggml_backend_opencl_context *backend_ctx, const ggml_tensor *tensor) {
    int64_t threshold_ne0 = 512;
    int64_t threshold_ne1 = 512;
    if (!backend_ctx->adreno_cl_compiler_version.newer_than_or_same(E031, 38, 11, 0) &&
         backend_ctx->adreno_cl_compiler_version.type != DX) {
        threshold_ne0 = 128;
        threshold_ne1 = 128;
    }
    return tensor->ne[0] >= threshold_ne0 && tensor->ne[1] >= threshold_ne1 &&
            tensor->ne[2] == 1 && tensor->ne[3] == 1;
}

inline bool use_adreno_moe_kernels(const ggml_backend_opencl_context *backend_ctx, const ggml_tensor *tensor) {
    GGML_UNUSED(backend_ctx);
    int ne01 = tensor->ne[1];
    return ((strstr(tensor->name, "ffn") != NULL) || (strstr(tensor->name, "as") != NULL)) && (ne01 % 64 == 0);
}

inline bool enable_adreno_trans_weight(const ggml_backend_opencl_context *backend_ctx, const ggml_tensor *tensor) {
    bool adreno_kernel = use_adreno_kernels(backend_ctx, tensor);

    size_t elem_num = tensor->ne[0] * tensor->ne[1] * tensor->ne[2] * tensor->ne[3];

    return ((elem_num < 128 * 1024 * 1024) && adreno_kernel);  // max element num: 2**27
}
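
// Worked example for the element-count cap above: 128 * 1024 * 1024 = 2^27.
// A 4096 x 4096 weight has 16,777,216 (2^24) elements and passes the check;
// a hypothetical 16384 x 16384 weight has 2^28 elements and is rejected, so
// it falls back to the non-transposed path.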

static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);

    cl_context context = backend_ctx->context;
    cl_command_queue queue = backend_ctx->queue;

#ifdef GGML_OPENCL_SOA_Q
    // We separate the quantized bits and scale from block_q4_0 by using an
    // additional kernel, where each thread handles a block. We first read the
    // original weights into a temporary buffer, then create two separate
    // buffers for quantized bits and scales, which are then populated by the
    // conversion kernel.
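    //
    // For reference, the array-of-structs layout being split here (as defined
    // in ggml-common.h; reproduced for illustration):
    //
    //   typedef struct {
    //       ggml_half d;          // per-block fp16 scale
    //       uint8_t   qs[32 / 2]; // 32 4-bit quants, two per byte
    //   } block_q4_0;
    //
    // The conversion kernel writes all `d` values contiguously into one
    // buffer and all `qs` bytes into another (struct-of-arrays).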
    if (tensor->type == GGML_TYPE_Q4_0) {
        // Tensors should have been preallocated, therefore they should
        // already have ggml_tensor_extra_cl as extra.
        ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
        GGML_ASSERT(extra_orig && "Tensors in OpenCL backend should have been allocated and initialized");

        // Allocate the new extra and create aliases from the original.
        ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
        ggml_tensor_extra_cl_q4_0 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q4_0();

        size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
        size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
        GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
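        // Worked example (illustrative, for a hypothetical 4096 x 4096 Q4_0
        // tensor): 16,777,216 elements / 32 per block = 524,288 blocks, so
        //   size_d = 524,288 * 2  = 1,048,576 bytes (fp16 scales)
        //   size_q = 524,288 * 16 = 8,388,608 bytes (packed 4-bit quants)
        // and size_d + size_q = 9,437,184 = 524,288 * sizeof(block_q4_0).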

        cl_int err;
        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
            ggml_nbytes(tensor), NULL, &err);
        CL_CHECK(err);
        CL_CHECK(clEnqueueWriteBuffer(
            queue, data_device, CL_TRUE, 0,
            ggml_nbytes(tensor), data, 0, NULL, NULL));

        // We always honor the specified offset arg, although for weights the
        // offset arg should be 0 (we do not assert this).
        //GGML_ASSERT(offset == 0);

        // We create subbuffers from the original tensor buffer for scales and
        // quants - i.e., scales and quants are aliases into the buffer object
        // that backs the original tensor. This is a cleaner way to adapt to the
        // new memory management.
        // In the old code, we allocate new buffers for scales and quants
        // respectively, which could still be done but would result in double
        // allocation; properly deallocating the preallocated buffer that backs
        // the tensors is tricky and would leak the backend specific information
        // into the general backend code.
        // Does this create misaligned subbuffers (alignment is 1024) in certain
        // cases?
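        // align_to rounds its first argument up to the next multiple of the
        // second. Illustrative values, assuming alignment == 1024:
        //   align_to(0, 1024)    == 0
        //   align_to(1500, 1024) == 2048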
        cl_buffer_region region;

        // The original tensor memory is divided into scales and quants, i.e.,
        // we first store scales, then quants.
        // Create subbuffer for scales.
        region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
        region.size = size_d;
        extra->d = clCreateSubBuffer(
            extra_orig->data_device, CL_MEM_READ_WRITE,
            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
        CL_CHECK(err);
        auto previous_origin = region.origin;

        // Create subbuffer for quants.
        region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
        region.size = size_q;
        extra->q = clCreateSubBuffer(
            extra_orig->data_device, CL_MEM_READ_WRITE,
            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
        CL_CHECK(err);

        //cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0;
    #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
        cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0;

        // The optimized kernels need weights in natural order, so unshuffle.
        if (use_adreno_kernels(backend_ctx, tensor)) {
            kernel = backend_ctx->kernel_convert_block_q4_0_noshuffle;
        }
    #else
        cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0;
    #endif // GGML_OPENCL_USE_ADRENO_KERNELS
        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));

        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
        size_t local_work_size[] = {64, 1, 1};

        cl_event evt;
        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
        CL_CHECK(clWaitForEvents(1, &evt));
        CL_CHECK(clReleaseMemObject(data_device));

        tensor->extra = extra;

        // transpose the weights and scales
    #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
        // Only do the transpose for large, non-batched matrices.
        // TODO: use preallocated images instead of sub-buffer then image
        if (use_adreno_kernels(backend_ctx, tensor)) {
        // <----------------------------------------------------------------------------------> //
        // start transpose
        // <----------------------------------------------------------------------------------> //
        int M = tensor->ne[1];   // ne01
        int K = tensor->ne[0];   // ne00

        // For the matrix-vector multiplication kernel, we assume K is a multiple of 32.
        GGML_ASSERT(K % 32 == 0);
        // For the transpose kernels, we assume K is a multiple of 4 (satisfied by the prior assert) and M is a multiple of 4.
        GGML_ASSERT(M % 4 == 0);

        // transpose is out of place, so we need to allocate transposed buffers
        // <----------------------------------------------------------------------------------> //
        // use sub_buffer of max buffer size instead

        size_t q_size_bytes = K * M / 8 * sizeof(float);
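        // q_size_bytes is the packed-quants size written as
        // K * M / 8 * sizeof(float); with 4 bits per element this is simply
        // K * M / 2 bytes, e.g. 8,388,608 bytes for the 4096 x 4096 example.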
        backend_ctx->prealloc_quant_trans.allocate(context, q_size_bytes);

        cl_buffer_region region;
        region.origin = 0;
        region.size = q_size_bytes;
        cl_mem qT_d = clCreateSubBuffer(
            backend_ctx->prealloc_quant_trans.buffer,
            0,
            CL_BUFFER_CREATE_TYPE_REGION,
            &region,
            &err);
        CL_CHECK(err);

        // The 4-wide tile transpose for scales requires K/32 to be a multiple
        // of 4; otherwise fall back to the 4x1 variant below.
        bool K_tile_trans = ((K / 32) % 4 == 0);

        size_t d_size_bytes = M * (K / 32) * 2;
        backend_ctx->prealloc_scales_trans.allocate(context, d_size_bytes);
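        // d_size_bytes: one fp16 scale (2 bytes) per 32-element block, i.e.
        // M * (K / 32) blocks in total; 1,048,576 bytes for the 4096 x 4096
        // example above.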

        region.origin = 0;
        region.size = d_size_bytes;
        cl_mem dT_d = clCreateSubBuffer(
            backend_ctx->prealloc_scales_trans.buffer,
            0,
            CL_BUFFER_CREATE_TYPE_REGION,
            &region,
            &err);
        CL_CHECK(err);

        // <----------------------------------------------------------------------------------> //

        // create images from the buffers
        // <----------------------------------------------------------------------------------> //
        cl_mem q_d_image1D;
        cl_mem d_d_image1D;
        cl_mem qT_d_image1D;
        cl_mem dT_d_image1D;

        cl_image_format img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
        cl_image_desc img_desc_1d;

        memset(&img_desc_1d, 0, sizeof(img_desc_1d));
        img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
        img_desc_1d.image_width = M * K / 4 / 4;
        img_desc_1d.buffer = extra->q;
        q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
        CL_CHECK(err);
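        // Width sanity check: a CL_RGBA / CL_HALF_FLOAT pixel holds 4 halves
        // (8 bytes), so the quants buffer of M * K / 2 bytes maps to
        // (M * K / 2) / 8 = M * K / 4 / 4 pixels - exactly the image_width
        // used here.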

        img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
        memset(&img_desc_1d, 0, sizeof(img_desc_1d));
        img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
        img_desc_1d.image_width = M * K / 4 / 4;
        img_desc_1d.buffer = qT_d;
        qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
        CL_CHECK(err);

        memset(&img_desc_1d, 0, sizeof(img_desc_1d));
        if (K_tile_trans) {
            img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
            img_desc_1d.image_width = M * K / 32 / 4;
        } else {
            img_fmt_1d = { CL_R, CL_HALF_FLOAT };
            img_desc_1d.image_width = M * K / 32;
        }
        img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
        img_desc_1d.buffer = extra->d;
        d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
        CL_CHECK(err);

        img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
        memset(&img_desc_1d, 0, sizeof(img_desc_1d));
        img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
        img_desc_1d.image_width = M * K / 32 / 4;
        img_desc_1d.buffer = dT_d;
        dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
        CL_CHECK(err);
        // <----------------------------------------------------------------------------------> //

        // set up and call the transpose kernels
        // <----------------------------------------------------------------------------------> //
        // weights
        int height_q = M / 4;
        int width_q = K / 4 / 4;
        kernel = backend_ctx->kernel_transpose_16;

        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D));
        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &qT_d_image1D));
        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int),    &height_q));
        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int),    &width_q));

        size_t local_size_q[3] = {4, 16, 1};
        size_t global_size_q[3] = {static_cast<size_t>(width_q), static_cast<size_t>(height_q), 1};
        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_q, local_size_q, 0, NULL, &evt));
        CL_CHECK(clWaitForEvents(1, &evt));

        // scales
        int height_s = M / 4;
        int width_s = K / 32 / 4;

        kernel = backend_ctx->kernel_transpose_16;
        if (!K_tile_trans) {
            kernel = backend_ctx->kernel_transpose_16_4x1;
            width_s = K / 32;
        }
        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dT_d_image1D));
        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_s));
        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_s));

        size_t local_size_s[3] = {4, 16, 1};
        size_t global_size_s[3] = {static_cast<size_t>(width_s), static_cast<size_t>(height_s), 1};
        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_s, local_size_s, 0, NULL, &evt));
        CL_CHECK(clWaitForEvents(1, &evt));
        // <----------------------------------------------------------------------------------> //

        // copy transposed buffer contents to original buffers
        // <----------------------------------------------------------------------------------> //
        // weights
        CL_CHECK(clEnqueueCopyBuffer(queue, qT_d, extra->q, 0, 0, q_size_bytes, 0, NULL, &evt));
        CL_CHECK(clWaitForEvents(1, &evt));

        // scales
        CL_CHECK(clEnqueueCopyBuffer(queue, dT_d, extra->d, 0, 0, d_size_bytes, 0, NULL, &evt));
        CL_CHECK(clWaitForEvents(1, &evt));
        // <----------------------------------------------------------------------------------> //

        // deallocate transpose buffers
        // <----------------------------------------------------------------------------------> //
        CL_CHECK(clReleaseMemObject(qT_d));
        CL_CHECK(clReleaseMemObject(dT_d));

        // deallocate temporary images
        CL_CHECK(clReleaseMemObject(q_d_image1D));
        CL_CHECK(clReleaseMemObject(d_d_image1D));
        CL_CHECK(clReleaseMemObject(qT_d_image1D));
        CL_CHECK(clReleaseMemObject(dT_d_image1D));
        // <----------------------------------------------------------------------------------> //
        // end transpose
        // <----------------------------------------------------------------------------------> //
        }
    #endif // GGML_OPENCL_USE_ADRENO_KERNELS

        return;

    }
    if (tensor->type == GGML_TYPE_MXFP4) {
        ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
        GGML_ASSERT(extra_orig && "Tensors in OpenCL backend should have been allocated and initialized");

        // Allocate the new extra and create aliases from the original.
        ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
        ggml_tensor_extra_cl_mxfp4 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_mxfp4();

        size_t size_e = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(char);
        size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
        GGML_ASSERT(size_e + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
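        // Worked example (illustrative): MXFP4 blocks hold 32 elements with a
        // 1-byte E8M0 scale plus 16 bytes of packed 4-bit values, 17 bytes per
        // block. For 1,048,576 elements there are 32,768 blocks, so
        //   size_e = 32,768 bytes, size_q = 524,288 bytes,
        // and size_e + size_q = 557,056 = 32,768 * 17 = ggml_nbytes(tensor).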

        cl_int err;
        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
            ggml_nbytes(tensor), NULL, &err);
        CL_CHECK(err);
        CL_CHECK(clEnqueueWriteBuffer(
            queue, data_device, CL_TRUE, 0,
            ggml_nbytes(tensor), data, 0, NULL, NULL));

        // The original tensor memory is divided into scales and quants, i.e.,
        // we first store scales, then quants.
        cl_buffer_region region;

        // Create subbuffer for scales.
        region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
        region.size = size_e;
        extra->e = clCreateSubBuffer(
            extra_orig->data_device, CL_MEM_READ_WRITE,
            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
        CL_CHECK(err);
        auto previous_origin = region.origin;

        // Create subbuffer for quants.
        region.origin = align_to(previous_origin + size_e, backend_ctx->alignment);
        region.size = size_q;
        extra->q = clCreateSubBuffer(
            extra_orig->data_device, CL_MEM_READ_WRITE,
            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
        CL_CHECK(err);

#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
        if (use_adreno_moe_kernels(backend_ctx, tensor)) {
            cl_kernel kernel = backend_ctx->kernel_convert_block_mxfp4_trans;

            int ne00 = tensor->ne[0];
            int ne01 = tensor->ne[1];
            int ne02 = tensor->ne[2];
            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->e));
            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &ne00));
            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne01));

            size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 32), static_cast<size_t>(ne02)};
            size_t local_work_size[3] = {64, 2, 1};

            cl_event evt;
            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
            CL_CHECK(clWaitForEvents(1, &evt));
            CL_CHECK(clReleaseMemObject(data_device));
            tensor->extra = extra;

            return;
        }
#endif
        cl_kernel kernel = backend_ctx->kernel_convert_block_mxfp4;

        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->e));

        size_t global_work_size[3] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
        size_t local_work_size[3] = {64, 1, 1};

        cl_event evt;
        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
        CL_CHECK(clWaitForEvents(1, &evt));
        CL_CHECK(clReleaseMemObject(data_device));

        // Create image for Q
        cl_image_format img_format_q = {CL_RG, CL_UNSIGNED_INT32};
        cl_image_desc img_desc_q = {
            CL_MEM_OBJECT_IMAGE1D_BUFFER,
            static_cast<size_t>(ggml_nelements(tensor)/32*2),
            0, 0, 0, 0, 0, 0, 0,
            { extra->q }
        };
        extra->q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_q, &img_desc_q, NULL, &err);
        CL_CHECK(err);
        tensor->extra = extra;

        return;
    }
    if (tensor->type == GGML_TYPE_Q8_0) {
        ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
        GGML_ASSERT(extra_orig && "Tensors in OpenCL backend should have been allocated and initialized");

        // Allocate the new extra and create aliases from the original.
        ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
        ggml_tensor_extra_cl_q8_0 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q8_0();

        size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
        size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*(ggml_blck_size(tensor->type)*sizeof(char));
        GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
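        // Worked example (illustrative): Q8_0 blocks hold 32 elements with a
        // 2-byte fp16 scale plus 32 int8 quants (34 bytes/block). For
        // 1,048,576 elements there are 32,768 blocks, so size_d = 65,536 bytes
        // and size_q = 1,048,576 bytes; their sum is 32,768 * 34 = 1,114,112.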

        cl_int err;
        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
            ggml_nbytes(tensor), NULL, &err);
        CL_CHECK(err);
        CL_CHECK(clEnqueueWriteBuffer(
            queue, data_device, CL_TRUE, 0,
            ggml_nbytes(tensor), data, 0, NULL, NULL));

        // The original tensor memory is divided into scales and quants, i.e.,
        // we first store scales, then quants.
        cl_buffer_region region;

        // Create subbuffer for scales.
        region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
        region.size = size_d;
        extra->d = clCreateSubBuffer(
            extra_orig->data_device, CL_MEM_READ_WRITE,
            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
        CL_CHECK(err);
        auto previous_origin = region.origin;

        // Create subbuffer for quants.
        region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
        region.size = size_q;
        extra->q = clCreateSubBuffer(
            extra_orig->data_device, CL_MEM_READ_WRITE,
            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
        CL_CHECK(err);

        cl_kernel kernel = backend_ctx->kernel_convert_block_q8_0;

        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));

        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
        size_t local_work_size[] = {64, 1, 1};

        cl_event evt;
        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
        CL_CHECK(clWaitForEvents(1, &evt));
        CL_CHECK(clReleaseMemObject(data_device));

        tensor->extra = extra;

        // Transpose the weights and scales
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
        if (enable_adreno_trans_weight(backend_ctx, tensor)) {

            int M = tensor->ne[1];   // ne01
            int K = tensor->ne[0];   // ne00

            GGML_ASSERT(K % 32 == 0);
            GGML_ASSERT(M % 4 == 0);
            GGML_ASSERT(tensor->ne[2] == 1);
            GGML_ASSERT(tensor->ne[3] == 1);

            // Transpose weights
            size_t q_size_bytes = K * M / 4 * sizeof(float);
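            // For Q8_0 the quants occupy one byte per element, so
            // K * M / 4 * sizeof(float) is exactly K * M bytes.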
            cl_buffer_region region;
            region.origin = 0;
            region.size = q_size_bytes;
            cl_mem qT_d = clCreateSubBuffer(
                backend_ctx->prealloc_quant_trans.buffer,
                0,
                CL_BUFFER_CREATE_TYPE_REGION,
                &region,
                &err);
            CL_CHECK(err);

            cl_mem q_d_image1D;
            cl_mem qT_d_image1D;

            cl_image_format img_fmt_1d;
            cl_image_desc img_desc_1d;

            img_fmt_1d = { CL_RGBA, CL_FLOAT };
            memset(&img_desc_1d, 0, sizeof(img_desc_1d));
            img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
            img_desc_1d.image_width = M * K / 4 / 4;
            img_desc_1d.buffer = extra->q;
            q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
            CL_CHECK(err);

            img_fmt_1d = { CL_RGBA, CL_FLOAT };
            memset(&img_desc_1d, 0, sizeof(img_desc_1d));
            img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
            img_desc_1d.image_width = M * K / 4 / 4;
            img_desc_1d.buffer = qT_d;
            qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
            CL_CHECK(err);

            int height_q = M / 4;
            int width_q = K / 4 / 4;
            kernel = backend_ctx->kernel_transpose_32;

            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D));
            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &qT_d_image1D));
            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int),    &height_q));
            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int),    &width_q));

            size_t local_size_q[3] = {4, 16, 1};
            size_t global_size_q[3] = {static_cast<size_t>(width_q), static_cast<size_t>(height_q), 1};
            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_q, local_size_q, 0, NULL, &evt));
            CL_CHECK(clWaitForEvents(1, &evt));

            // Transpose scales
            size_t d_size_bytes = M * (K / 32) * 2;
            region.origin = 0;
            region.size = d_size_bytes;
            cl_mem dT_d = clCreateSubBuffer(
                backend_ctx->prealloc_scales_trans.buffer,
                0,
                CL_BUFFER_CREATE_TYPE_REGION,
                &region,
                &err);
            CL_CHECK(err);

            cl_mem d_d_image1D;
            cl_mem dT_d_image1D;

            memset(&img_desc_1d, 0, sizeof(img_desc_1d));
            img_fmt_1d = { CL_R, CL_HALF_FLOAT };
            img_desc_1d.image_width = M * K / 32;
            img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
            img_desc_1d.buffer = extra->d;
            d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
            CL_CHECK(err);

            img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
            memset(&img_desc_1d, 0, sizeof(img_desc_1d));
            img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
            img_desc_1d.image_width = M * K / 32 / 4;
            img_desc_1d.buffer = dT_d;
            dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
            CL_CHECK(err);

            int height_s = M / 4;
            int width_s = K / 32;

            kernel = backend_ctx->kernel_transpose_16_4x1;

            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dT_d_image1D));
            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_s));
            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_s));

            size_t local_size_s[3] = {4, 16, 1};
            size_t global_size_s[3] = {static_cast<size_t>(width_s), static_cast<size_t>(height_s), 1};
            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_s, local_size_s, 0, NULL, &evt));
            CL_CHECK(clWaitForEvents(1, &evt));

            // copy transposed buffer contents to original buffers
            CL_CHECK(clEnqueueCopyBuffer(queue, qT_d, extra->q, 0, 0, q_size_bytes, 0, NULL, &evt));
            CL_CHECK(clWaitForEvents(1, &evt));

            CL_CHECK(clEnqueueCopyBuffer(queue, dT_d, extra->d, 0, 0, d_size_bytes, 0, NULL, &evt));
            CL_CHECK(clWaitForEvents(1, &evt));

            CL_CHECK(clReleaseMemObject(qT_d));
            CL_CHECK(clReleaseMemObject(dT_d));

            CL_CHECK(clReleaseMemObject(q_d_image1D));
            CL_CHECK(clReleaseMemObject(d_d_image1D));
            CL_CHECK(clReleaseMemObject(qT_d_image1D));
            CL_CHECK(clReleaseMemObject(dT_d_image1D));
        } // end transpose
#endif // GGML_OPENCL_USE_ADRENO_KERNELS

        return;
    }
    if (tensor->type == GGML_TYPE_Q6_K) {
        ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
        GGML_ASSERT(extra_orig && "Tensors in OpenCL backend should have been allocated and initialized");

        // Allocate the new extra and create aliases from the original.
        ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
        ggml_tensor_extra_cl_q6_K * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q6_K();

        size_t size_ql = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
        size_t size_qh = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/4;
        size_t size_s  = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/16;
        size_t size_d  = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
        GGML_ASSERT(size_ql + size_qh + size_s + size_d == ggml_nbytes(tensor) &&
            "Incorrect tensor size");
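        // Worked example (illustrative): Q6_K blocks hold 256 elements as
        // 128 bytes of low 4 bits (ql), 64 bytes of high 2 bits (qh), 16 int8
        // scales (s) and one fp16 super-scale (d): 210 bytes per block. For
        // 262,144 elements (1,024 blocks): size_ql = 131,072,
        // size_qh = 65,536, size_s = 16,384, size_d = 2,048; the total is
        // 215,040 = 1,024 * 210 bytes.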

        cl_int err;
        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
            ggml_nbytes(tensor), NULL, &err);
        CL_CHECK(err);
        CL_CHECK(clEnqueueWriteBuffer(
            queue, data_device, CL_TRUE, 0,
            ggml_nbytes(tensor), data, 0, NULL, NULL));

        cl_buffer_region region;

        // Subbuffer for ql
        region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
        region.size = size_ql;
        extra->ql = clCreateSubBuffer(
            extra_orig->data_device, CL_MEM_READ_WRITE,
            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
        CL_CHECK(err);
        auto previous_origin = region.origin;

        // Subbuffer for qh
        region.origin = align_to(previous_origin + size_ql, backend_ctx->alignment);
        region.size = size_qh;
        extra->qh = clCreateSubBuffer(
            extra_orig->data_device, CL_MEM_READ_WRITE,
            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
        CL_CHECK(err);
        previous_origin = region.origin;

        // Subbuffer for scales
        region.origin = align_to(previous_origin + size_qh, backend_ctx->alignment);
        region.size = size_s;
        extra->s = clCreateSubBuffer(
            extra_orig->data_device, CL_MEM_READ_WRITE,
            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
        CL_CHECK(err);
        previous_origin = region.origin;

        // Create subbuffer for d.
        region.origin = align_to(previous_origin + size_s, backend_ctx->alignment);
        region.size = size_d;
        extra->d = clCreateSubBuffer(
            extra_orig->data_device, CL_MEM_READ_WRITE,
            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
        CL_CHECK(err);
        previous_origin = region.origin;

        // Flatten the weights
        cl_kernel kernel = backend_ctx->kernel_convert_block_q6_K;

        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->ql));
        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh));
        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->s));
        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->d));

        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
        size_t local_work_size[] = {64, 1, 1};

        cl_event evt;
        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
        CL_CHECK(clWaitForEvents(1, &evt));
        CL_CHECK(clReleaseMemObject(data_device));

        extra->size_ql = size_ql;
        extra->size_qh = size_qh;
        extra->size_s  = size_s;
        extra->size_d  = size_d;

        tensor->extra  = extra;
        return;
    }
#endif // GGML_OPENCL_SOA_Q

    ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
    GGML_ASSERT(extra);

    CL_CHECK(clEnqueueWriteBuffer(
        queue, extra->data_device, CL_TRUE, extra->offset + offset,
        size, data, 0, NULL, NULL));

    GGML_UNUSED(buffer);
}

static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    GGML_ASSERT(tensor->extra);

    ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);

    cl_context context = backend_ctx->context;
    cl_command_queue queue = backend_ctx->queue;

    // Make sure all previously submitted commands on other devices are finished.
    sync_with_other_backends(backend_ctx);

#ifdef GGML_OPENCL_SOA_Q
    // In end-to-end runs, get_tensor is usually used to get back the logits,
    // where we can simply do clEnqueueReadBuffer since they are f32.
    // However, in test-backend-ops, the GPU graph is copied to the CPU backend,
    // which requires reading back quantized weight tensors.
    // To properly support this, we need to restore block_q4_0 struct arrays
    // from the flattened buffers.
    if (tensor->type == GGML_TYPE_Q4_0) {
        ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *)tensor->extra;

#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
        if (use_adreno_kernels(backend_ctx, tensor)) {
            cl_int err;
            cl_kernel kernel;

            cl_int M = tensor->ne[1];   // ne01
            cl_int K = tensor->ne[0];   // ne00

            GGML_ASSERT(K % 32 == 0);
            GGML_ASSERT(M % 4 == 0);

            size_t size_q = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*ggml_blck_size(tensor->type)/2;
            size_t size_d = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t);
            GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");

            cl_mem buf_trans_q;
            cl_mem buf_trans_d;

            CL_CHECK((buf_trans_q = clCreateBuffer(context, CL_MEM_READ_WRITE,
                size_q, NULL, &err), err));
            CL_CHECK((buf_trans_d = clCreateBuffer(context, CL_MEM_READ_WRITE,
                size_d, NULL, &err), err));
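            // Note the comma-operator idiom above: the expression assigns the
            // new buffer first, then evaluates to `err`, which is what
            // CL_CHECK inspects.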

            kernel = backend_ctx->kernel_transpose_16_buf;

            // transpose q back
            cl_int stride_k_q = K/4;
            size_t local_size_q[3] = {64, 1, 1};
            size_t global_size_q[3] = {(size_t)M, (size_t)stride_k_q, 1};

            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_q));
            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_int), &M));
            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &stride_k_q));

            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
                global_size_q, local_size_q, 0, NULL, NULL));

            // transpose scales back
            cl_int stride_k_d = K/32;
            size_t local_size_d[3] = {64, 1, 1};
            size_t global_size_d[3] = {(size_t)M, (size_t)stride_k_d, 1};

            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->d));
            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_d));
            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_int), &M));
            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &stride_k_d));

            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
                global_size_d, local_size_d, 0, NULL, NULL));

            // unpack
            cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
                ggml_nbytes(tensor), NULL, &err);
            CL_CHECK(err);

            cl_uchar mask_0F = 0x0F;
            cl_uchar mask_F0 = 0xF0;

            size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
            size_t local_work_size[] = {1, 1, 1};

            kernel = backend_ctx->kernel_restore_block_q4_0_noshuffle;
            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &buf_trans_q));
            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem),   &buf_trans_d));
            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &data_device));
            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_uchar), &mask_0F));
            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_uchar), &mask_F0));

            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
                global_work_size, local_work_size, 0, NULL, NULL));

            // read back to host
            CL_CHECK(clEnqueueReadBuffer(
                queue, data_device, CL_TRUE, offset,
                size, data, 0, NULL, NULL));

            CL_CHECK(clReleaseMemObject(data_device));
            CL_CHECK(clReleaseMemObject(buf_trans_q));
            CL_CHECK(clReleaseMemObject(buf_trans_d));

            return;
        }
#endif

        cl_int err;
        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
            ggml_nbytes(tensor), NULL, &err);
        CL_CHECK(err);

        cl_kernel kernel = backend_ctx->kernel_restore_block_q4_0;
        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));

        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
        size_t local_work_size[] = {1, 1, 1};

        cl_event evt;
        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
            global_work_size, local_work_size, 0, NULL, &evt));
        CL_CHECK(clWaitForEvents(1, &evt));
        CL_CHECK(clEnqueueReadBuffer(
            queue, data_device, CL_TRUE, offset,
            size, data, 0, NULL, NULL));
        CL_CHECK(clReleaseMemObject(data_device));
        return;
    } else if (tensor->type == GGML_TYPE_MXFP4) {
        ggml_tensor_extra_cl_mxfp4 * extra = (ggml_tensor_extra_cl_mxfp4 *)tensor->extra;

        cl_int err;
        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
            ggml_nbytes(tensor), NULL, &err);
        CL_CHECK(err);

#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
        if (use_adreno_moe_kernels(backend_ctx, tensor)) {
            cl_kernel kernel = backend_ctx->kernel_restore_block_mxfp4_trans;

            int ne00 = tensor->ne[0];
            int ne01 = tensor->ne[1];
            int ne02 = tensor->ne[2];
            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->e));
            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &ne00));
            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_int), &ne01));

            size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 32), static_cast<size_t>(ne02)};
            size_t local_work_size[3] = {64, 2, 1};

            cl_event evt;
            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
                global_work_size, local_work_size, 0, NULL, &evt));
            CL_CHECK(clWaitForEvents(1, &evt));
            CL_CHECK(clEnqueueReadBuffer(
                queue, data_device, CL_TRUE, offset,
                size, data, 0, NULL, NULL));
            CL_CHECK(clReleaseMemObject(data_device));
            return;
        }
#endif
        cl_kernel kernel = backend_ctx->kernel_restore_block_mxfp4;
        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->e));
        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));

        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
        size_t local_work_size[] = {1, 1, 1};

        cl_event evt;
        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
            global_work_size, local_work_size, 0, NULL, &evt));
        CL_CHECK(clWaitForEvents(1, &evt));
        CL_CHECK(clEnqueueReadBuffer(
            queue, data_device, CL_TRUE, offset,
            size, data, 0, NULL, NULL));
        CL_CHECK(clReleaseMemObject(data_device));
        return;
    }
    if (tensor->type == GGML_TYPE_Q8_0) {
        ggml_tensor_extra_cl_q8_0 * extra = (ggml_tensor_extra_cl_q8_0 *)tensor->extra;

        cl_int err;
        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
            ggml_nbytes(tensor), NULL, &err);
        CL_CHECK(err);

#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
        if (enable_adreno_trans_weight(backend_ctx, tensor)) {
            cl_kernel kernel = backend_ctx->kernel_restore_block_q8_0_trans;

            int ne00 = tensor->ne[0];
            int ne01 = tensor->ne[1];
            // Guaranteed by enable_adreno_trans_weight (via use_adreno_kernels):
            // the transposed-weight path only handles non-batched matrices.
            GGML_ASSERT(tensor->ne[2] == 1);
            GGML_ASSERT(tensor->ne[3] == 1);

            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &ne00));
            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_int), &ne01));

            size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), 1, 1};
            size_t local_work_size[3] = {64, 1, 1};

            cl_event evt;
            CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
                global_work_size, local_work_size, 0, NULL, &evt));
            CL_CHECK(clWaitForEvents(1, &evt));

            CL_CHECK(clEnqueueReadBuffer(
                queue, data_device, CL_TRUE, offset,
                size, data, 0, NULL, NULL));
            CL_CHECK(clReleaseMemObject(data_device));
            return;
        }
#endif
        cl_kernel kernel = backend_ctx->kernel_restore_block_q8_0;
        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));

        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
        size_t local_work_size[] = {1, 1, 1};

        cl_event evt;
        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
            global_work_size, local_work_size, 0, NULL, &evt));
        CL_CHECK(clWaitForEvents(1, &evt));
        CL_CHECK(clEnqueueReadBuffer(
            queue, data_device, CL_TRUE, offset,
            size, data, 0, NULL, NULL));
        CL_CHECK(clReleaseMemObject(data_device));
        return;
    }
    if (tensor->type == GGML_TYPE_Q6_K) {
        ggml_tensor_extra_cl_q6_K * extra = (ggml_tensor_extra_cl_q6_K *)tensor->extra;

        cl_int err;
        cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
            ggml_nbytes(tensor), NULL, &err);
        CL_CHECK(err);

        cl_kernel kernel = backend_ctx->kernel_restore_block_q6_K;
        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->ql));
        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qh));
        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->s));
        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d));
        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device));

        size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
        size_t local_work_size[] = {1, 1, 1};

        cl_event evt;
        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
            global_work_size, local_work_size, 0, NULL, &evt));
        CL_CHECK(clWaitForEvents(1, &evt));
        CL_CHECK(clEnqueueReadBuffer(
            queue, data_device, CL_TRUE, offset,
            size, data, 0, NULL, NULL));
        CL_CHECK(clReleaseMemObject(data_device));
        return;
    }
#endif // GGML_OPENCL_SOA_Q

    ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;

    CL_CHECK(clEnqueueReadBuffer(
        queue, extra->data_device, CL_TRUE, extra->offset + tensor->view_offs + offset,
        size, data, 0, NULL, NULL));

    GGML_UNUSED(buffer);
}

static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    ggml_backend_dev_t dev = buffer->buft->device;
    ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev);
    cl_command_queue queue = backend_ctx->queue;

    ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
    for (cl_mem buf : ctx->buffer) {
        CL_CHECK(clEnqueueFillBuffer(queue, buf, &value, sizeof(value), 0, buffer->size, 0, NULL, NULL));
    }
    CL_CHECK(clFinish(queue));
}

static void ggml_backend_opencl_buffer_reset(ggml_backend_buffer_t buffer) {
    ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
    ctx->reset();
}

static ggml_backend_buffer_i ggml_backend_opencl_buffer_interface = {
    /* .free_buffer     = */ ggml_backend_opencl_buffer_free_buffer,
    /* .get_base        = */ ggml_backend_opencl_buffer_get_base,
    /* .init_tensor     = */ ggml_backend_opencl_buffer_init_tensor,
    /* .memset_tensor   = */ NULL,
    /* .set_tensor      = */ ggml_backend_opencl_buffer_set_tensor,
    /* .get_tensor      = */ ggml_backend_opencl_buffer_get_tensor,
    /* .cpy_tensor      = */ NULL,
    /* .clear           = */ ggml_backend_opencl_buffer_clear,
    /* .reset           = */ ggml_backend_opencl_buffer_reset,
};

//
// buffer type
//

static const char * ggml_backend_opencl_buffer_type_get_name(ggml_backend_buffer_type_t buffer_type) {
    return "OpenCL";

    GGML_UNUSED(buffer_type);
}

static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) {
    ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer_type->device);

    // clCreateBuffer returns -61 (CL_INVALID_BUFFER_SIZE) for size 0.
    size = std::max(size, (size_t)1);

    cl_int err;
    cl_mem mem = clCreateBuffer(backend_ctx->context, CL_MEM_READ_WRITE, size, NULL, &err);
    if (err != CL_SUCCESS) {
        GGML_LOG_INFO("%s: failed to allocate %.2f MiB\n", __func__, size / 1024.0 / 1024.0);
        return nullptr;
    }

    ggml_backend_opencl_buffer_context * ctx = new ggml_backend_opencl_buffer_context(mem);

    return ggml_backend_buffer_init(buffer_type, ggml_backend_opencl_buffer_interface, ctx, size);
}

static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
    ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device);
    return backend_ctx->alignment;
}

static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
    static size_t max_size = -1;
    if (max_size == (size_t)-1) {
        ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device);
        max_size = backend_ctx->max_alloc_size;
    }
    return max_size;
}

static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
    return ggml_backend_is_opencl(backend);

    UNUSED(buft);
}

static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
    /* .get_name         = */ ggml_backend_opencl_buffer_type_get_name,
    /* .alloc_buffer     = */ ggml_backend_opencl_buffer_type_alloc_buffer,
    /* .get_alignment    = */ ggml_backend_opencl_buffer_type_get_alignment,
    /* .get_max_size     = */ ggml_backend_opencl_buffer_type_get_max_size,
    /* .get_alloc_size   = */ NULL,
    /* .is_host          = */ NULL,
};

//
// backend device
//

static const char * ggml_backend_opencl_device_get_name(ggml_backend_dev_t dev) {
    return "GPUOpenCL";

    GGML_UNUSED(dev);
}

static const char * ggml_backend_opencl_device_get_description(ggml_backend_dev_t dev) {
    ggml_backend_opencl_device_context *dev_ctx = (ggml_backend_opencl_device_context *) dev->context;
    return dev_ctx->device_name.c_str();
}

static void ggml_backend_opencl_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
    *free = 0;
    *total = 0;

    GGML_UNUSED(dev);
}

static enum ggml_backend_dev_type ggml_backend_opencl_device_get_type(ggml_backend_dev_t dev) {
    return GGML_BACKEND_DEVICE_TYPE_GPU;

    GGML_UNUSED(dev);
}

static void ggml_backend_opencl_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
    props->name        = ggml_backend_opencl_device_get_name(dev);
    props->description = ggml_backend_opencl_device_get_description(dev);
    props->type        = ggml_backend_opencl_device_get_type(dev);
    ggml_backend_opencl_device_get_memory(dev, &props->memory_free, &props->memory_total);
    props->caps = ggml_backend_dev_caps {
        /* .async                 = */ false,
        /* .host_buffer           = */ false,
        /* .buffer_from_host_ptr  = */ false,
        /* .events                = */ false,
    };
}
 4858
 4859static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, const char * params) {
 4860    ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(dev);
 4861    // Getting a new reference to the backend, increase ref_count
 4862    backend_ctx->ref_count++;
 4863
 4864    ggml_backend_t backend = new ggml_backend {
 4865        /* .guid      = */ ggml_backend_opencl_guid(),
 4866        /* .interface = */ ggml_backend_opencl_i,
 4867        /* .device    = */ dev,
 4868        /* .context   = */ backend_ctx,
 4869    };
 4870
 4871    return backend;
 4872
 4873    GGML_UNUSED(params);
 4874}
 4875
 4876static ggml_backend_buffer_type_t ggml_backend_opencl_device_get_buffer_type(ggml_backend_dev_t dev) {
 4877    auto * dev_ctx = static_cast<ggml_backend_opencl_device_context *>(dev->context);
 4878
 4879    dev_ctx->buffer_type = ggml_backend_buffer_type{
 4880        /* .iface   = */ ggml_backend_opencl_buffer_type_interface,
 4881        /* .device  = */ dev,
 4882        /* .context = */ nullptr,
 4883    };
 4884
 4885    return &dev_ctx->buffer_type;
 4886}
 4887
 4888static ggml_backend_buffer_t ggml_backend_opencl_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
 4889    GGML_UNUSED(dev);
 4890    GGML_UNUSED(ptr);
 4891    GGML_UNUSED(size);
 4892    GGML_UNUSED(max_tensor_size);
 4893    return nullptr;
 4894}
 4895
 4896static bool ggml_backend_opencl_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
 4897    return ggml_opencl_supports_op(dev, op);
 4898}
 4899
 4900static bool ggml_backend_opencl_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
    // Check that 'dev' and 'buft' are objects belonging to this backend.
 4902    if (dev->iface.get_name != ggml_backend_opencl_device_get_name ||
 4903        buft->iface.get_name != ggml_backend_opencl_buffer_type_get_name) {
 4904        return false;
 4905    }
 4906
 4907    // Check cl_context is the same. clEnqueue* commands may not use
 4908    // buffers from another cl_context.
 4909    ggml_backend_opencl_context * backend_ctx0 = ggml_cl2_init(dev);
 4910    ggml_backend_opencl_context * backend_ctx1 = ggml_cl2_init(buft->device);
 4911    return backend_ctx0->context == backend_ctx1->context;
 4912}
 4913
 4914namespace /* anonymous */ {
 4915struct ggml_backend_device_i ggml_backend_opencl_device_i = {
 4916    /* .get_name             = */ ggml_backend_opencl_device_get_name,
 4917    /* .get_description      = */ ggml_backend_opencl_device_get_description,
 4918    /* .get_memory           = */ ggml_backend_opencl_device_get_memory,
 4919    /* .get_type             = */ ggml_backend_opencl_device_get_type,
 4920    /* .get_props            = */ ggml_backend_opencl_device_get_props,
 4921    /* .init_backend         = */ ggml_backend_opencl_device_init,
 4922    /* .get_buffer_type      = */ ggml_backend_opencl_device_get_buffer_type,
 4923    /* .get_host_buffer_type = */ NULL,
 4924    /* .buffer_from_host_ptr = */ ggml_backend_opencl_device_buffer_from_ptr,
 4925    /* .supports_op          = */ ggml_backend_opencl_device_supports_op,
 4926    /* .supports_buft        = */ ggml_backend_opencl_device_supports_buft,
 4927    /* .offload_op           = */ NULL,
 4928    /* .event_new            = */ NULL,
 4929    /* .event_free           = */ NULL,
 4930    /* .event_synchronize    = */ NULL,
 4931};
} // namespace
 4933
 4934// Backend registry
 4935
 4936static const char * ggml_backend_opencl_reg_get_name(ggml_backend_reg_t reg) {
 4937    return "OpenCL";
 4938
 4939    GGML_UNUSED(reg);
 4940}
 4941
 4942static size_t ggml_backend_opencl_reg_device_count(ggml_backend_reg_t reg) {
 4943    return g_ggml_backend_opencl_devices.size();
 4944
 4945    GGML_UNUSED(reg);
 4946}
 4947
 4948static ggml_backend_dev_t ggml_backend_opencl_reg_device_get(ggml_backend_reg_t reg, size_t index) {
 4949    GGML_ASSERT(index < ggml_backend_opencl_reg_device_count(reg));
 4950
 4951    return &g_ggml_backend_opencl_devices[index];
 4955}
 4956
 4957static struct ggml_backend_reg_i ggml_backend_opencl_reg_i = {
 4958    /* .get_name         = */ ggml_backend_opencl_reg_get_name,
 4959    /* .device_count     = */ ggml_backend_opencl_reg_device_count,
 4960    /* .device_get       = */ ggml_backend_opencl_reg_device_get,
 4961    /* .get_proc_address = */ NULL,
 4962};
 4963
 4964ggml_backend_reg_t ggml_backend_opencl_reg(void) {
 4965    static std::mutex mutex;
 4966    static ggml_backend_reg reg;
 4967    static bool initialized = false;
 4968    std::lock_guard<std::mutex> lock(mutex);
 4969
 4970    if (initialized) {
 4971        return &reg;
 4972    }
 4973    initialized = true;
 4974
 4975    g_ggml_backend_opencl_devices = ggml_opencl_probe_devices(&reg);
 4976
 4977    reg = ggml_backend_reg{
 4978        /* .api_version = */ GGML_BACKEND_API_VERSION,
 4979        /* .iface       = */ ggml_backend_opencl_reg_i,
 4980        /* .context     = */ NULL,
 4981    };
 4982
 4983    return &reg;
 4984}
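
// Illustrative only: enumerating the devices exposed by this registry,
// assuming the public ggml-backend registry API:
//
//   ggml_backend_reg_t reg = ggml_backend_opencl_reg();
//   for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
//       ggml_backend_dev_t dev = ggml_backend_reg_dev_get(reg, i);
//       printf("%s: %s\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev));
//   }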
 4985
 4986GGML_BACKEND_DL_IMPL(ggml_backend_opencl_reg)
 4987
 4988//------------------------------------------------------------------------------
 4989// Debugging utils
 4990//------------------------------------------------------------------------------
 4991#if 0
 4992#define QK4_0 32
 4993typedef struct {
 4994    ggml_fp16_t d;          // delta
 4995    uint8_t qs[QK4_0 / 2];  // nibbles / quants
 4996} block_q4_0;
 4997static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2,
 4998    "wrong q4_0 block size/padding");
 4999
 5000#include <math.h>
 5001#ifdef __cplusplus
 5002#include "half.hpp"
 5003#endif
 5004
 5005static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tensor) {
 5006    void * buf = malloc(ggml_nbytes(tensor));
 5007
 5008    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 5009    cl_command_queue queue = backend_ctx->queue;
 5010#ifdef GGML_OPENCL_SOA_Q
    void * buf_q = NULL;
    void * buf_d = NULL;
 5013#endif
 5014
 5015    // Make sure everything is done.
 5016    CL_CHECK(clFinish(queue));
 5017
 5018#ifdef GGML_OPENCL_SOA_Q
 5019    if (tensor->type == GGML_TYPE_Q4_0) {
 5020        ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *) tensor->extra;
 5021        GGML_ASSERT(extra);
 5022
 5023        size_t size_q = ggml_nelements(tensor)/QK4_0 * QK4_0/2;
 5024        size_t size_d = ggml_nelements(tensor)/QK4_0 * sizeof(ggml_fp16_t);
 5025        GGML_ASSERT(size_q + size_d == ggml_nbytes(tensor));
 5026        buf_q = malloc(size_q);
 5027        buf_d = malloc(size_d);
 5028
 5029        CL_CHECK(clEnqueueReadBuffer(queue, extra->q, CL_TRUE, 0, size_q, buf_q, 0, NULL, NULL));
 5030        CL_CHECK(clEnqueueReadBuffer(queue, extra->d, CL_TRUE, 0, size_d, buf_d, 0, NULL, NULL));
 5031        CL_CHECK(clFinish(queue));
 5032    } else if (tensor->type == GGML_TYPE_MXFP4) {
 5033        ggml_tensor_extra_cl_mxfp4 * extra = (ggml_tensor_extra_cl_mxfp4 *) tensor->extra;
 5034        GGML_ASSERT(extra);
 5035
 5036        size_t size_q = ggml_nelements(tensor)/QK_MXFP4 * QK_MXFP4/2;
 5037        size_t size_e = ggml_nelements(tensor)/QK_MXFP4 * sizeof(char);
 5038        GGML_ASSERT(size_q + size_e == ggml_nbytes(tensor));
 5039        buf_q = malloc(size_q);
 5040        buf_d = malloc(size_e);
 5041
 5042        CL_CHECK(clEnqueueReadBuffer(queue, extra->q, CL_TRUE, 0, size_q, buf_q, 0, NULL, NULL));
 5043        CL_CHECK(clEnqueueReadBuffer(queue, extra->d, CL_TRUE, 0, size_e, buf_d, 0, NULL, NULL));
 5044        CL_CHECK(clFinish(queue));
 5045    } else {
 5046        // Read out the tensor from GPU memory.
 5047        ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
 5048        GGML_ASSERT(extra);
 5049
 5050        CL_CHECK(clEnqueueReadBuffer(queue, extra->data_device, CL_TRUE,
 5051        extra->offset, ggml_nbytes(tensor), buf, 0, NULL, NULL));
 5052        CL_CHECK(clFinish(queue));
 5053    }
 5054#else
 5055    // Read out the tensor from GPU memory.
 5056    ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
 5057    GGML_ASSERT(extra);
 5058
 5059    CL_CHECK(clEnqueueReadBuffer(queue, extra->data_device, CL_TRUE,
 5060        extra->offset, ggml_nbytes(tensor), buf, 0, NULL, NULL));
 5061    CL_CHECK(clFinish(queue));
 5062#endif // GGML_OPENCL_SOA_Q
 5063
 5064    // Open file and dump.
 5065    char fname[512];
 5066    snprintf(fname, sizeof(fname), "./tensor-dumps/%s.txt", tensor->name);
    FILE * f = fopen(fname, "w");
    if (!f) {
        printf("Failed to open %s\n", fname);
        free(buf);
#ifdef GGML_OPENCL_SOA_Q
        free(buf_q);
        free(buf_d);
#endif
        return;
    }
 5072
 5073    if (tensor->type == GGML_TYPE_F32) {
 5074        float * data = (float *) buf;
 5075        for (int i = 0; i < ggml_nelements(tensor); ++i) {
 5076            if (isnan(data[i])) {
 5077                printf("NaN found: %s\n", tensor->name);
 5078                break;
 5079            }
 5080            fprintf(f, "%f\n", data[i]);
 5081        }
    } else if (tensor->type == GGML_TYPE_I32) {
        int * data = (int *) buf;
        for (int i = 0; i < ggml_nelements(tensor); ++i) {
            // NaN checks are meaningless for integer data; just dump the values.
            fprintf(f, "%d\n", data[i]);
        }
 5091    } else if (tensor->type == GGML_TYPE_F16) {
 5092#ifdef __cplusplus
 5093        half_float::half * data = (half_float::half *) buf;
 5094        for (int i = 0; i < ggml_nelements(tensor); ++i) {
 5095            if (std::isnan(data[i])) {
 5096                printf("NaN found: %s\n", tensor->name);
 5097                break;
 5098            }
 5099            fprintf(f, "%f\n", float(data[i]));
 5100        }
 5101#endif
 5102    } else if (tensor->type == GGML_TYPE_Q4_0) {
 5103#ifdef GGML_OPENCL_SOA_Q
 5104        ggml_fp16_t * data_d = (ggml_fp16_t *)buf_d;
 5105        unsigned char * data_q = (unsigned char *)buf_q;
 5106
 5107        for (int i = 0; i < ggml_nelements(tensor)/QK4_0; ++i) {
 5108            fprintf(f, "%04x, ", data_d[i]);
 5109            for (int k = 0; k < QK4_0/2; ++k) {
 5110                fprintf(f, "%02x, ", data_q[k]);
 5111            }
 5112            fprintf(f, "\n");
 5113            data_q += QK4_0/2;
 5114        }
 5117#else
 5118        block_q4_0 * data = (block_q4_0 *) buf;
 5119        for (int i = 0; i < ggml_nelements(tensor)/QK4_0; ++i) {
 5120            fprintf(f, "%04x, ", data[i].d);
 5121            for (int k = 0; k < QK4_0/2; ++k) {
 5122                fprintf(f, "%02x, ", data[i].qs[k]);
 5123            }
 5124            fprintf(f, "\n");
 5125        }
 5126#endif // GGML_OPENCL_SOA_Q
 5127    }
    free(buf);
#ifdef GGML_OPENCL_SOA_Q
    // Freed here (free(NULL) is a no-op) so the MXFP4 path, which allocates
    // these buffers but has no dump branch above, does not leak them.
    free(buf_q);
    free(buf_d);
#endif
 5129    fflush(f);
 5130    fclose(f);
 5131}
 5132#else
#define dump_tensor(backend, tensor)
 5134#endif
 5135
 5136//------------------------------------------------------------------------------
 5137// Ops
 5138//------------------------------------------------------------------------------
 5139
 5140static bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
 5141    const int64_t ne10 = src1->ne[0];
 5142
 5143    const int64_t ne0 = dst->ne[0];
 5144    const int64_t ne1 = dst->ne[1];
 5145
 5146    // TODO: find the optimal values for these
 5147    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
 5148            src1->type == GGML_TYPE_F32 &&
 5149             dst->type == GGML_TYPE_F32 &&
 5150            (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
 5151}
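
// Worked example of the gate above: a F16 4096x4096 weight against a F32
// src1 with ne10 = 4096 and a 4096x128 dst passes (ne0, ne1 and ne10 are
// all >= 32); the same weight against a single row (ne1 == 1) is rejected
// by this check.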
 5152
// Copy a noncontiguous tensor to a contiguous tensor. ne[] stays the same,
// but nb[] is recomputed so that the destination is contiguous.
 5155static void ggml_cl_copy_to_contiguous(ggml_backend_t backend, const ggml_tensor * src, cl_mem dst,
 5156                                       cl_ulong &nb0, cl_ulong &nb1, cl_ulong &nb2, cl_ulong &nb3) {
 5157    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 5158
    const size_t tensor_type_size = ggml_type_size(src->type);
 5160
 5161    const int ne00 = src->ne[0];
 5162    const int ne01 = src->ne[1];
 5163    const int ne02 = src->ne[2];
 5164    const int ne03 = src->ne[3];
 5165
 5166    const cl_ulong nb00 = src->nb[0];
 5167    const cl_ulong nb01 = src->nb[1];
 5168    const cl_ulong nb02 = src->nb[2];
 5169    const cl_ulong nb03 = src->nb[3];
 5170
 5171    const int ne0 = src->ne[0];
 5172    const int ne1 = src->ne[1];
 5173    const int ne2 = src->ne[2];
 5174    const int ne3 = src->ne[3];
 5175
 5176    nb0 = tensor_type_size;
 5177    nb1 = tensor_type_size*ne00;
 5178    nb2 = tensor_type_size*ne00*ne01;
 5179    nb3 = tensor_type_size*ne00*ne01*ne02;
 5180
 5181    ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *)src->extra;
 5182
 5183    cl_ulong offset0 = extra->offset + src->view_offs;
 5184    cl_ulong offsetd = 0;
 5185
 5186    cl_kernel kernel;
 5187
 5188    switch (src->type) {
 5189        case GGML_TYPE_F32:
 5190            kernel = backend_ctx->kernel_cpy_f32_f32;
 5191            break;
 5192        case GGML_TYPE_F16:
 5193            kernel = backend_ctx->kernel_cpy_f16_f16;
 5194            break;
 5195        default:
 5196            GGML_ASSERT(false && "not implemented");
 5197    }
 5198
 5199    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra->data_device));
 5200    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
 5201    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &dst));
 5202    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offsetd));
 5203    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(int),      &ne00));
 5204    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(int),      &ne01));
 5205    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne02));
 5206    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne03));
 5207    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb00));
 5208    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb01));
 5209    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
 5210    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
 5211    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne0));
 5212    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne1));
 5213    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &ne2));
 5214    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne3));
 5215    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb0));
 5216    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb1));
 5217    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb2));
 5218    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb3));
 5219
 5220    const int nth = MIN(64, ne00);
 5221
 5222    size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
 5223    size_t local_work_size[] = {(size_t)nth, 1, 1};
 5224
 5225    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src);
 5226}
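
// Worked example of the nb[] recomputation above: for a F32 view with
// ne = {8, 4, 2, 1}, the destination strides become nb0 = 4, nb1 = 32,
// nb2 = 128 and nb3 = 256 bytes, i.e. plain row-major packing regardless
// of how the source view was strided.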
 5227
 5228static void ggml_cl_nop(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 5229    UNUSED(backend);
 5230    UNUSED(src0);
 5231    UNUSED(src1);
 5232    UNUSED(dst);
 5233}
 5234
 5235static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 5236    GGML_ASSERT(src0);
 5237    GGML_ASSERT(src0->extra);
 5238    GGML_ASSERT(src1);
 5239    GGML_ASSERT(src1->extra);
 5240    GGML_ASSERT(dst);
 5241    GGML_ASSERT(dst->extra);
 5242
 5243    const int      ne00 = src0->ne[0];
 5244    const cl_ulong nb01 = src0->nb[1];
 5245    const cl_ulong nb02 = src0->nb[2];
 5246    const cl_ulong nb03 = src0->nb[3];
 5247    const int      ne10 = src1->ne[0];
 5248    const cl_ulong nb10 = src1->nb[0];
 5249    const int      ne11 = src1->ne[1];
 5250    const int      ne12 = src1->ne[2];
 5251    const cl_ulong nb11 = src1->nb[1];
 5252    const cl_ulong nb12 = src1->nb[2];
 5253    const cl_ulong nb1  = dst->nb[1];
 5254    const cl_ulong nb2  = dst->nb[2];
 5255    const cl_ulong nb3  = dst->nb[3];
 5256
 5257    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 5258
 5259    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
 5260    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
 5261    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 5262
 5263    cl_ulong offset0 = extra0->offset + src0->view_offs;
 5264    cl_ulong offset1 = extra1->offset + src1->view_offs;
 5265    cl_ulong offsetd = extrad->offset + dst->view_offs;
 5266
 5267    cl_kernel kernel;
 5268
 5269    switch (src0->type) {
 5270        case GGML_TYPE_F32:
 5271            kernel = backend_ctx->kernel_get_rows_f32;
 5272            break;
 5273        case GGML_TYPE_F16:
 5274            kernel = backend_ctx->kernel_get_rows_f16;
 5275            break;
 5276        case GGML_TYPE_Q4_0:
 5277            kernel = backend_ctx->kernel_get_rows_q4_0;
 5278            break;
 5279        default:
 5280            GGML_ASSERT(false && "not implemented");
 5281    }
 5282
 5283    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
 5284    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
 5285    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
 5286    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
 5287    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
 5288    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
 5289    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
 5290    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &nb01));
 5291    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb02));
 5292    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb03));
 5293    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne10));
 5294    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb10));
 5295    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
 5296    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
 5297    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb1));
 5298    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb2));
 5299    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb3));
 5300
 5301    size_t global_work_size[] = {(size_t)ne10*64, (size_t)ne11, (size_t)ne12};
 5302    size_t local_work_size[] = {64, 1, 1};
 5303
 5304    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 5305}
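
// For reference: each destination row indexed by (i10, i11, i12) is the
// src0 row selected by src1[i10, i11, i12] (dequantized to F32 for the
// quantized kernel); the launch above assigns one 64-wide work-group per
// index element.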
 5306
 5307static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 5308    GGML_ASSERT(src0);
 5309    GGML_ASSERT(src0->extra);
 5310    GGML_ASSERT(src1);
 5311    GGML_ASSERT(src1->extra);
 5312    GGML_ASSERT(dst);
 5313    GGML_ASSERT(dst->extra);
 5314    GGML_ASSERT(src1->type == GGML_TYPE_I64 || src1->type == GGML_TYPE_I32);
 5315
 5316    // ne0 = ne00
 5317    // ne2 = ne02
 5318    // ne3 = ne03
 5319
 5320    const int      ne01 = src0->ne[1];
 5321    const int      ne02 = src0->ne[2];
 5322    const int      ne03 = src0->ne[3];
 5323
 5324    const cl_ulong nb01 = src0->nb[1];
 5325    const cl_ulong nb02 = src0->nb[2];
 5326    const cl_ulong nb03 = src0->nb[3];
 5327
 5328    const int      ne11 = src1->ne[1];
 5329    const int      ne12 = src1->ne[2];
 5330
 5331    const cl_ulong nb10 = src1->nb[0];
 5332    const cl_ulong nb11 = src1->nb[1];
 5333    const cl_ulong nb12 = src1->nb[2];
 5334
 5335    const int      ne0  = dst->ne[0];
 5336
 5337    const cl_ulong nb1  = dst->nb[1];
 5338    const cl_ulong nb2  = dst->nb[2];
 5339    const cl_ulong nb3  = dst->nb[3];
 5340
 5341    const int nblk0 = ne0/ggml_blck_size(dst->type);
 5342
 5343    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 5344
 5345    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
 5346    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
 5347    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 5348
 5349    cl_ulong offset0 = extra0->offset + src0->view_offs;
 5350    cl_ulong offset1 = extra1->offset + src1->view_offs;
 5351    cl_ulong offsetd = extrad->offset + dst->view_offs;
 5352
 5353    cl_kernel kernel;
 5354
 5355    switch (dst->type) {
 5356        case GGML_TYPE_F32:
 5357            if (src1->type == GGML_TYPE_I64) {
 5358                kernel = backend_ctx->kernel_set_rows_f32_i64;
 5359            } else {
 5360                kernel = backend_ctx->kernel_set_rows_f32_i32;
 5361            }
 5362            break;
 5363        case GGML_TYPE_F16:
 5364            if (src1->type == GGML_TYPE_I64) {
 5365                kernel = backend_ctx->kernel_set_rows_f16_i64;
 5366            } else {
 5367                kernel = backend_ctx->kernel_set_rows_f16_i32;
 5368            }
 5369            break;
 5370        default:
 5371            GGML_ABORT("not implemented");
 5372    }
 5373
 5374    fastdiv_vals ne11_ = init_fastdiv_values(ne11);
 5375    fastdiv_vals ne12_ = init_fastdiv_values(ne12);
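
    // Worked example of the fastdiv precomputation (see init_fastdiv_values):
    // for d = 3, L = 2 and mp = 2^32 * (2^2 - 3) / 3 + 1 = 1431655766, so for
    // n = 9: (mulhi(9, mp) + 9) >> 2 = (3 + 9) >> 2 = 3 = 9/3.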
 5376
 5377    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
 5378    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
 5379    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
 5380    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
 5381    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
 5382    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
 5383    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne01));
 5384    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &nb01));
 5385    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb02));
 5386    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb03));
 5387    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(fastdiv_vals), &ne11_));
 5388    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(fastdiv_vals), &ne12_));
 5389    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb10));
 5390    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb11));
 5391    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb12));
 5392    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &nblk0));
 5393    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb1));
 5394    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb2));
 5395    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb3));
 5396
 5397    int nth0 = 64;
 5398    if (backend_ctx->gpu_family == INTEL) {
 5399        nth0 = 32;
 5400    } else if (backend_ctx->gpu_family == ADRENO) {
 5401        nth0 = 64;
 5402    }
 5403
 5404    int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
    // Grow nth0 by powers of two without exceeding the kernel's work-group limit.
    while (nth0 < nblk0 && nth0 * 2 <= max_workgroup_size) {
        nth0 *= 2;
    }
 5408
 5409    int rows_per_workgroup = 1;
 5410    if (nth0 > nblk0) {
 5411        rows_per_workgroup = nth0 / nblk0;
 5412        nth0 = nblk0;
 5413    }
 5414
 5415    size_t global_work_size[] = {
 5416        (size_t)(ne01 + rows_per_workgroup - 1)/rows_per_workgroup*nth0,
 5417        (size_t)ne02*rows_per_workgroup,
 5418        (size_t)ne03};
 5419    size_t local_work_size[] = {(size_t)nth0, (size_t)rows_per_workgroup, 1};
 5420
 5421    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 5422}
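
// Worked example of the work-group sizing above: with nblk0 = 16 and an
// initial nth0 = 64, nth0 exceeds nblk0, so rows_per_workgroup becomes
// 64/16 = 4 and nth0 shrinks to 16; each 64-item work-group then covers
// 4 rows of 16 blocks instead of idling 48 work-items on a single row.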
 5423
 5424static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 5425    GGML_ASSERT(src0);
 5426    GGML_ASSERT(src0->extra);
 5427    GGML_ASSERT(src1);
 5428    GGML_ASSERT(src1->extra);
 5429    GGML_ASSERT(dst);
 5430    GGML_ASSERT(dst->extra);
 5431
 5432    const int ne00 = src0->ne[0];
 5433    const int ne01 = src0->ne[1];
 5434    const int ne02 = src0->ne[2];
 5435    const int ne03 = src0->ne[3];
 5436
 5437    const cl_ulong nb00 = src0->nb[0];
 5438    const cl_ulong nb01 = src0->nb[1];
 5439    const cl_ulong nb02 = src0->nb[2];
 5440    const cl_ulong nb03 = src0->nb[3];
 5441
 5442    const int ne10 = src1->ne[0];
 5443    const int ne11 = src1->ne[1];
 5444    const int ne12 = src1->ne[2];
 5445    const int ne13 = src1->ne[3];
 5446
 5447    const cl_ulong nb10 = src1->nb[0];
 5448    const cl_ulong nb11 = src1->nb[1];
 5449    const cl_ulong nb12 = src1->nb[2];
 5450    const cl_ulong nb13 = src1->nb[3];
 5451
 5452    const int ne0  = dst->ne[0];
 5453    const int ne1  = dst->ne[1];
 5454    const int ne2  = dst->ne[2];
 5455    const int ne3  = dst->ne[3];
 5456
 5457    const cl_ulong nb0  = dst->nb[0];
 5458    const cl_ulong nb1  = dst->nb[1];
 5459    const cl_ulong nb2  = dst->nb[2];
 5460    const cl_ulong nb3  = dst->nb[3];
 5461
 5462    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 5463
 5464    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
 5465    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
 5466    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 5467
 5468    cl_ulong offset0 = extra0->offset + src0->view_offs;
 5469    cl_ulong offset1 = extra1->offset + src1->view_offs;
 5470    cl_ulong offsetd = extrad->offset + dst->view_offs;
 5471
 5472    cl_kernel kernel;
 5473
 5474    const bool bcast_row = ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0;
 5475
 5476    if (bcast_row) {
 5477        GGML_ASSERT(ggml_is_contiguous(src0));
 5478        GGML_ASSERT(ne11 == 1);
 5479    }
 5480
 5481    if (dst->type == GGML_TYPE_F32) {
 5482        GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32);
 5483        if (bcast_row) {
 5484            kernel = backend_ctx->kernel_add_row;
 5485            const int ne = ne00 / 4;
 5486            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
 5487            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
 5488            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extra1->data_device));
 5489            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
 5490            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem),   &extrad->data_device));
 5491            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
 5492            CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &ne));
 5493        } else {
 5494            kernel = backend_ctx->kernel_add;
 5495            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
 5496            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
 5497            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
 5498            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
 5499            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
 5500            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
 5501            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
 5502            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
 5503            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
 5504            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne03));
 5505            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
 5506            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
 5507            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
 5508            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
 5509            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &ne10));
 5510            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne11));
 5511            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &ne12));
 5512            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &ne13));
 5513            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10));
 5514            CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11));
 5515            CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12));
 5516            CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13));
 5517            CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int),      &ne0));
 5518            CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int),      &ne1));
 5519            CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int),      &ne2));
 5520            CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int),      &ne3));
 5521            CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0));
 5522            CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1));
 5523            CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2));
 5524            CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3));
 5525        }
 5526    } else if (dst->type == GGML_TYPE_F16) {
 5527        GGML_ASSERT(src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F32);
 5528        GGML_ASSERT(src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32);
 5529        const int type_src0 = (src0->type == GGML_TYPE_F32);
 5530        const int type_src1 = (src1->type == GGML_TYPE_F32);
 5531        if (bcast_row) {
 5532            kernel = backend_ctx->kernel_add_row_f16;
 5533            const int ne = ne00 / 4;
 5534            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
 5535            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
 5536            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extra1->data_device));
 5537            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
 5538            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem),   &extrad->data_device));
 5539            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
 5540            CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &ne));
 5541            CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int),      &type_src0));
 5542            CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int),      &type_src1));
 5543        } else {
 5544            kernel = backend_ctx->kernel_add_f16;
 5545            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
 5546            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
 5547            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
 5548            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
 5549            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
 5550            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
 5551            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
 5552            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
 5553            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
 5554            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne03));
 5555            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
 5556            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
 5557            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
 5558            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
 5559            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &ne10));
 5560            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne11));
 5561            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &ne12));
 5562            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &ne13));
 5563            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10));
 5564            CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11));
 5565            CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12));
 5566            CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13));
 5567            CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int),      &ne0));
 5568            CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int),      &ne1));
 5569            CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int),      &ne2));
 5570            CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int),      &ne3));
 5571            CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0));
 5572            CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1));
 5573            CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2));
 5574            CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3));
 5575            CL_CHECK(clSetKernelArg(kernel, 30, sizeof(int),      &type_src0));
 5576            CL_CHECK(clSetKernelArg(kernel, 31, sizeof(int),      &type_src1));
 5577        }
 5578    } else {
 5579        GGML_ASSERT(false && "unsupported data types for add");
 5580    }
 5581
 5582    if (bcast_row) {
 5583        int n = ggml_nelements(dst)/4;
 5584        size_t global_work_size[] = {(size_t)n, 1, 1};
 5585        size_t local_work_size[] = {64, 1, 1};
 5586
 5587        size_t * local_work_size_ptr = local_work_size;
 5588        if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
 5589            local_work_size_ptr = nullptr;
 5590        }
 5591
 5592        backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size_ptr, dst);
 5593    } else {
 5594        unsigned int nth = MIN(64, ne0);
 5595        size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
 5596        size_t local_work_size[] = {nth, 1, 1};
 5597
 5598        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 5599    }
 5600}
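
// Worked example of the bcast_row fast path above: adding a contiguous F32
// row of shape {4096, 1, 1, 1} to a {4096, 32, 1, 1} tensor satisfies
// ggml_nelements(src1) == ne10 with ne00 % 4 == 0, so kernel_add_row runs
// over ggml_nelements(dst)/4 = 32768 float4 elements instead of the general
// strided kernel.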
 5601
 5602static void ggml_cl_add_id(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 5603    GGML_ASSERT(src0);
 5604    GGML_ASSERT(src0->extra);
 5605    GGML_ASSERT(src1);
 5606    GGML_ASSERT(src1->extra);
 5607    GGML_ASSERT(dst);
 5608    GGML_ASSERT(dst->extra);
 5609
 5610    const ggml_tensor * src2 = dst->src[2];
 5611    GGML_ASSERT(src2);
 5612    GGML_ASSERT(src2->extra);
 5613
 5614    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 5615    GGML_ASSERT(src1->type == GGML_TYPE_F32);
 5616    GGML_ASSERT(src2->type == GGML_TYPE_I32);
 5617    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
 5618
 5619    GGML_ASSERT(ggml_is_contiguous_rows(src0));
 5620
 5621    const int ne00 = src0->ne[0];
 5622    const int ne01 = src0->ne[1];
 5623    const int ne02 = src0->ne[2];
 5624
 5625    const cl_ulong nb01 = src0->nb[1];
 5626    const cl_ulong nb02 = src0->nb[2];
 5627
 5628    const cl_ulong nb11 = src1->nb[1];
 5629
 5630    const cl_ulong nb21 = src2->nb[1];
 5631
 5632    const int ne0 = dst->ne[0];
 5633    const int ne1 = dst->ne[1];
 5634
 5635    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 5636
 5637    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
 5638    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
 5639    ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
 5640    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 5641
 5642    cl_ulong offset0 = extra0->offset + src0->view_offs;
 5643    cl_ulong offset1 = extra1->offset + src1->view_offs;
 5644    cl_ulong offset2 = extra2->offset + src2->view_offs;
 5645    cl_ulong offsetd = extrad->offset + dst->view_offs;
 5646
 5647    cl_kernel kernel = backend_ctx->kernel_add_id;
 5648
 5649    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
 5650    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
 5651    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
 5652    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
 5653    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extra2->data_device));
 5654    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offset2));
 5655    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_mem),   &extrad->data_device));
 5656    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &offsetd));
 5657    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb01));
 5658    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb02));
 5659    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb11));
 5660    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb21));
 5661    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne0));
 5662    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne1));
 5663
 5664    int nth = MIN(ne00, (int) backend_ctx->get_kernel_workgroup_size(kernel));
 5665    size_t global_work_size[] = { (size_t)ne01*nth, (size_t)ne02, 1 };
 5666    size_t local_work_size[] = { (size_t)nth, 1, 1 };
 5667
 5668    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 5669}
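
// For reference, assuming the usual MoE usage of GGML_OP_ADD_ID in ggml:
// src0 holds per-token rows, src1 a bank of per-expert bias rows, and src2
// the selected expert ids, so each dst row is the corresponding src0 row
// plus the src1 row picked by the id in src2.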
 5670
 5671static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 5672    GGML_ASSERT(src0);
 5673    GGML_ASSERT(src0->extra);
 5674    GGML_ASSERT(src1);
 5675    GGML_ASSERT(src1->extra);
 5676    GGML_ASSERT(dst);
 5677    GGML_ASSERT(dst->extra);
 5678
 5679    GGML_ASSERT(src0->type == src1->type);
 5680    GGML_ASSERT(src0->type == dst->type);
 5681    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
 5682
 5683    const int ne00 = src0->ne[0];
 5684    const int ne01 = src0->ne[1];
 5685    const int ne02 = src0->ne[2];
 5686    const int ne03 = src0->ne[3];
 5687
 5688    const cl_ulong nb00 = src0->nb[0];
 5689    const cl_ulong nb01 = src0->nb[1];
 5690    const cl_ulong nb02 = src0->nb[2];
 5691    const cl_ulong nb03 = src0->nb[3];
 5692
 5693    const int ne10 = src1->ne[0];
 5694    const int ne11 = src1->ne[1];
 5695    const int ne12 = src1->ne[2];
    const int ne13 = src1->ne[3];
 5697
 5698    const cl_ulong nb10 = src1->nb[0];
 5699    const cl_ulong nb11 = src1->nb[1];
 5700    const cl_ulong nb12 = src1->nb[2];
    const cl_ulong nb13 = src1->nb[3];
 5702
 5703    const int ne0  = dst->ne[0];
 5704    const int ne1  = dst->ne[1];
 5705    const int ne2  = dst->ne[2];
 5706    const int ne3  = dst->ne[3];
 5707
 5708    const cl_ulong nb0  = dst->nb[0];
 5709    const cl_ulong nb1  = dst->nb[1];
 5710    const cl_ulong nb2  = dst->nb[2];
 5711    const cl_ulong nb3  = dst->nb[3];
 5712
 5713    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 5714
 5715    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
 5716    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
 5717    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 5718
 5719    cl_ulong offset0 = extra0->offset + src0->view_offs;
 5720    cl_ulong offset1 = extra1->offset + src1->view_offs;
 5721    cl_ulong offsetd = extrad->offset + dst->view_offs;
 5722
 5723    bool bcast_row = false;
 5724    cl_kernel kernel;
 5725
 5726    if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
 5727        GGML_ASSERT(ggml_is_contiguous(src0));
 5728
 5729        // src1 is a row
 5730        GGML_ASSERT(ne11 == 1);
 5731
 5732        bcast_row = true;
 5733        int ne = ne00 / 4;
 5734
 5735        if (src0->type == GGML_TYPE_F32) {
 5736            kernel = backend_ctx->kernel_mul_row;
 5737        } else {
 5738            kernel = backend_ctx->kernel_mul_row_f16;
 5739        }
 5740
 5741        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
 5742        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
 5743        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extra1->data_device));
 5744        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
 5745        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem),   &extrad->data_device));
 5746        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
 5747        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &ne));
 5748    } else {
 5749        if (src0->type == GGML_TYPE_F32) {
 5750            kernel = backend_ctx->kernel_mul;
 5751        } else {
 5752            kernel = backend_ctx->kernel_mul_f16;
 5753        }
 5754
 5755        CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
 5756        CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
 5757        CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
 5758        CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
 5759        CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
 5760        CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
 5761        CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
 5762        CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
 5763        CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
 5764        CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne03));
 5765        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
 5766        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
 5767        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
 5768        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
 5769        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &ne10));
 5770        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne11));
 5771        CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &ne12));
 5772        CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &ne13));
 5773        CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10));
 5774        CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11));
 5775        CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12));
 5776        CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13));
 5777        CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int),      &ne0));
 5778        CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int),      &ne1));
 5779        CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int),      &ne2));
 5780        CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int),      &ne3));
 5781        CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0));
 5782        CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1));
 5783        CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2));
 5784        CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3));
 5785    }
 5786
 5787    if (bcast_row) {
 5788        int n = ggml_nelements(dst)/4;
 5789        size_t global_work_size[] = {(size_t)n, 1, 1};
 5790        size_t local_work_size[] = {64, 1, 1};
 5791
 5792        size_t * local_work_size_ptr = local_work_size;
 5793        if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
 5794            local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
 5795        }
 5796
 5797        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 5798    } else {
 5799        unsigned int nth = MIN(64, ne0);
        size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
 5801        size_t local_work_size[] = {nth, 1, 1};
 5802
 5803        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 5804    }
 5805}
 5806
 5807static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 5808    GGML_ASSERT(src0);
 5809    GGML_ASSERT(src0->extra);
 5810    GGML_ASSERT(src1);
 5811    GGML_ASSERT(src1->extra);
 5812    GGML_ASSERT(dst);
 5813    GGML_ASSERT(dst->extra);
 5814
 5815    GGML_ASSERT(src0->type == src1->type);
 5816    GGML_ASSERT(src0->type == dst->type);
 5817    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
 5818
 5819    const int ne00 = src0->ne[0];
 5820    const int ne01 = src0->ne[1];
 5821    const int ne02 = src0->ne[2];
 5822    const int ne03 = src0->ne[3];
 5823
 5824    const cl_ulong nb00 = src0->nb[0];
 5825    const cl_ulong nb01 = src0->nb[1];
 5826    const cl_ulong nb02 = src0->nb[2];
 5827    const cl_ulong nb03 = src0->nb[3];
 5828
 5829    const int ne10 = src1->ne[0];
 5830    const int ne11 = src1->ne[1];
 5831    const int ne12 = src1->ne[2];
 5832    const int ne13 = src1->ne[3];
 5833
 5834    const cl_ulong nb10 = src1->nb[0];
 5835    const cl_ulong nb11 = src1->nb[1];
 5836    const cl_ulong nb12 = src1->nb[2];
 5837    const cl_ulong nb13 = src1->nb[3];
 5838
 5839    const int ne0  = dst->ne[0];
 5840
 5841    const cl_ulong nb0  = dst->nb[0];
 5842    const cl_ulong nb1  = dst->nb[1];
 5843    const cl_ulong nb2  = dst->nb[2];
 5844    const cl_ulong nb3  = dst->nb[3];
 5845
 5846    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 5847
 5848    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
 5849    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
 5850    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 5851
 5852    cl_ulong offset0 = extra0->offset + src0->view_offs;
 5853    cl_ulong offset1 = extra1->offset + src1->view_offs;
 5854    cl_ulong offsetd = extrad->offset + dst->view_offs;
 5855
 5856    bool bcast_row = false;
 5857    cl_kernel kernel;
 5858
 5859    if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
 5860        GGML_ASSERT(ggml_is_contiguous(src0));
 5861
 5862        // src1 is a row
 5863        GGML_ASSERT(ne11 == 1);
 5864
 5865        bcast_row = true;
 5866        int ne = ne00 / 4;
 5867
 5868        if (src0->type == GGML_TYPE_F32) {
 5869            kernel = backend_ctx->kernel_div_row;
 5870        } else {
 5871            kernel = backend_ctx->kernel_div_row_f16;
 5872        }
 5873
 5874        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
 5875        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
 5876        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extra1->data_device));
 5877        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
 5878        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem),   &extrad->data_device));
 5879        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
 5880        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &ne));
 5881    } else {
 5882        if (src0->type == GGML_TYPE_F32) {
 5883            kernel = backend_ctx->kernel_div;
 5884        } else {
 5885            kernel = backend_ctx->kernel_div_f16;
 5886        }
 5887
 5888        CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
 5889        CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
 5890        CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
 5891        CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
 5892        CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
 5893        CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
 5894        CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_ulong), &nb00));
 5895        CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &nb01));
 5896        CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb02));
 5897        CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb03));
 5898        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne10));
 5899        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne11));
 5900        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne12));
 5901        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne13));
 5902        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
 5903        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
 5904        CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
 5905        CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
 5906        CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &ne0));
 5907        CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
 5908        CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
 5909        CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
 5910        CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
 5911    }
 5912
    if (bcast_row) {
        int n = ggml_nelements(dst)/4;
        size_t global_work_size[] = {(size_t)n, 1, 1};
        size_t local_work_size[] = {64, 1, 1};

        size_t * local_work_size_ptr = local_work_size;
        if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
            local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
        }

        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 5919    } else {
 5920        unsigned int nth = MIN(64, ne0);
        size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
 5922        size_t local_work_size[] = {nth, 1, 1};
 5923
 5924        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 5925    }
 5926}
 5927
 5928static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 5929    GGML_ASSERT(src0);
 5930    GGML_ASSERT(src0->extra);
 5931    GGML_ASSERT(src1);
 5932    GGML_ASSERT(src1->extra);
 5933    GGML_ASSERT(dst);
 5934    GGML_ASSERT(dst->extra);
 5935
 5936    GGML_ASSERT(src0->type == src1->type);
 5937    GGML_ASSERT(src0->type == dst->type);
 5938    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
 5939
 5940    const int ne00 = src0->ne[0];
 5941    const int ne01 = src0->ne[1];
 5942    const int ne02 = src0->ne[2];
 5943    const int ne03 = src0->ne[3];
 5944
 5945    const cl_ulong nb00 = src0->nb[0];
 5946    const cl_ulong nb01 = src0->nb[1];
 5947    const cl_ulong nb02 = src0->nb[2];
 5948    const cl_ulong nb03 = src0->nb[3];
 5949
 5950    const int ne10 = src1->ne[0];
 5951    const int ne11 = src1->ne[1];
 5952    const int ne12 = src1->ne[2];
 5953    const int ne13 = src1->ne[3];
 5954
 5955    const cl_ulong nb10 = src1->nb[0];
 5956    const cl_ulong nb11 = src1->nb[1];
 5957    const cl_ulong nb12 = src1->nb[2];
 5958    const cl_ulong nb13 = src1->nb[3];
 5959
 5960    const int ne0  = dst->ne[0];
 5961
 5962    const cl_ulong nb0  = dst->nb[0];
 5963    const cl_ulong nb1  = dst->nb[1];
 5964    const cl_ulong nb2  = dst->nb[2];
 5965    const cl_ulong nb3  = dst->nb[3];
 5966
 5967    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 5968
 5969    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
 5970    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
 5971    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 5972
 5973    cl_ulong offset0 = extra0->offset + src0->view_offs;
 5974    cl_ulong offset1 = extra1->offset + src1->view_offs;
 5975    cl_ulong offsetd = extrad->offset + dst->view_offs;
 5976
 5977    bool bcast_row = false;
 5978    cl_kernel kernel;
 5979
 5980    if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
 5981        GGML_ASSERT(ggml_is_contiguous(src0));
 5982
 5983        // src1 is a row
 5984        GGML_ASSERT(ne11 == 1);
 5985
 5986        bcast_row = true;
 5987        int ne = ne00 / 4;
 5988
 5989        if (src0->type == GGML_TYPE_F32) {
 5990            kernel = backend_ctx->kernel_sub_row;
 5991        } else {
 5992            kernel = backend_ctx->kernel_sub_row_f16;
 5993        }
 5994
 5995        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
 5996        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
 5997        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extra1->data_device));
 5998        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
 5999        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem),   &extrad->data_device));
 6000        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
 6001        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &ne));
 6002    } else {
 6003        if (src0->type == GGML_TYPE_F32) {
 6004            kernel = backend_ctx->kernel_sub;
 6005        } else {
 6006            kernel = backend_ctx->kernel_sub_f16;
 6007        }
 6008
 6009        CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
 6010        CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
 6011        CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
 6012        CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
 6013        CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
 6014        CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
 6015        CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_ulong), &nb00));
 6016        CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &nb01));
 6017        CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb02));
 6018        CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb03));
 6019        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne10));
 6020        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne11));
 6021        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne12));
 6022        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne13));
 6023        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
 6024        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
 6025        CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
 6026        CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
 6027        CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &ne0));
 6028        CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
 6029        CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
 6030        CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
 6031        CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
 6032    }
 6033
    if (bcast_row) {
        int n = ggml_nelements(dst)/4;
        size_t global_work_size[] = {(size_t)n, 1, 1};
        size_t local_work_size[] = {64, 1, 1};

        size_t * local_work_size_ptr = local_work_size;
        if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
            local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
        }

        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 6040    } else {
 6041        unsigned int nth = MIN(64, ne0);
        size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
 6043        size_t local_work_size[] = {nth, 1, 1};
 6044
 6045        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 6046    }
 6047}
 6048
 6049static void ggml_cl_sqr(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 6050    GGML_ASSERT(src0);
 6051    GGML_ASSERT(src0->extra);
 6052    GGML_ASSERT(dst);
 6053    GGML_ASSERT(dst->extra);
 6054    UNUSED(src1);
 6055
 6056    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 6057
 6058    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
 6059    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 6060
 6061    cl_ulong offset0 = extra0->offset + src0->view_offs;
 6062    cl_ulong offsetd = extrad->offset + dst->view_offs;
 6063
 6064    cl_kernel kernel;
 6065
 6066    // Currently assumes src0 is contiguous
 6067    int n = ggml_nelements(dst);
 6068    if (n % 4 == 0) {
 6069        if (src0->type == GGML_TYPE_F32) {
 6070            kernel = backend_ctx->kernel_sqr_cont_f32_4;
 6071        } else {
 6072            kernel = backend_ctx->kernel_sqr_cont_f16_4;
 6073        }
 6074        n /= 4;
 6075    } else {
 6076        if (src0->type == GGML_TYPE_F32) {
 6077            kernel = backend_ctx->kernel_sqr_cont_f32;
 6078        } else {
 6079            kernel = backend_ctx->kernel_sqr_cont_f16;
 6080        }
 6081    }
 6082
 6083    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
 6084    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
 6085    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
 6086    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
 6087
 6088    size_t global_work_size[] = {(size_t)n, 1, 1};
 6089    size_t local_work_size[] = {64, 1, 1};
 6090
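    // Drivers without non-uniform work-group support require the global size
    // to be a multiple of the local size; when it is not, pass NULL and let
    // the driver pick the work-group size.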
    size_t * local_work_size_ptr = local_work_size;
    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
        local_work_size_ptr = nullptr;
    }

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
}

static void ggml_cl_sqrt(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);
    UNUSED(src1);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    cl_kernel kernel;

    // Currently assumes src0 is contiguous
    int n = ggml_nelements(dst);
    if (n % 4 == 0) {
        if (src0->type == GGML_TYPE_F32) {
            kernel = backend_ctx->kernel_sqrt_cont_f32_4;
        } else {
            kernel = backend_ctx->kernel_sqrt_cont_f16_4;
        }
        n /= 4;
    } else {
        if (src0->type == GGML_TYPE_F32) {
            kernel = backend_ctx->kernel_sqrt_cont_f32;
        } else {
            kernel = backend_ctx->kernel_sqrt_cont_f16;
        }
    }

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));

    size_t global_work_size[] = {(size_t)n, 1, 1};
    size_t local_work_size[] = {64, 1, 1};

    size_t * local_work_size_ptr = local_work_size;
    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
        local_work_size_ptr = nullptr;
    }

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
}

static void ggml_cl_mean(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);
    GGML_UNUSED(src1);

    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
    GGML_ASSERT(ggml_is_contiguous(src0));

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    const int ne00 = src0->ne[0];
    const int ne01 = src0->ne[1];
    const int ne02 = src0->ne[2];
    const int ne03 = src0->ne[3];

    const cl_ulong nb01 = src0->nb[1];
    const cl_ulong nb02 = src0->nb[2];
    const cl_ulong nb03 = src0->nb[3];

    const cl_ulong nb1  = dst->nb[1];
    const cl_ulong nb2  = dst->nb[2];
    const cl_ulong nb3  = dst->nb[3];

    cl_kernel kernel = backend_ctx->kernel_mean_f32;

    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(int),      &ne00));
    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(int),      &ne01));
    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne02));
    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne03));
    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb01));
    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb02));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb1));
    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb2));
    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb3));

    size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
    size_t local_work_size[] = {(size_t)64, 1, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}

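// 1D convolution used by SSM (Mamba-style) blocks. A float4-vectorized kernel
// is selected when the inner dimension ne10 is a multiple of 4.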
static void ggml_cl_ssm_conv(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(src1);
    GGML_ASSERT(src1->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offset1 = extra1->offset + src1->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    int ne01 = src0->ne[1];
    cl_ulong nb00 = src0->nb[0];
    cl_ulong nb01 = src0->nb[1];
    cl_ulong nb02 = src0->nb[2];

    int ne10 = src1->ne[0];
    cl_ulong nb11 = src1->nb[1];

    int ne1  = dst->ne[1];
    int ne2  = dst->ne[2];
    cl_ulong nb0 = dst->nb[0];
    cl_ulong nb1 = dst->nb[1];
    cl_ulong nb2 = dst->nb[2];

    cl_kernel kernel = backend_ctx->kernel_ssm_conv_f32_f32;

    if (ne10 % 4 == 0) {
        kernel = backend_ctx->kernel_ssm_conv_f32_f32_4;
    }

    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_ulong), &nb00));
    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &nb01));
    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb02));
    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne10));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb11));
    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb0));
    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb1));
    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb2));

    size_t global_work_size[] = {(size_t)ne01, (size_t)ne1, (size_t)ne2};
    size_t local_work_size[]  = {64, 1, 1};

    size_t * local_work_size_ptr = local_work_size;
    if (ne01 % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
        local_work_size_ptr = nullptr;
    }

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
}

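// Element-wise GELU. Like the other unary ops below, a float4-vectorized
// kernel variant is dispatched when the element count is a multiple of 4,
// quartering the number of work-items.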
static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    UNUSED(src1);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    cl_kernel kernel;

    int n = ggml_nelements(dst);

    if (n % 4 == 0) {
        kernel = backend_ctx->kernel_gelu_4;
        n /= 4;
    } else {
        kernel = backend_ctx->kernel_gelu;
    }

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));

    size_t global_work_size[] = {(size_t)n, 1, 1};
    size_t local_work_size[] = {64, 1, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}

static void ggml_cl_gelu_erf(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    UNUSED(src1);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    cl_kernel kernel;

    int n = ggml_nelements(dst);

    if (n % 4 == 0) {
        kernel = backend_ctx->kernel_gelu_erf_4;
        n /= 4;
    } else {
        kernel = backend_ctx->kernel_gelu_erf;
    }

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));

    size_t global_work_size[] = {(size_t)n, 1, 1};
    size_t local_work_size[] = {64, 1, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}

static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    UNUSED(src1);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    cl_kernel kernel;

    int n = ggml_nelements(dst);

    if (n % 4 == 0) {
        kernel = backend_ctx->kernel_gelu_quick_4;
        n /= 4;
    } else {
        kernel = backend_ctx->kernel_gelu_quick;
    }

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));

    size_t global_work_size[] = {(size_t)n, 1, 1};
    size_t local_work_size[] = {64, 1, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}

static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    UNUSED(src1);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    cl_kernel kernel;

    int n = ggml_nelements(dst);

    if (n % 4 == 0) {
        kernel = backend_ctx->kernel_silu_4;
        n /= 4;
    } else {
        kernel = backend_ctx->kernel_silu;
    }

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));

    size_t global_work_size[] = {(size_t)n, 1, 1};
    size_t local_work_size[] = {64, 1, 1};

    size_t * local_work_size_ptr = local_work_size;
    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
        local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
    }

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
}

static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    UNUSED(src1);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    cl_kernel kernel = backend_ctx->kernel_relu;

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));

    const int64_t n = ggml_nelements(dst);

    size_t global_work_size[] = {(size_t)n, 1, 1};
    size_t local_work_size[] = {64, 1, 1};

    size_t * local_work_size_ptr = local_work_size;
    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
        local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
    }

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
}

static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    UNUSED(src1);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    cl_kernel kernel;
    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
        kernel = backend_ctx->kernel_sigmoid_f32;
    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
        kernel = backend_ctx->kernel_sigmoid_f16;
    } else {
        GGML_ASSERT(false && "Unsupported data types for sigmoid (input and output must be both f32 or f16)");
    }

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));

    const int64_t n = ggml_nelements(dst);

    size_t global_work_size[] = {(size_t)n, 1, 1};
    size_t local_work_size[] = {64, 1, 1};

    size_t * local_work_size_ptr = local_work_size;
    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
        local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
    }

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
}

static void ggml_cl_tri(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    UNUSED(src1);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    const int tri_type = ggml_get_op_params_i32(dst, 0); // selects which triangular part the kernel keeps
    const int64_t n = ggml_nelements(dst);
    const int     n_i32 = (int) n; // the kernel expects a 32-bit element count
    const int     ne0  = dst->ne[0];
    const int     ne1  = dst->ne[1];

    cl_kernel kernel = backend_ctx->kernel_tri;

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),      &n_i32));
    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int),      &ne0));
    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &ne1));
    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int),      &tri_type));

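    // Round the global size up to a multiple of the work-group size, e.g.
    // n = 1000 launches 1024 work-items; the kernel is assumed to bounds-check
    // its global id against n.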
    size_t local_work_size[1] = { 256 };
    size_t global_work_size[1] = { ((size_t)n + local_work_size[0] - 1) / local_work_size[0] * local_work_size[0] };

    backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, dst);
}

static void ggml_cl_fill(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    UNUSED(src0);
    UNUSED(src1);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

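    // The fill value is stored in op_params as the raw bit pattern of a float.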
    float v = 0.0f;
    memcpy(&v, ((int32_t *) dst->op_params), sizeof(float));

    const int64_t n = ggml_nelements(dst);
    const int     n_i32 = (int) n; // the kernel expects a 32-bit element count

    cl_kernel kernel = backend_ctx->kernel_fill;

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(float),    &v));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int),      &n_i32));

    size_t local_work_size[1] = { 256 };
    size_t global_work_size[1] = { ((size_t)n + local_work_size[0] - 1) / local_work_size[0] * local_work_size[0] };

    backend_ctx->enqueue_ndrange_kernel(kernel, 1, global_work_size, local_work_size, dst);
}

static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    UNUSED(src1);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

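    // The clamp bounds are packed as two consecutive floats in op_params.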
    float min;
    float max;
    memcpy(&min, ((int32_t *) dst->op_params) + 0, sizeof(float));
    memcpy(&max, ((int32_t *) dst->op_params) + 1, sizeof(float));

    cl_kernel kernel = backend_ctx->kernel_clamp;

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float),    &min));
    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(float),    &max));

    const int64_t n = ggml_nelements(dst);

    size_t global_work_size[] = {(size_t)n, 1, 1};
    size_t local_work_size[] = {64, 1, 1};

    size_t * local_work_size_ptr = local_work_size;
    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
        local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
    }

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
}

static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    UNUSED(src1);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    float eps;
    memcpy(&eps, dst->op_params, sizeof(float));

    const int ne00 = src0 ? src0->ne[0] : 0;
    const int ne01 = src0 ? src0->ne[1] : 0;
    const int ne02 = src0 ? src0->ne[2] : 0;
    const int ne03 = src0 ? src0->ne[3] : 0;

    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
    const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
    const cl_ulong nb03 = src0 ? src0->nb[3] : 0;

    const int nth = MIN(64, ne00);

    cl_kernel kernel = backend_ctx->kernel_norm;

    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),    &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong),  &offset0));
    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),    &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong),  &offsetd));
    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(int),       &ne00));
    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(int),       &ne01));
    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),       &ne02));
    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),       &ne03));
    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong),  &nb01));
    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong),  &nb02));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),  &nb03));
    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float),     &eps));
    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth, NULL));

    size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
    size_t local_work_size[] = {(size_t)nth, 1, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}

static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    UNUSED(src1);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    //ggml_backend_opencl_device_context * dev_ctx =
    //    (ggml_backend_opencl_device_context *)backend->device->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    float eps;
    memcpy(&eps, dst->op_params, sizeof(float));

    const int ne00 = src0 ? src0->ne[0] : 0;
    const int ne01 = src0 ? src0->ne[1] : 0;
    const int ne02 = src0 ? src0->ne[2] : 0;
    const int ne03 = src0 ? src0->ne[3] : 0;

    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
    const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
    const cl_ulong nb03 = src0 ? src0->nb[3] : 0;

    GGML_ASSERT(ne00 % 4 == 0);

    const int nth = MIN(64, ne00);

    size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
    size_t local_work_size[] = {(size_t)nth, 1, 1};

    cl_kernel kernel = backend_ctx->kernel_rms_norm;

    // Note, this kernel declares local memory in kernel args and the size
    // depends on subgroup size.
    // Note, this requires OpenCL 2.1 and above
    // For now we use fixed subgroup size to simplify support for OpenCL 2.0.
    size_t sgs;
    //CL_CHECK(clGetKernelSubGroupInfo(kernel, dev_ctx->device,
    //    CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
    //    sizeof(local_work_size), local_work_size,
    //    sizeof(size_t), &sgs, NULL));
    if (backend_ctx->gpu_family == ADRENO) {
        sgs = 64;
    } else if (backend_ctx->gpu_family == INTEL) {
        sgs = 32;
    } else {
        GGML_ASSERT(false && "Unsupported GPU");
    }

    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),    &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong),  &offset0));
    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),    &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong),  &offsetd));
    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(int),       &ne00));
    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(int),       &ne01));
    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),       &ne02));
    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),       &ne03));
    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong),  &nb01));
    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong),  &nb02));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),  &nb03));
    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float),     &eps));
    // This is local memory - the size depends on subgroup size.
    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs,  NULL));

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}

static void ggml_opencl_op_rms_norm_fused(ggml_backend_t backend, ggml_tensor * rms_norm_tensor, ggml_tensor * mul_tensor) {
    GGML_ASSERT(mul_tensor);
    GGML_ASSERT(rms_norm_tensor);

    // src0 is the src of rms_norm, src1 is the other src of mul (one being rms_norm)
    const ggml_tensor * src0 = rms_norm_tensor->src[0];
    const ggml_tensor * src1;
    if (mul_tensor->src[0] == rms_norm_tensor) {
        src1 = mul_tensor->src[1];
    } else if (mul_tensor->src[1] == rms_norm_tensor) {
        src1 = mul_tensor->src[0];
    } else {
        GGML_ASSERT(false && "Invalid args for rms_norm and mul");
    }
    const ggml_tensor * dst = mul_tensor;

    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(src1);
    GGML_ASSERT(src1->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offset1 = extra1->offset + src1->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    float eps;
    memcpy(&eps, rms_norm_tensor->op_params, sizeof(float));

    const int ne00 = src0->ne[0];
    const int ne01 = src0->ne[1];
    const int ne02 = src0->ne[2];
    const int ne03 = src0->ne[3];

    const cl_ulong nb01 = src0->nb[1];
    const cl_ulong nb02 = src0->nb[2];
    const cl_ulong nb03 = src0->nb[3];

    const int ne10 = src1->ne[0];
    const int ne11 = src1->ne[1];
    const int ne12 = src1->ne[2];
    const int ne13 = src1->ne[3];

    const cl_ulong nb11 = src1->nb[1];
    const cl_ulong nb12 = src1->nb[2];
    const cl_ulong nb13 = src1->nb[3];

    const cl_ulong nb1 = dst->nb[1];
    const cl_ulong nb2 = dst->nb[2];
    const cl_ulong nb3 = dst->nb[3];

    GGML_ASSERT(ne00 % 4 == 0);

    size_t sgs;
    if (backend_ctx->gpu_family == ADRENO) {
        sgs = 64;
    } else if (backend_ctx->gpu_family == INTEL) {
        sgs = 32;
    } else {
        GGML_ASSERT(false && "Unsupported GPU");
    }

    cl_kernel kernel = backend_ctx->kernel_rms_norm_mul;

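    // Pick the work-group size as a power-of-two multiple of the subgroup
    // size: double it while it still fits the row length (ne00) and the
    // kernel's maximum work-group size, e.g. ne00 = 4096 with sgs = 64 and a
    // 256 limit gives nth = 256.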
    int nth = sgs;
    int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
    while (nth < ne00 && nth < max_workgroup_size) {
        nth *= 2;
    }
    nth = MIN(nth, max_workgroup_size);
    nth = MIN(nth, ne00);

    size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
    size_t local_work_size[] = {(size_t)nth, 1, 1};

    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),        &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong),      &offset0));
    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),        &extra1->data_device));
    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong),      &offset1));
    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),        &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong),      &offsetd));
    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),           &ne00));
    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),           &ne01));
    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),           &ne02));
    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),           &ne03));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),      &nb01));
    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),      &nb02));
    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong),      &nb03));
    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),           &ne10));
    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),           &ne11));
    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),           &ne12));
    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),           &ne13));
    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),      &nb11));
    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),      &nb12));
    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),      &nb13));
    CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong),      &nb1));
    CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong),      &nb2));
    CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong),      &nb3));
    CL_CHECK(clSetKernelArg(kernel, 23, sizeof(float),         &eps));
    CL_CHECK(clSetKernelArg(kernel, 24, sizeof(float)*sgs,     NULL));

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}

static void ggml_opencl_op_norm_fused(ggml_backend_t backend, ggml_tensor * norm_tensor, ggml_tensor * mul_tensor, ggml_tensor * add_tensor) {
    GGML_ASSERT(norm_tensor && mul_tensor && add_tensor);

    const ggml_tensor * src0 = norm_tensor->src[0];
    const ggml_tensor * src1 = mul_tensor->src[0] == norm_tensor ? mul_tensor->src[1] : mul_tensor->src[0];
    const ggml_tensor * src2 = add_tensor->src[0] == mul_tensor ? add_tensor->src[1] : add_tensor->src[0];
    const ggml_tensor * dst = add_tensor;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
    ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offset1 = extra1->offset + src1->view_offs;
    cl_ulong offset2 = extra2->offset + src2->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    float eps;
    memcpy(&eps, norm_tensor->op_params, sizeof(float));

    const int ne00 = src0->ne[0], ne01 = src0->ne[1], ne02 = src0->ne[2], ne03 = src0->ne[3];
    const cl_ulong nb01 = src0->nb[1], nb02 = src0->nb[2], nb03 = src0->nb[3];
    const int ne10 = src1->ne[0], ne11 = src1->ne[1], ne12 = src1->ne[2], ne13 = src1->ne[3];
    const cl_ulong nb11 = src1->nb[1], nb12 = src1->nb[2], nb13 = src1->nb[3];
    const int ne20 = src2->ne[0], ne21 = src2->ne[1], ne22 = src2->ne[2], ne23 = src2->ne[3];
    const cl_ulong nb21 = src2->nb[1], nb22 = src2->nb[2], nb23 = src2->nb[3];
    const cl_ulong nbd1 = dst->nb[1], nbd2 = dst->nb[2], nbd3 = dst->nb[3];

    size_t sgs;
    if (backend_ctx->gpu_family == ADRENO) sgs = 64;
    else if (backend_ctx->gpu_family == INTEL) sgs = 32;
    else GGML_ASSERT(false && "Unsupported GPU");

    cl_kernel kernel = backend_ctx->kernel_norm_mul_add;

    int nth = sgs;
    int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
    while (nth < ne00/4 && nth < max_workgroup_size) nth *= 2;
    nth = MIN(nth, max_workgroup_size);
    nth = MIN(nth, ne00/4);

    size_t gws[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
    size_t lws[] = {(size_t)nth, 1, 1};
    size_t num_subgroups = (nth + sgs - 1) / sgs;

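    // Arg 33 below reserves one cl_float2 of local memory per subgroup,
    // presumably holding each subgroup's partial (sum, sum of squares) for the
    // normalization reduction.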
    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne03));
    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb01));
    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb02));
    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb03));
    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne10));
    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne11));
    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne12));
    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne13));
    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11));
    CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12));
    CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13));
    CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne20));
    CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne21));
    CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne22));
    CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne23));
    CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb21));
    CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb22));
    CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb23));
    CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nbd1));
    CL_CHECK(clSetKernelArg(kernel, 30, sizeof(cl_ulong), &nbd2));
    CL_CHECK(clSetKernelArg(kernel, 31, sizeof(cl_ulong), &nbd3));
    CL_CHECK(clSetKernelArg(kernel, 32, sizeof(float), &eps));
    CL_CHECK(clSetKernelArg(kernel, 33, sizeof(cl_float2) * num_subgroups, NULL));

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, gws, lws, dst);
}

static void ggml_opencl_op_group_norm_fused(ggml_backend_t backend, ggml_tensor * gn_tensor, ggml_tensor * mul_tensor, ggml_tensor * add_tensor) {
    GGML_ASSERT(gn_tensor && mul_tensor && add_tensor);

    const ggml_tensor * src0 = gn_tensor->src[0];
    const ggml_tensor * src1 = mul_tensor->src[0] == gn_tensor ? mul_tensor->src[1] : mul_tensor->src[0];
    const ggml_tensor * src2 = add_tensor->src[0] == mul_tensor ? add_tensor->src[1] : add_tensor->src[0];
    const ggml_tensor * dst = add_tensor;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
    ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offset1 = extra1->offset + src1->view_offs;
    cl_ulong offset2 = extra2->offset + src2->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    int groups;
    float eps;
    memcpy(&groups, gn_tensor->op_params, sizeof(int));
    memcpy(&eps, (char *)gn_tensor->op_params + sizeof(int), sizeof(float));

    cl_kernel kernel = backend_ctx->kernel_group_norm_mul_add;
    int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
    int ne = ggml_nelements(src0);
    int group_size = ne / groups;

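    // One work-group per group; the work-group size is the group size, capped
    // by the kernel's maximum work-group size.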
    size_t lws[] = { (size_t)MIN(max_workgroup_size, group_size) };
    size_t gws[] = { (size_t)groups * lws[0] };

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne));
    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &group_size));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(float), &eps));

    backend_ctx->enqueue_ndrange_kernel(kernel, 1, gws, lws, dst);
}

static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    UNUSED(src1);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

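    // op_params layout: [0] = number of groups (int32), [1] = eps (float).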
    int32_t n_groups   = ((const int32_t *) dst->op_params)[0];
    int32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + n_groups - 1) / n_groups);
    float   eps        = ((const float *) dst->op_params)[1];

    const int ne00 = src0->ne[0];
    const int ne01 = src0->ne[1];
    const int ne02 = src0->ne[2];
    const int ne = ne00*ne01*ne02;

    cl_kernel kernel = backend_ctx->kernel_group_norm;

    size_t sgs = 64;
    if (backend_ctx->gpu_family == ADRENO) {
        sgs = 64;
    } else if (backend_ctx->gpu_family == INTEL) {
        sgs = 32;
    } else {
        GGML_ASSERT(false && "Unsupported GPU");
    }

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),      &ne));
    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int),      &group_size));
    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float),    &eps));

    size_t global_work_size[] = {(size_t)n_groups*sgs, 1, 1};
    size_t local_work_size[] = {(size_t)sgs, 1, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}

static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    UNUSED(src1);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    const int ne00 = src0->ne[0];
    const int ne01 = src0->ne[1];
    const int ne02 = src0->ne[2];
    const int ne03 = src0->ne[3];

    const cl_ulong nb00 = src0->nb[0];
    const cl_ulong nb01 = src0->nb[1];
    const cl_ulong nb02 = src0->nb[2];
    const cl_ulong nb03 = src0->nb[3];

    const cl_ulong nb0  = dst->nb[0];
    const cl_ulong nb1  = dst->nb[1];
    const cl_ulong nb2  = dst->nb[2];
    const cl_ulong nb3  = dst->nb[3];

    cl_kernel kernel;

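    // Contiguous inputs take a flat kernel (float4-vectorized when the element
    // count allows); strided inputs fall back to a variant that walks the nb*
    // strides.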
    if (ggml_is_contiguous(src0)) {
        // Handle contiguous input
        int n = ggml_nelements(dst);
        if (n % 4 == 0) {
            if (src0->type == GGML_TYPE_F32) {
                kernel = backend_ctx->kernel_tanh_f32_4;
            } else {
                kernel = backend_ctx->kernel_tanh_f16_4;
            }
            n /= 4;
        } else {
            if (src0->type == GGML_TYPE_F32) {
                kernel = backend_ctx->kernel_tanh_f32;
            } else {
                kernel = backend_ctx->kernel_tanh_f16;
            }
        }

        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));

        size_t global_work_size[] = {(size_t)n, 1, 1};
        size_t local_work_size[] = {64, 1, 1};

        size_t * local_work_size_ptr = local_work_size;
        if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
            local_work_size_ptr = nullptr;
        }

        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
    } else {
        // Handle non-contiguous input
        if (src0->type == GGML_TYPE_F32) {
            kernel = backend_ctx->kernel_tanh_f32_nc;
        } else {
            kernel = backend_ctx->kernel_tanh_f16_nc;
        }

        CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
        CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
        CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extrad->data_device));
        CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offsetd));
        CL_CHECK(clSetKernelArg(kernel,  4, sizeof(int),      &ne00));
        CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &nb00));
        CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_ulong), &nb01));
        CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &nb02));
        CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb03));
        CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb0));
        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb1));
        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb2));
        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb3));

        int nth = 64;

        size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
        size_t local_work_size[] = {(size_t)nth, 1, 1};

        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
    }
}

static void ggml_cl_expm1(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    UNUSED(src1);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0_abs = extra0->offset + src0->view_offs;
    cl_ulong offsetd_abs = extrad->offset + dst->view_offs;

    cl_kernel kernel;
    if (dst->type == GGML_TYPE_F32) {
        kernel = backend_ctx->kernel_expm1_f32_nd;
    } else if (dst->type == GGML_TYPE_F16) {
        kernel = backend_ctx->kernel_expm1_f16_nd;
    } else {
        GGML_ASSERT(false && "Unsupported type for ggml_cl_expm1");
    }
    GGML_ASSERT(kernel != nullptr);

    const int ne00 = src0->ne[0];
    const int ne01 = src0->ne[1];
    const int ne02 = src0->ne[2];
    const int ne03 = src0->ne[3];

    const cl_ulong nb00 = src0->nb[0];
    const cl_ulong nb01 = src0->nb[1];
    const cl_ulong nb02 = src0->nb[2];
    const cl_ulong nb03 = src0->nb[3];

    const int ne10 = dst->ne[0];
    const int ne11 = dst->ne[1];
    const int ne12 = dst->ne[2];
    const int ne13 = dst->ne[3];

    const cl_ulong nb10 = dst->nb[0];
    const cl_ulong nb11 = dst->nb[1];
    const cl_ulong nb12 = dst->nb[2];
    const cl_ulong nb13 = dst->nb[3];

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs));

    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),      &ne00));
    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int),      &ne01));
    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &ne02));
    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int),      &ne03));
    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));

    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),     &ne10));
    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),     &ne11));
    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),     &ne12));
    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),     &ne13));
    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10));
    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11));
    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12));
    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13));

    size_t global_work_size[3];
    if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
        return;
    }
    global_work_size[0] = (size_t)ne10;
    global_work_size[1] = (size_t)ne11;
    global_work_size[2] = (size_t)ne12;

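    // Local-size heuristic: start from 16x4x1, shrink to fit small dimensions,
    // then halve until the total is at most 256 work-items (assumed to be
    // within the device's maximum work-group size).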
    size_t lws0 = 16, lws1 = 4, lws2 = 1;
    if (ne10 < 16) lws0 = ne10;
    if (ne11 < 4) lws1 = ne11;
    if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1;

    while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
    while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
    while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;

    size_t local_work_size[] = {lws0, lws1, lws2};

    size_t* local_work_size_ptr = local_work_size;
    if (!backend_ctx->non_uniform_workgroups) {
        if (global_work_size[0] % local_work_size[0] != 0 ||
            global_work_size[1] % local_work_size[1] != 0 ||
            global_work_size[2] % local_work_size[2] != 0) {
            local_work_size_ptr = NULL;
        }
    }
    if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
}

static void ggml_cl_softplus(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    UNUSED(src1);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0_abs = extra0->offset + src0->view_offs;
    cl_ulong offsetd_abs = extrad->offset + dst->view_offs;

    cl_kernel kernel;
    if (dst->type == GGML_TYPE_F32) {
        kernel = backend_ctx->kernel_softplus_f32_nd;
    } else if (dst->type == GGML_TYPE_F16) {
        kernel = backend_ctx->kernel_softplus_f16_nd;
    } else {
        GGML_ASSERT(false && "Unsupported type for ggml_cl_softplus");
    }
    GGML_ASSERT(kernel != nullptr);

    const int ne00 = src0->ne[0];
    const int ne01 = src0->ne[1];
    const int ne02 = src0->ne[2];
    const int ne03 = src0->ne[3];

    const cl_ulong nb00 = src0->nb[0];
    const cl_ulong nb01 = src0->nb[1];
    const cl_ulong nb02 = src0->nb[2];
    const cl_ulong nb03 = src0->nb[3];

    const int ne10 = dst->ne[0];
    const int ne11 = dst->ne[1];
    const int ne12 = dst->ne[2];
    const int ne13 = dst->ne[3];

    const cl_ulong nb10 = dst->nb[0];
    const cl_ulong nb11 = dst->nb[1];
    const cl_ulong nb12 = dst->nb[2];
    const cl_ulong nb13 = dst->nb[3];

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs));

    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),      &ne00));
    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int),      &ne01));
    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &ne02));
    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int),      &ne03));
    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));

    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),     &ne10));
    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),     &ne11));
    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),     &ne12));
    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),     &ne13));
    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10));
    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11));
    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12));
    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13));

    size_t global_work_size[3];
    if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
        return;
    }
    global_work_size[0] = (size_t)ne10;
    global_work_size[1] = (size_t)ne11;
    global_work_size[2] = (size_t)ne12;

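    // Same local-size heuristic as in ggml_cl_expm1 above.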
    size_t lws0 = 16, lws1 = 4, lws2 = 1;
    if (ne10 < 16) lws0 = ne10;
    if (ne11 < 4) lws1 = ne11;
    if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1;

    while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
    while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
    while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;

    size_t local_work_size[] = {lws0, lws1, lws2};

    size_t* local_work_size_ptr = local_work_size;
    if (!backend_ctx->non_uniform_workgroups) {
        if (global_work_size[0] % local_work_size[0] != 0 ||
            global_work_size[1] % local_work_size[1] != 0 ||
            global_work_size[2] % local_work_size[2] != 0) {
            local_work_size_ptr = NULL;
        }
    }
    if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
}
 7335
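// Broadcast src0 into dst (GGML_OP_REPEAT). src1_shape_def only conveys the
// target shape, which is already baked into dst, so it is unused here; one
// 64-lane work-group is launched per dst row.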
static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);
    GGML_ASSERT(dst->type == src0->type);

    UNUSED(src1_shape_def);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    const int ne00 = src0->ne[0];
    const int ne01 = src0->ne[1];
    const int ne02 = src0->ne[2];
    const int ne03 = src0->ne[3];

    const cl_ulong nb00 = src0->nb[0];
    const cl_ulong nb01 = src0->nb[1];
    const cl_ulong nb02 = src0->nb[2];
    const cl_ulong nb03 = src0->nb[3];

    const int ne0 = dst->ne[0];
    const int ne1 = dst->ne[1];
    const int ne2 = dst->ne[2];
    const int ne3 = dst->ne[3];

    const cl_ulong nb0 = dst->nb[0];
    const cl_ulong nb1 = dst->nb[1];
    const cl_ulong nb2 = dst->nb[2];
    const cl_ulong nb3 = dst->nb[3];

    cl_kernel kernel = backend_ctx->kernel_repeat_f32;

    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(int),      &ne00));
    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(int),      &ne01));
    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne02));
    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne03));
    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb00));
    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb01));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne0));
    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb0));
    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb1));
    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb2));
    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb3));

    int nth = 64;

    size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
    size_t local_work_size[] = {(size_t)nth, 1, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}

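// Zero-pad src0 into dst (GGML_OP_PAD). Padding amounts come from
// dst->op_params (layout noted below); one work-item per dst element.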
static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    if (backend_ctx->kernel_pad == nullptr) {
        GGML_LOG_WARN("%s: pad kernel not available, skipping OpenCL execution.\n", __func__);
        return;
    }

    ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extra_dst  = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
    cl_ulong off_dst  = extra_dst->offset  + dst->view_offs;

    const int s_ne0 = src0->ne[0];
    const int s_ne1 = src0->ne[1];
    const int s_ne2 = src0->ne[2];
    const int s_ne3 = src0->ne[3];

    // byte strides; declared cl_ulong to match the sizeof(cl_ulong) kernel args below
    const cl_ulong s_nb0 = src0->nb[0];
    const cl_ulong s_nb1 = src0->nb[1];
    const cl_ulong s_nb2 = src0->nb[2];
    const cl_ulong s_nb3 = src0->nb[3];

    const int d_ne0 = dst->ne[0];
    const int d_ne1 = dst->ne[1];
    const int d_ne2 = dst->ne[2];
    const int d_ne3 = dst->ne[3];

    const cl_ulong d_nb0 = dst->nb[0];
    const cl_ulong d_nb1 = dst->nb[1];
    const cl_ulong d_nb2 = dst->nb[2];
    const cl_ulong d_nb3 = dst->nb[3];

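    // op_params layout for PAD: {lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3},
    // i.e. left/right padding for each of the four dimensions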
    const int lp0 = ((const int*)(dst->op_params))[0];
    const int rp0 = ((const int*)(dst->op_params))[1];
    const int lp1 = ((const int*)(dst->op_params))[2];
    const int rp1 = ((const int*)(dst->op_params))[3];
    const int lp2 = ((const int*)(dst->op_params))[4];
    const int rp2 = ((const int*)(dst->op_params))[5];
    const int lp3 = ((const int*)(dst->op_params))[6];
    const int rp3 = ((const int*)(dst->op_params))[7];

    cl_kernel kernel = backend_ctx->kernel_pad;

    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),    &extra_src0->data_device));
    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong),  &off_src0));
    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),    &extra_dst->data_device));
    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong),  &off_dst));
    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(int),       &s_ne0));
    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(int),       &s_ne1));
    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),       &s_ne2));
    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),       &s_ne3));
    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong),  &s_nb0));
    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong),  &s_nb1));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),  &s_nb2));
    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),  &s_nb3));
    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),       &d_ne0));
    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),       &d_ne1));
    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),       &d_ne2));
    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),       &d_ne3));
    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),  &d_nb0));
    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),  &d_nb1));
    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),  &d_nb2));
    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),  &d_nb3));
    CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int),       &lp0));
    CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int),       &rp0));
    CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int),       &lp1));
    CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int),       &rp1));
    CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int),       &lp2));
    CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int),       &rp2));
    CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int),       &lp3));
    CL_CHECK(clSetKernelArg(kernel, 27, sizeof(int),       &rp3));

    size_t lws0 = 64;
    size_t gws0 = (( (size_t)d_ne0 + lws0 - 1 ) / lws0) * lws0;

    size_t global_work_size[] = { gws0, (size_t)d_ne1, (size_t)d_ne2*d_ne3 };
    size_t local_work_size[]  = { lws0, 1, 1 };

    size_t * local_work_size_ptr = local_work_size;
    if (d_ne0 % lws0 != 0 && !backend_ctx->non_uniform_workgroups) {
        local_work_size_ptr = nullptr;
    }

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
}

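// Upscale (interpolate) src0 into dst. The low byte of op_params[0] selects
// the scale mode (nearest or bilinear); the remaining bits carry flags such
// as GGML_SCALE_FLAG_ALIGN_CORNERS.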
static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    const int mode_flags       = ggml_get_op_params_i32(dst, 0);
    const ggml_scale_mode mode = (ggml_scale_mode) (mode_flags & 0xFF);
    cl_kernel kernel = nullptr;

    if (mode == GGML_SCALE_MODE_NEAREST) {
        kernel = backend_ctx->kernel_upscale;
        if (kernel == nullptr) {
            GGML_LOG_WARN("%s: nearest upscale kernel not available, skipping OpenCL execution.\n", __func__);
            return;
        }
    } else if (mode == GGML_SCALE_MODE_BILINEAR) {
        kernel = backend_ctx->kernel_upscale_bilinear;
        if (kernel == nullptr) {
            GGML_LOG_WARN("%s: bilinear upscale kernel not available, skipping OpenCL execution.\n", __func__);
            return;
        }
    } else {
        GGML_LOG_WARN("%s: unsupported upscale mode %d, skipping OpenCL execution.\n", __func__, mode);
        return;
    }

    ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extra_dst  = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
    cl_ulong off_dst  = extra_dst->offset  + dst->view_offs;

    const cl_ulong nb00 = src0->nb[0];
    const cl_ulong nb01 = src0->nb[1];
    const cl_ulong nb02 = src0->nb[2];
    const cl_ulong nb03 = src0->nb[3];

    const int ne00 = src0->ne[0];
    const int ne01 = src0->ne[1];
    const int ne02 = src0->ne[2];
    const int ne03 = src0->ne[3];

    const int ne0 = dst->ne[0];
    const int ne1 = dst->ne[1];
    const int ne2 = dst->ne[2];
    const int ne3 = dst->ne[3];

    float sf0 = (float)ne0 / ne00;
    float sf1 = (float)ne1 / ne01;
    float sf2 = (float)ne2 / ne02;
    float sf3 = (float)ne3 / ne03;

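    // 0.5f samples at half-pixel centers; the align-corners path below resets
    // this to 0.0f and rescales sf0/sf1 accordingly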
    float pixel_offset = 0.5f;

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),    &extra_src0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong),  &off_src0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),    &extra_dst->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong),  &off_dst));
    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong),  &nb00));
    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong),  &nb01));
    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong),  &nb02));
    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong),  &nb03));

    if (mode == GGML_SCALE_MODE_NEAREST) {
        CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int),       &ne0));
        CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int),       &ne1));
        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne2));
        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne3));
        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float),    &sf0));
        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(float),    &sf1));
        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float),    &sf2));
        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float),    &sf3));
    } else if (mode == GGML_SCALE_MODE_BILINEAR) {
        if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
            sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0;
            sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1;
            pixel_offset = 0.0f;
        }

        CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int),       &ne00));
        CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int),       &ne01));
        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne0));
        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne1));
        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne2));
        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne3));
        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float),    &sf0));
        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float),    &sf1));
        CL_CHECK(clSetKernelArg(kernel, 16, sizeof(float),    &sf2));
        CL_CHECK(clSetKernelArg(kernel, 17, sizeof(float),    &sf3));
        CL_CHECK(clSetKernelArg(kernel, 18, sizeof(float),    &pixel_offset));
    }

    size_t dst_total_elements = (size_t)ne0 * ne1 * ne2 * ne3;
    if (dst_total_elements == 0) {
        return;
    }
    size_t global_work_size[] = { dst_total_elements, 1, 1 };
    size_t local_work_size_pref = 256;
    size_t local_work_size[] = { MIN(local_work_size_pref, dst_total_elements), 1, 1};

    size_t * local_work_size_ptr = local_work_size;
    if (dst_total_elements % local_work_size[0] != 0 && !backend_ctx->non_uniform_workgroups) {
        local_work_size_ptr = nullptr;
    }

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
}

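// Concatenate src0 and src1 into dst along the dimension given in
// op_params[0]; all three tensors must be F32.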
static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(src1);
    GGML_ASSERT(src1->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offset1 = extra1->offset + src1->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    const int ne00 = src0->ne[0];
    const int ne01 = src0->ne[1];
    const int ne02 = src0->ne[2];
    const int ne03 = src0->ne[3];

    const cl_ulong nb00 = src0->nb[0];
    const cl_ulong nb01 = src0->nb[1];
    const cl_ulong nb02 = src0->nb[2];
    const cl_ulong nb03 = src0->nb[3];

    const cl_ulong nb10 = src1->nb[0];
    const cl_ulong nb11 = src1->nb[1];
    const cl_ulong nb12 = src1->nb[2];
    const cl_ulong nb13 = src1->nb[3];

    const int ne0 = dst->ne[0];
    const int ne1 = dst->ne[1];
    const int ne2 = dst->ne[2];
    const int ne3 = dst->ne[3];

    const cl_ulong nb0 = dst->nb[0];
    const cl_ulong nb1 = dst->nb[1];
    const cl_ulong nb2 = dst->nb[2];
    const cl_ulong nb3 = dst->nb[3];

    const cl_int dim = ((const int32_t *) dst->op_params)[0];
    GGML_ASSERT(dim >= 0 && dim <= 3);

    int nth = MIN(64, ne0);

    cl_kernel kernel = backend_ctx->kernel_concat_f32;

    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne03));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &ne0));
    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
    CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
    CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
    CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
    CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_int),   &dim));

    size_t global_work_size[] = {(size_t)ne1*nth, (size_t)ne2, (size_t)ne3};
    size_t local_work_size[] = {(size_t)nth, 1, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}

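// Timestep embedding: for each value in src0 (the timesteps), writes a row
// of sinusoidal features into dst. op_params = {dim, max_period}.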
static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    if (backend_ctx->kernel_timestep_embedding == nullptr) {
        GGML_LOG_WARN("%s: timestep_embedding kernel not available, skipping OpenCL execution.\n", __func__);
        return;
    }

    ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extra_dst  = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
    cl_ulong off_dst  = extra_dst->offset  + dst->view_offs;

    const int logical_dim = dst->op_params[0];
    const int max_period  = dst->op_params[1];
    const int dst_nb1_bytes = dst->nb[1];

    cl_kernel kernel = backend_ctx->kernel_timestep_embedding;

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),    &extra_src0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong),  &off_src0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),    &extra_dst->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong),  &off_dst));
    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),       &dst_nb1_bytes));
    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int),       &logical_dim));
    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),       &max_period));

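    // one work-item per cos/sin frequency pair: ceil(logical_dim / 2) pairs
    // plus one spare slot that covers the trailing padding column of odd dims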
    size_t gws0 = (size_t)(((logical_dim + 1) / 2) + 1);

    size_t gws1 = (size_t)src0->ne[0];

    size_t global_work_size[] = {gws0, gws1, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
}

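// Flash attention. v, mask and sinks come from dst->src[2..4]; the kernel is
// picked by precision (f32, f16, or mixed f32 q with f16 k/v) and by the
// {d_head_q, d_head_v} pair, with a dedicated single-query (n_q == 1)
// variant for decode.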
static void ggml_cl_flash_attn(ggml_backend_t backend, const ggml_tensor * q, const ggml_tensor * k, ggml_tensor * dst) {
    const ggml_tensor * v = dst->src[2];
    const ggml_tensor * mask = dst->src[3];
    const ggml_tensor * sinks = dst->src[4];
    GGML_ASSERT(q->extra);
    GGML_ASSERT(k->extra);
    GGML_ASSERT(v->extra);
    GGML_ASSERT(dst->extra);
    if (mask) {
        GGML_ASSERT(mask->extra);
    }
    if (sinks) {
        GGML_ASSERT(sinks->extra);
    }

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    const int n_q = q->ne[1];
    const int n_kv = k->ne[1];
    const int d_head_q = q->ne[0];
    const int d_head_v = v->ne[0];
    const int n_head = q->ne[2];
    const int n_head_kv = k->ne[2];
    const int n_batch = q->ne[3];

    cl_kernel kernel = NULL;

    const bool is_f16 = q->type == GGML_TYPE_F16;
    const bool is_mixed = q->type == GGML_TYPE_F32 && k->type == GGML_TYPE_F16;
    const std::pair<int, int> dk_dv = {d_head_q, d_head_v};

    if (n_q == 1) {
        if (is_mixed) {
            kernel = backend_ctx->kernels_flash_attn_f32_f16_q1.at(dk_dv);
        } else if (is_f16) {
            kernel = backend_ctx->kernels_flash_attn_f16_q1.at(dk_dv);
        } else {
            kernel = backend_ctx->kernels_flash_attn_f32_q1.at(dk_dv);
        }
    } else {
        if (is_mixed) {
            kernel = backend_ctx->kernels_flash_attn_f32_f16.at(dk_dv);
        } else if (is_f16) {
            kernel = backend_ctx->kernels_flash_attn_f16.at(dk_dv);
        } else {
            kernel = backend_ctx->kernels_flash_attn_f32.at(dk_dv);
        }
    }
    GGML_ASSERT(kernel != NULL);

    ggml_tensor_extra_cl * extra_q = (ggml_tensor_extra_cl *)q->extra;
    ggml_tensor_extra_cl * extra_k = (ggml_tensor_extra_cl *)k->extra;
    ggml_tensor_extra_cl * extra_v = (ggml_tensor_extra_cl *)v->extra;
    ggml_tensor_extra_cl * extra_o = (ggml_tensor_extra_cl *)dst->extra;
    ggml_tensor_extra_cl * extra_mask = mask ? (ggml_tensor_extra_cl *)mask->extra : NULL;
    ggml_tensor_extra_cl * extra_sinks = sinks ? (ggml_tensor_extra_cl *)sinks->extra : NULL;

    cl_ulong offset_q = extra_q->offset + q->view_offs;
    cl_ulong offset_k = extra_k->offset + k->view_offs;
    cl_ulong offset_v = extra_v->offset + v->view_offs;
    cl_ulong offset_o = extra_o->offset + dst->view_offs;
    cl_mem   mask_buffer = extra_mask ? extra_mask->data_device : NULL;
    cl_ulong offset_mask = extra_mask ? extra_mask->offset + mask->view_offs : 0;
    cl_mem   sinks_buffer = extra_sinks ? extra_sinks->data_device : NULL;
    cl_ulong offset_sinks = extra_sinks ? extra_sinks->offset + sinks->view_offs : 0;

    const cl_ulong q_nb1 = q->nb[1], q_nb2 = q->nb[2], q_nb3 = q->nb[3];
    const cl_ulong k_nb1 = k->nb[1], k_nb2 = k->nb[2], k_nb3 = k->nb[3];
    const cl_ulong v_nb1 = v->nb[1], v_nb2 = v->nb[2], v_nb3 = v->nb[3];
    const cl_ulong o_nb1 = dst->nb[1], o_nb2 = dst->nb[2], o_nb3 = dst->nb[3];
    const cl_ulong mask_nb1 = mask ? mask->nb[1] : 0;
    const cl_ulong mask_nb2 = mask ? mask->nb[2] : 0;
    const cl_ulong mask_nb3 = mask ? mask->nb[3] : 0;
    const int mask_ne2 = mask ? mask->ne[2] : 0;
    const int mask_ne3 = mask ? mask->ne[3] : 0;

    float scale, max_bias, logit_softcap;
    const float * params = (const float *)dst->op_params;
    scale         = params[0];
    max_bias      = params[1];
    logit_softcap = params[2];

    const int is_causal = (mask == NULL && n_q > 1 && n_q == n_kv);

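    // ALiBi slope bases: heads below n_head_log2 use powers of m0, the
    // remaining heads use powers of m1 (the standard ALiBi schedule; only
    // relevant when max_bias > 0)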
    const int n_head_log2_val = n_head > 0 ? 1u << (int)floorf(log2f((float)n_head)) : 0;
    const float n_head_log2_f = n_head_log2_val > 0 ? (float)n_head_log2_val : 1.0f;
    const float m0 = powf(2.0f, -(max_bias) / n_head_log2_f);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2_f);

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra_q->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset_q));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extra_k->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset_k));
    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem),   &extra_v->data_device));
    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset_v));
    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem),   &extra_o->data_device));
    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offset_o));
    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(float),    &scale));
    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int),      &n_q));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),     &n_kv));
    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),     &is_causal));
    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),     &n_head));
    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &q_nb1)); CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &q_nb2)); CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &q_nb3));
    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &k_nb1)); CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &k_nb2)); CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &k_nb3));
    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &v_nb1)); CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &v_nb2)); CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &v_nb3));
    CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &o_nb1)); CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_ulong), &o_nb2)); CL_CHECK(clSetKernelArg(kernel, 24, sizeof(cl_ulong), &o_nb3));
    CL_CHECK(clSetKernelArg(kernel, 25, sizeof(float),    &max_bias));
    CL_CHECK(clSetKernelArg(kernel, 26, sizeof(float),    &m0));
    CL_CHECK(clSetKernelArg(kernel, 27, sizeof(float),    &m1));
    CL_CHECK(clSetKernelArg(kernel, 28, sizeof(int),      &n_head_log2_val));
    CL_CHECK(clSetKernelArg(kernel, 29, sizeof(float),    &logit_softcap));
    CL_CHECK(clSetKernelArg(kernel, 30, sizeof(int),      &n_head_kv));
    CL_CHECK(clSetKernelArg(kernel, 31, sizeof(cl_mem),   &mask_buffer));
    CL_CHECK(clSetKernelArg(kernel, 32, sizeof(cl_ulong), &offset_mask));
    CL_CHECK(clSetKernelArg(kernel, 33, sizeof(cl_ulong), &mask_nb1));
    CL_CHECK(clSetKernelArg(kernel, 34, sizeof(cl_ulong), &mask_nb2));
    CL_CHECK(clSetKernelArg(kernel, 35, sizeof(cl_ulong), &mask_nb3));
    CL_CHECK(clSetKernelArg(kernel, 36, sizeof(int),      &mask_ne2));
    CL_CHECK(clSetKernelArg(kernel, 37, sizeof(int),      &mask_ne3));
    CL_CHECK(clSetKernelArg(kernel, 38, sizeof(cl_mem),   &sinks_buffer));
    CL_CHECK(clSetKernelArg(kernel, 39, sizeof(cl_ulong), &offset_sinks));

    if (n_q == 1) {
        const size_t wg_size = 64;
        size_t local_work_size[] = { wg_size, 1 };
        size_t global_work_size[] = { wg_size, (size_t)(n_head * n_batch) };
        backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
    } else {
        const int block_m = backend_ctx->kernels_flash_attn_bm.at(dk_dv);
        const size_t wg_size = block_m;
        size_t local_work_size[] = { wg_size, 1 };
        size_t global_work_size[] = { (size_t)((n_q + block_m - 1) / block_m) * wg_size, (size_t)(n_head * n_batch) };
        backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
    }
}

static void ggml_cl_mul_mat_f16_f32_tiled(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offset1 = extra1->offset + src1->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    const int M = src0->ne[1];
    const int N = src1->ne[1];
    const int K = src0->ne[0];

    cl_kernel kernel = backend_ctx->kernel_mul_mat_f16_f32_tiled;

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(int),      &M));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(int),      &N));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int),      &K));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem),   &extra1->data_device));
    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &offset1));
    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &offsetd));

    // Tiling parameters. These need to be tuned for optimal performance.
    // They must match the #defines in the kernel mul_mat_f16_f32.cl.
    //
    // OPWM / OPWN: Output tile size per Work-Group. A work-group computes a tile of size OPWM x OPWN.
    // TPWM / TPWN: Threads per Work-group. This is the work-group size.
    // OPTM / OPTN: Output elements per Thread. Each thread computes OPTM x OPTN elements.
    //
    // The following relationships must hold:
    //   OPWM = TPWM * OPTM
    //   OPWN = TPWN * OPTN
    //
    const int OPWM = 64;
    const int OPWN = 64;
    const int TPWM = 16;
    const int TPWN = 8;
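    // with these values OPTM = OPWM / TPWM = 4 and OPTN = OPWN / TPWN = 8,
    // so each thread accumulates a 4x8 block of the output tile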

    size_t local_work_size[2] = { TPWM, TPWN };
    size_t global_work_size[2] = {
        (size_t) ((M + OPWM - 1) / OPWM) * TPWM,
        (size_t) ((N + OPWN - 1) / OPWN) * TPWN,
    };

    backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
}

static void ggml_cl_conv_2d(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_TENSOR_BINARY_OP_LOCALS;
    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offset1 = extra1->offset + src1->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    const cl_uint Cout = ne03; const cl_uint Cin = ne02; const cl_uint N = ne13;
    const cl_uint KW = ne00; const cl_uint KH = ne01; const cl_uint W = ne10; const cl_uint H = ne11; const cl_uint OW = ne0; const cl_uint OH = ne1;

    const cl_uint s0 = dst->op_params[0]; const cl_uint s1 = dst->op_params[1];
    const cl_uint p0 = dst->op_params[2]; const cl_uint p1 = dst->op_params[3];
    const cl_uint d0 = dst->op_params[4]; const cl_uint d1 = dst->op_params[5];

    const cl_uint cl_nb01 = nb01/ggml_type_size(src0->type); const cl_uint cl_nb02 = nb02/ggml_type_size(src0->type); const cl_uint cl_nb03 = nb03/ggml_type_size(src0->type);
    const cl_uint cl_nb11 = nb11/ggml_type_size(src1->type); const cl_uint cl_nb12 = nb12/ggml_type_size(src1->type); const cl_uint cl_nb13 = nb13/ggml_type_size(src1->type);
    const cl_uint cl_nb1 = nb1/ggml_type_size(dst->type); const cl_uint cl_nb2 = nb2/ggml_type_size(dst->type); const cl_uint cl_nb3 = nb3/ggml_type_size(dst->type);

    const int64_t NPQ = (int64_t)N * OW * OH;

    const uint32_t BS_K = 64;
    const uint32_t BS_NPQ = 64;
    const uint32_t BS_CRS = 16;
    const uint32_t VEC_SIZE = 4;

    const uint32_t TS_K = 4;
    const uint32_t TS_NPQ = 8;

    const uint32_t WG_K = BS_K / TS_K;
    const uint32_t WG_NPQ = BS_NPQ / TS_NPQ;
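    // WG_K x WG_NPQ = 16 x 8 threads per work-group; each thread produces a
    // TS_K x TS_NPQ (4x8) sub-tile, so one work-group covers a full
    // BS_K x BS_NPQ output block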

    auto splitWork = [](uint32_t work_size, uint32_t block_size) { return (work_size + block_size - 1) / block_size; };
    const uint32_t NB_K = splitWork(Cout, BS_K);
    const uint32_t NB_NPQ = splitWork(NPQ, BS_NPQ);

    cl_kernel kernel;
    size_t shmem_size;

    if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
        kernel = backend_ctx->kernel_conv_2d_f16;
        shmem_size = (size_t)(BS_K * BS_CRS * sizeof(cl_half) + BS_CRS * (BS_NPQ / VEC_SIZE) * sizeof(cl_half4));
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
        kernel = backend_ctx->kernel_conv_2d_f32;
        shmem_size = (size_t)(BS_K * BS_CRS * sizeof(cl_float) + BS_CRS * (BS_NPQ / VEC_SIZE) * sizeof(cl_float4));
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
        kernel = backend_ctx->kernel_conv_2d_f16_f32;
        shmem_size = (size_t)(BS_K * BS_CRS * sizeof(cl_half) + BS_CRS * (BS_NPQ / VEC_SIZE) * sizeof(cl_float4));
    } else {
        GGML_ASSERT(false && "Unsupported data type combination for conv2d");
    }

    cl_uint idx = 0;
    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra0->data_device)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extra1->data_device)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &offset1));
    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_mem), &extrad->data_device)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel, idx++, shmem_size, NULL));
    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &Cout)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &Cin)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &N));
    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &KW)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &KH)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &W)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &H));
    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &OW)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &OH));
    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &s0)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &s1)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &p0)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &p1));
    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &d0)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &d1));
    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb01)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb02)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb03));
    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb11)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb12)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb13));
    CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb1)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb2)); CL_CHECK(clSetKernelArg(kernel, idx++, sizeof(cl_uint), &cl_nb3));

    size_t global_work_size[] = { (size_t)NB_K * WG_K, (size_t)NB_NPQ * WG_NPQ, 1 };
    size_t local_work_size[] = { (size_t)WG_K, (size_t)WG_NPQ, 1 };

    backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
}

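// Adreno-specific path for the K*Q and KQ*V matmuls in attention. The two
// cases are distinguished by the stride ordering of src0 (nb01 > nb02
// implies the permuted K layout used for KQ); A and the output are wrapped
// in image1d_buffer objects so they are read through the texture path.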
static void ggml_cl_mul_mat_kq_kqv_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    const int  ne00 = src0->ne[0];
    const int  ne01 = src0->ne[1];
    const int  ne02 = src0->ne[2];

    const cl_ulong nb01 = src0->nb[1];
    const cl_ulong nb02 = src0->nb[2];

    const int  ne10 = src1->ne[0];
    const int  ne11 = src1->ne[1];
    const int  ne12 = src1->ne[2];

    const cl_ulong nb10 = src1->nb[0];

    const int  ne0 = dst->ne[0];
    const int  ne1 = dst->ne[1];

    GGML_ASSERT(ne00 == ne10);

    cl_kernel kernel;
    cl_context context = backend_ctx->context;

    cl_int              status;
    cl_image_format     img_fmt_1d;
    cl_image_desc       img_desc_1d;
    cl_buffer_region    region;
    cl_mem              A_image1d;
    cl_mem              A_sub_buffer;
    cl_mem              B_sub_buffer;
    cl_mem              D_image1d;
    cl_mem              D_sub_buffer;

    int M = ne01;
    int N = ne1;
    int K = ne00;

    if (nb01 > nb02) {
        // KQ
        kernel = backend_ctx->kernel_mul_mm_f16_f32_kq;
    } else {
        // KQV
        kernel = backend_ctx->kernel_mul_mm_f16_f32_kqv;
    }
    // create sub-buffer for A
    // <--------------------------------------------> //
    extra0 = src0->view_src ? (ggml_tensor_extra_cl *)src0->view_src->extra : (ggml_tensor_extra_cl *)src0->extra;

    region.origin = (extra0->offset);
    if (nb01 > nb02) {
        // KQ
        region.size = nb01 * ne01;
    } else {
        // KQV
        region.size = nb02 * ne02;
    }

    A_sub_buffer = clCreateSubBuffer((extra0->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
    CL_CHECK(status);

    // <--------------------------------------------> //

    // create sub-buffer for B
    // <--------------------------------------------> //
    region.origin = (extra1->offset);
    region.size = nb10 * ne10 * ne11 * ne12;
    B_sub_buffer = clCreateSubBuffer((extra1->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
    CL_CHECK(status);
    // <--------------------------------------------> //

    img_fmt_1d = {CL_RGBA, CL_FLOAT};
    memset(&img_desc_1d, 0, sizeof(img_desc_1d));
    img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
    if (nb01 > nb02) {
        img_desc_1d.image_width = (nb01 * ne01 / 4)/4;
    } else {
        img_desc_1d.image_width = (nb02 * ne02 / 4)/4;
    }
    img_desc_1d.buffer = A_sub_buffer;
    A_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
    CL_CHECK(status);

    // create sub-buffer for output C
    // <--------------------------------------------> //
    region.origin = (extrad->offset);
    region.size = ne0 * ne1 * dst->ne[2] * dst->nb[0]; // size of C in bytes
    D_sub_buffer = clCreateSubBuffer((extrad->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
    CL_CHECK(status);
    // <--------------------------------------------> //

    // create image for C output
    // <--------------------------------------------> //
    img_fmt_1d = {CL_R, CL_FLOAT};
    memset(&img_desc_1d, 0, sizeof(img_desc_1d));
    img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
    img_desc_1d.image_width = ne0 * ne1 * dst->ne[2] * dst->nb[0] / 4;
    img_desc_1d.buffer = D_sub_buffer;
    D_image1d = clCreateImage(context, CL_MEM_WRITE_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
    CL_CHECK(status);
    // <--------------------------------------------> //

    int offset_src0 = 0;
    int offset_src1 = 0;
    // these args are set with sizeof(int) below, so pass 32-bit copies rather
    // than pointing a 4-byte argument at an 8-byte cl_ulong
    int offset_dst  = (int) extrad->offset;
    int nb01_int    = (int) nb01;

    // set kernel args
    // <--------------------------------------------> //
    cl_uint k_arg = 0;
    CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_mem), &A_image1d));
    CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),    &offset_src0));
    CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_mem), &B_sub_buffer));
    CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),    &offset_src1));
    CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_mem), &D_image1d));
    CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),    &offset_dst));
    CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),    &M));
    CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),    &K));
    CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),    &N));
    CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),    &ne02));
    CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),    &ne12));
    CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),    &nb01_int));

    size_t global_work_size[3] = {64, static_cast<size_t>(((M+63)/64)), static_cast<size_t>(((N+31)/32)*ne12)};
    size_t local_work_size[3] = {64, 1, 2};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);

    // deallocate sub buffers and images
    // <--------------------------------------------> //
    CL_CHECK(clReleaseMemObject(A_image1d));
    CL_CHECK(clReleaseMemObject(D_image1d));
    CL_CHECK(clReleaseMemObject(A_sub_buffer));
    CL_CHECK(clReleaseMemObject(B_sub_buffer));
    CL_CHECK(clReleaseMemObject(D_sub_buffer));
}

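// Adreno-specific q8_0 x f32 matmul. Weights are read through image1d_buffer
// wrappers around the flattened quants (q) and scales (d). N == 1 runs a
// wave-sized gemv with a reduce factor of 4; otherwise the activations are
// transposed to half precision first and an 8x4-tile gemm is used.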
static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(src1);
    GGML_ASSERT(src1->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    const enum ggml_type src0t = src0->type;
    const enum ggml_type src1t = src1->type;

    GGML_ASSERT(src0t == GGML_TYPE_Q8_0);
    GGML_ASSERT(src1t == GGML_TYPE_F32);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;

    GGML_ASSERT(src1->view_offs == 0);
    GGML_ASSERT(dst->view_offs == 0);

    const int  ne00 = src0->ne[0];
    const int  ne01 = src0->ne[1];
    const int  ne02 = src0->ne[2];

    const int  ne10 = src1->ne[0];
    const int  ne12 = src1->ne[2];

    const int  ne0 = dst->ne[0];
    const int  ne1 = dst->ne[1];

    GGML_ASSERT(ne00 == ne10);
    GGML_ASSERT((ne00 % 32) == 0);
    GGML_ASSERT(ne0 == ne01);

    cl_context context = backend_ctx->context;
    cl_kernel kernel;

    // init CL objects
    cl_int              status;
    cl_image_format     img_fmt_1d;
    cl_image_desc       img_desc_1d;
    cl_buffer_region    region;
    cl_mem              A_image1d;
    cl_mem              B_image1d;
    cl_mem              B_sub_buffer;
    cl_mem              S_image1d;

    cl_mem              D_image1d;
    cl_mem              D_sub_buffer;
    // created only on the N > 1 (activation transpose) path; released at the end
    cl_mem              B_image1d_trans = nullptr;
    cl_mem              B_d             = nullptr;

    int M = ne01;
    int N = ne1;
    int K = ne00;

    // create an image for A
    img_fmt_1d = { CL_R, CL_FLOAT};
    memset(&img_desc_1d, 0, sizeof(img_desc_1d));
    img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
    img_desc_1d.image_width = M * K / 4;    // Divide by 4 for char -> float
    img_desc_1d.buffer = extra0_q8_0->q;
    A_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
    CL_CHECK(status);

    // create an image for Scale
    img_fmt_1d = { CL_R, CL_HALF_FLOAT};
    memset(&img_desc_1d, 0, sizeof(img_desc_1d));
    img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
    img_desc_1d.image_width = M * K / 32;    // Block size is 32
    img_desc_1d.buffer = extra0_q8_0->d;
    S_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
    CL_CHECK(status);

    // create a sub_buffer for B
    region.origin = (extra1->offset); // src1->view_offs is asserted to be 0 above
    region.size = K * N * sizeof(float);
    B_sub_buffer = clCreateSubBuffer((extra1->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
    CL_CHECK(status);

    // create an image for B from sub_buffer: RGBA (OCL)
    img_fmt_1d = {CL_RGBA, CL_FLOAT};
    memset(&img_desc_1d, 0, sizeof(img_desc_1d));
    img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
    img_desc_1d.image_width = K * N / 4;
    img_desc_1d.buffer = B_sub_buffer;
    B_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
    CL_CHECK(status);

    // Create subbuffer and image1d_buffer for dst
    region.origin = (extrad->offset); // dst->view_offs is asserted to be 0 above
    region.size = M * N * sizeof(float);
    D_sub_buffer = clCreateSubBuffer((extrad->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
    CL_CHECK(status);

    img_fmt_1d = {CL_R, CL_FLOAT};
    memset(&img_desc_1d, 0, sizeof(img_desc_1d));
    img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
    img_desc_1d.image_width = M * N;
    img_desc_1d.buffer = D_sub_buffer;
    D_image1d = clCreateImage(context, CL_MEM_WRITE_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
    CL_CHECK(status);

    size_t local_work_size[3] = {1, 1, 1};
    size_t global_work_size[3] = {1, 1, 1};

    if (N == 1) {
        kernel = backend_ctx->CL_mul_mat_vec_q8_0_f32;

        int r2 = 1;
        int r3 = 1;
        cl_uint k_arg = 0;

        CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_mem),   &A_image1d));
        CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_mem),   &extra0_q8_0->d));
        CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_mem),   &B_image1d));
        CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_ulong), &extra1->offset));
        CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_mem),   &extrad->data_device));
        CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_ulong), &extrad->offset));
        CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne00));
        CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne01));
        CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne02));
        CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne10));
        CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne12));
        CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne0));
        CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne1));
        CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &r2));
        CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &r3));

        size_t wavesize = backend_ctx->adreno_wave_size;
        local_work_size[0] = wavesize;
        local_work_size[1] = 4; // reduce factor
        local_work_size[2] = 1;

        global_work_size[0] = ((M + wavesize - 1) / wavesize) * wavesize;
        global_work_size[1] = 4; // reduce factor
        global_work_size[2] = 1;
    } else {
        cl_ulong offsetd = extrad->offset + dst->view_offs;
        int padding;

        // number of extra elements beyond a multiple of 8
        int extra_elements = N % 8;

        // padding needed to round N up to a multiple of 8
        padding = 0;
        if (extra_elements > 0) {
            padding = 8 - extra_elements;
        }
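        // e.g. N = 13 gives extra_elements = 5 and padding = 3, so the
        // transposed activations are laid out as if N were 16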

        // Specify the starting offset (in bytes)
        region.origin = 0;
        // Specify the size of the sub-buffer (divide by 2 for FP16)
        region.size = K * (N + padding) * sizeof(float)/2;
        backend_ctx->prealloc_act_trans.allocate(context, region.size);
        B_d = clCreateSubBuffer(
            backend_ctx->prealloc_act_trans.buffer,
            0,
            CL_BUFFER_CREATE_TYPE_REGION,
            &region,
            &status);
        CL_CHECK(status);

        cl_image_format image_format_B_d_output = { CL_RGBA, CL_HALF_FLOAT }; // CL_HALF_FLOAT for FP16
        cl_image_desc image_desc_B_d_output = {
            CL_MEM_OBJECT_IMAGE1D_BUFFER,
            static_cast<size_t>(K * (N + padding)/4),
            0, 0, 0, 0, 0, 0, 0, { B_d }
        };
        B_image1d_trans = clCreateImage(
            context,
            0,
            &image_format_B_d_output,
            &image_desc_B_d_output,
            NULL,
            &status);
        CL_CHECK(status);

        int height_B = N/4;
        if (height_B == 0) {
            height_B = 1;
        }
        int width_B = K/4;
        int padded_height_B = (N + padding)/4;

        kernel = backend_ctx->kernel_transpose_32_16;
        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &B_image1d));
        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &B_image1d_trans));
        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int),    &height_B));
        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int),    &width_B));
        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),    &padded_height_B));

        size_t local_size_t[2] = { 1, 16 };
        size_t global_size_t[2] = {
            static_cast<size_t>(width_B),
            static_cast<size_t>(padded_height_B)
        };

        backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst);

        kernel = backend_ctx->kernel_mul_mm_q8_0_f32_8x4;

        int N_with_padding = N + padding;

        CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q8_0->q));
        CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q8_0->d));
        CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &B_image1d_trans));
        CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_mem),   &extrad->data_device));
        CL_CHECK(clSetKernelArg(kernel,  4, sizeof(int),      &K));
        CL_CHECK(clSetKernelArg(kernel,  5, sizeof(int),      &M));
        CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &N_with_padding));
        CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &N));
        CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &offsetd));

        global_work_size[0] = (size_t)(N + 7) / 8;
        global_work_size[1] = (size_t)(M + 3) / 4;
        global_work_size[2] = 1;

        local_work_size[0] = 2;
        local_work_size[1] = 128;
        local_work_size[2] = 1;
    }

    // enqueue kernel with profiling
    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);

    // deallocate sub buffers and images
    CL_CHECK(clReleaseMemObject(A_image1d));
    CL_CHECK(clReleaseMemObject(B_sub_buffer));
    CL_CHECK(clReleaseMemObject(B_image1d));
    CL_CHECK(clReleaseMemObject(S_image1d));
    CL_CHECK(clReleaseMemObject(D_sub_buffer));
    CL_CHECK(clReleaseMemObject(D_image1d));
    // the activation-transpose objects only exist on the N > 1 path
    if (B_image1d_trans) {
        CL_CHECK(clReleaseMemObject(B_image1d_trans));
    }
    if (B_d) {
        CL_CHECK(clReleaseMemObject(B_d));
    }
#else
    GGML_UNUSED(backend);
    GGML_UNUSED(src0);
    GGML_UNUSED(src1);
    GGML_UNUSED(dst);
#endif
}

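// General matmul dispatch: picks between the generic kernels and the Adreno
// image-based paths (KQ/KQV, q8_0, q4_0) based on tensor types, shapes and
// device capabilities.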
 8389static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 8390    GGML_ASSERT(src0);
 8391    GGML_ASSERT(src0->extra);
 8392    GGML_ASSERT(src1);
 8393    GGML_ASSERT(src1->extra);
 8394    GGML_ASSERT(dst);
 8395    GGML_ASSERT(dst->extra);
 8396
 8397    const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
 8398    const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
 8399
 8400    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
 8401
 8402    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
 8403    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
 8404    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
 8405
 8406    cl_ulong offset0 = extra0->offset + src0->view_offs;
 8407    cl_ulong offset1 = extra1->offset + src1->view_offs;
 8408    cl_ulong offsetd = extrad->offset + dst->view_offs;
 8409
 8410#ifdef GGML_OPENCL_SOA_Q
 8411    ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
 8412    ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra;
 8413    ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
 8414    ggml_tensor_extra_cl_q6_K * extra0_q6_K = (ggml_tensor_extra_cl_q6_K *)src0->extra;
 8415#endif
 8416
 8417    const int  ne00 = src0 ? src0->ne[0] : 0;
 8418    const int  ne01 = src0 ? src0->ne[1] : 0;
 8419    const int  ne02 = src0 ? src0->ne[2] : 0;
 8420    const int  ne03 = src0 ? src0->ne[3] : 0;
 8421
 8422    const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
 8423    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
 8424    const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
 8425    const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
 8426
 8427    const int  ne10 = src1 ? src1->ne[0] : 0;
 8428    const int  ne11 = src1 ? src1->ne[1] : 0;
 8429    const int  ne12 = src1 ? src1->ne[2] : 0;
 8430    const int  ne13 = src1 ? src1->ne[3] : 0;
 8431
 8432    const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
 8433    const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
 8434    const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
 8435    const cl_ulong nb13 = src1 ? src1->nb[3] : 0;
 8436
 8437    const int  ne0 = dst ? dst->ne[0] : 0;
 8438    const int  ne1 = dst ? dst->ne[1] : 0;
 8439
 8440    int r2 = ne12/ne02;
 8441    int r3 = ne13/ne03;
 8442
 8443    GGML_ASSERT(ne00 == ne10);
 8444
 8445    int nth0 = 32;
 8446    int nth1 = 1;
 8447    int nrows = 1;
 8448    // The number of values produced by each subgroup
 8449    int ndst = 4;
 8450
 8451    cl_kernel kernel;
 8452
 8453#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
 8454    cl_context context = backend_ctx->context;
 8455
 8456    if(src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32){
 8457        if (ne01 >= 64 && ne1 >= 32 && ne00 >= 16 && (ne12 % ne02) == 0  &&
 8458            // dst is wrapped with image1d_buffer, the size limit applies, also src0
 8459            (ne0 * ne1 * dst->ne[2] * dst->nb[0] / 4 <= backend_ctx->image_max_buffer_size)) {
 8460            // For KQ
 8461            if (ggml_is_permuted(src0) && ggml_is_permuted(src1) &&
 8462                ((nb01 * ne01 / 4)/4 <= backend_ctx->image_max_buffer_size) &&
 8463                nb00 <= nb02 &&
 8464                nb02 <= nb01 &&
 8465                nb01 <= nb03 &&
 8466                nb10 <= nb12 &&
 8467                nb12 <= nb11 &&
 8468                nb11 <= nb13) {
 8469                ggml_cl_mul_mat_kq_kqv_adreno(backend, src0, src1, dst);
 8470                return;
 8471            }
 8472            // For KQV
 8473            if (!ggml_is_contiguous(src0) && ggml_is_contiguous(src1) &&
 8474                ((nb02 * ne02 / 4)/4 <= backend_ctx->image_max_buffer_size)) {
 8475                ggml_cl_mul_mat_kq_kqv_adreno(backend, src0, src1, dst);
 8476                return;
 8477            }
 8478        }
 8479    }
 8480
 8481    if (ne01 && ne1 && use_adreno_kernels(backend_ctx, src0)) {
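    // NOTE: the body of this block is not re-indented; it closes at the
    // matching "} // if (ne01 && ne1)" below.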
 8482
 8483    // init CL objects
 8484    // <--------------------------------------------> //
 8485    cl_int              status;
 8486    cl_image_format     img_fmt_1d;
 8487    cl_image_desc       img_desc_1d;
 8488    cl_buffer_region    region;
 8489    cl_mem              A_image1d = nullptr;
 8490    cl_mem              B_image1d = nullptr;
 8491    cl_mem              B_sub_buffer = nullptr;
 8492    cl_mem              C_d = nullptr;
 8493    // for B transpose
 8494    cl_mem B_d = nullptr;
 8495    cl_mem B_d_input_image = nullptr;
 8496    // <--------------------------------------------> //
 8497
 8498    // define matrix dimensions
 8499    // <--------------------------------------------> //
 8500    int M = ne01;
 8501    int N = ne1;
 8502    int K = ne00;
 8503    int padding;
 8504    // <--------------------------------------------> //
 8505
 8506    // q8_0 x fp32
 8507    if (src0t == GGML_TYPE_Q8_0 && src1t == GGML_TYPE_F32 &&
 8508        enable_adreno_trans_weight(backend_ctx, src0)) {
8509        ggml_cl_mul_mat_q8_0_f32_adreno(backend, src0, src1, dst);
8510        return;
 8511    }
 8512
 8513    // q4_0 x fp32
8514    if (src0t == GGML_TYPE_Q4_0 && src1t == GGML_TYPE_F32) {
 8515        // TODO: remove duplicate definitions of image description + format -- move to top
 8516
 8517        // create an image for A
 8518        // <--------------------------------------------> //
 8519        if (N == 1) {
 8520            img_fmt_1d = { CL_R, CL_UNSIGNED_INT32};
 8521        } else {
 8522            img_fmt_1d = { CL_R, CL_FLOAT};
 8523        }
 8524        memset(&img_desc_1d, 0, sizeof(img_desc_1d));
 8525        img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
8526        img_desc_1d.image_width = M * K / 2 / 4;    // /2: two 4-bit quants per byte; /4: four bytes per image texel
 8527        img_desc_1d.buffer = extra0_q4_0->q;
 8528        A_image1d = clCreateImage(
 8529            context,
 8530            CL_MEM_READ_ONLY,
 8531            &img_fmt_1d,
 8532            &img_desc_1d,
 8533            NULL,
 8534            &status);
 8535        CL_CHECK(status);
 8536        // <--------------------------------------------> //
 8537
 8538
 8539        // create a sub_buffer for B
 8540        // <--------------------------------------------> //
 8541        region.origin = (extra1->offset);
 8542        region.size = K * N * sizeof(float);
 8543        B_sub_buffer = clCreateSubBuffer(
 8544            extra1->data_device,
 8545            0,
 8546            CL_BUFFER_CREATE_TYPE_REGION,
 8547            &region,
 8548            &status);
 8549        CL_CHECK(status);
 8550        // <--------------------------------------------> //
 8551
8552        // transpose the activation matrix B for the tiled gemm kernel ("Skyler's gemm")
 8553        if (N != 1) {
8554            // number of elements beyond the last multiple of 8
8555            int extra_elements = N % 8;
8556
8557            // padding needed to round N up to a multiple of 8
8558            padding = 0;
8559            if (extra_elements > 0) {
8560                padding = 8 - extra_elements;
8561            }
 8562
 8563            // Specify the starting offset (in bytes)
 8564            region.origin = 0;
 8565            // Specify the size of the sub-buffer (divide by 2 for FP16)
 8566            region.size = K * (N + padding) * sizeof(float)/2;
 8567            backend_ctx->prealloc_act_trans.allocate(context, region.size);
 8568
 8569            B_d = clCreateSubBuffer(
 8570                backend_ctx->prealloc_act_trans.buffer,
 8571                0,
 8572                CL_BUFFER_CREATE_TYPE_REGION,
 8573                &region,
 8574                &status);
 8575            CL_CHECK(status);
 8576
 8577            cl_image_format image_format_B_d_input = { CL_RGBA, CL_FLOAT };
 8578            cl_image_desc image_desc_B_d_input = {
 8579                CL_MEM_OBJECT_IMAGE1D_BUFFER,
 8580                static_cast<size_t>(K * N / 4),
 8581                0, 0, 0, 0, 0, 0, 0, { B_sub_buffer }
 8582            };
 8583            B_d_input_image = clCreateImage(
 8584                context,
 8585                0,
 8586                &image_format_B_d_input,
 8587                &image_desc_B_d_input,
 8588                NULL,
 8589                &status);
 8590            CL_CHECK(status);
 8591
8592            cl_image_format image_format_B_d_output = { CL_RGBA, CL_HALF_FLOAT }; // CL_HALF_FLOAT: the transposed copy of B is stored as FP16
 8593            cl_image_desc image_desc_B_d_output = {
 8594                CL_MEM_OBJECT_IMAGE1D_BUFFER,
 8595                static_cast<size_t>(K * (N + padding)/4),
 8596                0, 0, 0, 0, 0, 0, 0, { B_d }
 8597            };
 8598            B_image1d = clCreateImage(
 8599                context,
 8600                0,
 8601                &image_format_B_d_output,
 8602                &image_desc_B_d_output,
 8603                NULL,
 8604                &status);
 8605            CL_CHECK(status);
 8606
 8607            int height_B = N/4;
 8608            if (height_B == 0) {
 8609                height_B = 1;
 8610            }
 8611            int width_B = K/4;
 8612            int padded_height_B = (N + padding)/4;
 8613
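            // kernel_transpose_32_16 reads B as RGBA float texels (4 floats
            // per texel, hence the /4 in the dimensions) and writes a
            // transposed FP16 copy with N padded up to a multiple of 8
            // (padded_height_B texels tall).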
 8614            kernel = backend_ctx->kernel_transpose_32_16;
 8615            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &B_d_input_image));
 8616            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &B_image1d));
 8617            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int),    &height_B));
 8618            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int),    &width_B));
 8619            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),    &padded_height_B));
 8620
 8621            size_t local_size_t[2] = { 1, 16 };
8622            // workgroup-size (WGS) tuning for specific matrix shapes
8623            if (ne0 == 4096 && ne1 == 128 && ne10 == 4096) {
8624                local_size_t[0] = 4;
8625                local_size_t[1] = 8;
8626            } else if (ne0 == 11008 && ne1 == 128 && ne10 == 4096) {
8627                local_size_t[0] = 2;
8628                local_size_t[1] = 8;
8629            } else if (ne0 == 4096 && ne1 == 128 && ne10 == 11008) {
8630                local_size_t[0] = 1;
8631                local_size_t[1] = 8;
8632            } else if (ne0 == 32000 && ne1 == 128 && ne10 == 4096) {
8633                local_size_t[0] = 2;
8634                local_size_t[1] = 8;
8635            }
 8636
 8637            size_t global_size_t[2] = {
 8638                static_cast<size_t>(width_B),
 8639                static_cast<size_t>(padded_height_B)
 8640            };
 8641
 8642            backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst);
 8643        } else {
8644            // N == 1 (gemv): B is a single column, so no transpose is needed
 8645            // create an image for B from sub_buffer
 8646            // <--------------------------------------------> //
 8647            img_fmt_1d = {CL_RGBA, CL_FLOAT};
 8648
 8649            memset(&img_desc_1d, 0, sizeof(img_desc_1d));
 8650            img_desc_1d.image_width = K * N / 4;
 8651            img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
 8652            img_desc_1d.buffer = B_sub_buffer;
 8653            B_image1d = clCreateImage(
 8654                context,
 8655                CL_MEM_READ_ONLY,
 8656                &img_fmt_1d,
 8657                &img_desc_1d,
 8658                NULL,
 8659                &status);
 8660            CL_CHECK(status);
 8661            // <--------------------------------------------> //
 8662        }
 8663
 8664        // choose gemm or gemv kernel
 8665        // <--------------------------------------------> //
 8666        if (N == 1) {
 8667            kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general;
 8668            if (M == 4096 && K == 4096) {
 8669                kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096;
 8670            } else if (M == 4096 && K == 11008) {
 8671                kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008;
 8672            } else if (M == 11008 && K == 4096) {
 8673                kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096;
 8674            } else if (M == 32000 && K == 4096) {
 8675                kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096;
 8676            }
 8677        } else {
 8678            kernel = backend_ctx->CL_mul_mat_Ab_Bi_8x4;
 8679        }
 8680        // <--------------------------------------------> //
 8681
 8682        // set kernel args
 8683        // <--------------------------------------------> //
 8684        cl_uint k_arg = 0;
 8685
 8686        if (N == 1) {
 8687            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_mem),   &A_image1d));
 8688            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_mem),   &extra0_q4_0->d));
 8689            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_mem),   &B_image1d));
 8690            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_ulong), &extra1->offset));
 8691            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_mem),   &extrad->data_device));
 8692            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(cl_ulong), &extrad->offset));
 8693            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne00));
 8694            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne01));
 8695            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne02));
 8696            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne10));
 8697            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne12));
 8698            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne0));
 8699            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &ne1));
 8700            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &r2));
 8701            CL_CHECK(clSetKernelArg(kernel,  k_arg++, sizeof(int),      &r3));
 8702        } else {
 8703            region.origin = extrad->offset; // Specify the starting offset (in bytes)
 8704            region.size = M * N * sizeof(float); // Specify the size of the sub-buffer
 8705            C_d = clCreateSubBuffer(extrad->data_device, CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
 8706            CL_CHECK(status);
 8707
 8708            int padded_N = ne1 + padding;
 8709
8710            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q)); // A_q_d
 8711            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d)); //A_s_d
 8712            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &B_image1d)); //B_d
 8713            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &C_d)); //C_d
 8714            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),    &ne01)); //M
 8715            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int),    &padded_N)); //N with padding
 8716            CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),    &ne00)); //K
 8717            CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int),    &ne1)); //N without padding
 8718        }
 8719        // <--------------------------------------------> //
 8720
 8721        // choose workgroup size
 8722        // <--------------------------------------------> //
 8723        size_t global_work_size[3] = {
 8724            64, static_cast<size_t>((M+63)/64), static_cast<size_t>((N+31)/32)};
 8725        size_t local_work_size[3] = {64, 2, 4};
 8726
 8727        global_work_size[0] = (size_t)(ceil((float)ne1/8));
 8728        global_work_size[1] = (size_t)(ne01/4);
 8729        global_work_size[2] = (size_t)(1);
 8730
8731        local_work_size[0]  = (size_t)(1); // 4x32 for FP32
 8732        local_work_size[1]  = (size_t)(128);
 8733        local_work_size[2]  = (size_t)(1);
 8734
8735        // workgroup-size (WGS) tuning for specific matrix shapes
 8736        if (ne0 == 4096 && ne1 == 128 && ne10 == 4096) {
 8737            local_work_size[0] = 1;
 8738            local_work_size[1] = 128;
 8739        } else if (ne0 == 11008 && ne1 == 128 && ne10 == 4096) {
 8740            local_work_size[0] = 2;
 8741            local_work_size[1] = 64;
 8742        } else if (ne0 == 4096 && ne1 == 128 && ne10 == 11008) {
 8743            local_work_size[0] = 2;
 8744            local_work_size[1] = 64;
 8745        } else if (ne0 == 32000 && ne1 == 128 && ne10 == 4096) {
 8746            local_work_size[0] = 2;
 8747            local_work_size[1] = 64;
 8748        }
 8749
 8750        if (N == 1) {
 8751            size_t wavesize = backend_ctx->adreno_wave_size;
 8752            local_work_size[0] = wavesize; // localsize
 8753            local_work_size[1] = 4; // reduce factor
 8754            local_work_size[2] = 1;
 8755
 8756            global_work_size[0] = (((M / 2) + wavesize - 1) / wavesize) * wavesize;
 8757            global_work_size[1] = 4; // reduce factor
 8758            global_work_size[2] = 1;
 8759        }
 8760        // <--------------------------------------------> //
 8761
 8762        // enqueue kernel with profiling
 8763        // <--------------------------------------------> //
 8764        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 8765        // <--------------------------------------------> //
 8766
 8767        // deallocate sub buffers and images
 8768        // <--------------------------------------------> //
 8769        CL_CHECK(clReleaseMemObject(A_image1d));
 8770        CL_CHECK(clReleaseMemObject(B_sub_buffer));
 8771        CL_CHECK(clReleaseMemObject(B_image1d));
 8772
 8773        if (N != 1) {
 8774            CL_CHECK(clReleaseMemObject(B_d));
 8775            CL_CHECK(clReleaseMemObject(B_d_input_image));
 8776            CL_CHECK(clReleaseMemObject(C_d));
 8777        }
 8778        // <--------------------------------------------> //
 8779
 8780        return;
 8781    }
 8782    } // if (ne01 && ne1)
 8783#endif // GGML_OPENCL_USE_ADRENO_KERNELS
 8784
 8785    // GEMM using local memory
 8786    // Current BK = 16, so ne00 % 16 == 0
 8787    if (src1t == GGML_TYPE_F32 &&
 8788        ne00 % 16 == 0 &&
 8789        ne11 > 1) {
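        // Tiled GEMM using local memory. Non-contiguous F32/F16 inputs are
        // first compacted into preallocated contiguous buffers; the quantized
        // variants instead require contiguous inputs and a minimum batch size,
        // falling back to the matrix-vector kernels below otherwise.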
 8790        switch(src0t) {
 8791            case GGML_TYPE_F32: {
 8792                kernel = backend_ctx->kernel_mul_mm_f32_f32_l4_lm;
 8793                nth0 = 128; // calculated as (BM*BN)/(TM*TN)
 8794
 8795                int batch_stride_a = ne00*ne01;
 8796                int batch_stride_b = ne10*ne11;
 8797                int batch_stride_d = ne0*ne1;
 8798
 8799                cl_mem mem_src0 = extra0->data_device;
 8800                cl_mem mem_src1 = extra1->data_device;
 8801
 8802                cl_ulong nb00_cont = nb00;
 8803                cl_ulong nb01_cont = nb01;
 8804                cl_ulong nb02_cont = nb02;
 8805                cl_ulong nb03_cont = nb03;
 8806
 8807                cl_ulong nb10_cont = nb10;
 8808                cl_ulong nb11_cont = nb11;
 8809                cl_ulong nb12_cont = nb12;
 8810                cl_ulong nb13_cont = nb13;
 8811
 8812                cl_ulong offset0_cont = offset0;
 8813                cl_ulong offset1_cont = offset1;
 8814
 8815                if (!ggml_is_contiguous(src0)) {
 8816                    backend_ctx->prealloc_src0.allocate(backend_ctx->context, ggml_nbytes(src0));
 8817                    ggml_cl_copy_to_contiguous(backend, src0, backend_ctx->prealloc_src0.buffer,
 8818                        nb00_cont, nb01_cont, nb02_cont, nb03_cont);
 8819                    mem_src0 = backend_ctx->prealloc_src0.buffer;
 8820                    offset0_cont = 0;
 8821                }
 8822
 8823                if (!ggml_is_contiguous(src1)) {
 8824                    backend_ctx->prealloc_src1.allocate(backend_ctx->context, ggml_nbytes(src1));
 8825                    ggml_cl_copy_to_contiguous(backend, src1, backend_ctx->prealloc_src1.buffer,
 8826                        nb10_cont, nb11_cont, nb12_cont, nb13_cont);
 8827                    mem_src1 = backend_ctx->prealloc_src1.buffer;
 8828                    offset1_cont = 0;
 8829                }
 8830
 8831                CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &mem_src0));
 8832                CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0_cont));
 8833                CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &mem_src1));
 8834                CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1_cont));
 8835                CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
 8836                CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
 8837                CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
 8838                CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
 8839                CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
 8840                CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne11));
 8841                CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne12));
 8842                CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne10)); // stride_a
 8843                CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne10)); // stride_b
 8844                CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne01)); // stride_d
 8845                CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &batch_stride_a));
 8846                CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &batch_stride_b));
 8847                CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &batch_stride_d));
 8848                CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &r2));
 8849                CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &r3));
 8850
 8851                // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
 8852                size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
 8853                size_t local_work_size[] = {(size_t)nth0, 1, 1};
 8854
 8855                backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 8856                return;
 8857            }
 8858            case GGML_TYPE_F16: {
 8859                kernel = backend_ctx->kernel_mul_mm_f16_f32_l4_lm;
 8860                nth0 = 128; // calculated as (BM*BN)/(TM*TN)
 8861
 8862                int batch_stride_a = ne00*ne01;
 8863                int batch_stride_b = ne10*ne11;
 8864                int batch_stride_d = ne0*ne1;
 8865
 8866                cl_mem mem_src0 = extra0->data_device;
 8867                cl_mem mem_src1 = extra1->data_device;
 8868
 8869                cl_ulong nb00_cont = nb00;
 8870                cl_ulong nb01_cont = nb01;
 8871                cl_ulong nb02_cont = nb02;
 8872                cl_ulong nb03_cont = nb03;
 8873
 8874                cl_ulong nb10_cont = nb10;
 8875                cl_ulong nb11_cont = nb11;
 8876                cl_ulong nb12_cont = nb12;
 8877                cl_ulong nb13_cont = nb13;
 8878
 8879                cl_ulong offset0_cont = offset0;
 8880                cl_ulong offset1_cont = offset1;
 8881
 8882                if (!ggml_is_contiguous(src0)) {
 8883                    backend_ctx->prealloc_src0.allocate(backend_ctx->context, ggml_nbytes(src0));
 8884                    ggml_cl_copy_to_contiguous(backend, src0, backend_ctx->prealloc_src0.buffer,
 8885                        nb00_cont, nb01_cont, nb02_cont, nb03_cont);
 8886                    mem_src0 = backend_ctx->prealloc_src0.buffer;
 8887                    offset0_cont = 0;
 8888                }
 8889
 8890                if (!ggml_is_contiguous(src1)) {
 8891                    backend_ctx->prealloc_src1.allocate(backend_ctx->context, ggml_nbytes(src1));
 8892                    ggml_cl_copy_to_contiguous(backend, src1, backend_ctx->prealloc_src1.buffer,
 8893                            nb10_cont, nb11_cont, nb12_cont, nb13_cont);
 8894                    mem_src1 = backend_ctx->prealloc_src1.buffer;
 8895                    offset1_cont = 0;
 8896                }
 8897
 8898                CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &mem_src0));
 8899                CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0_cont));
 8900                CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &mem_src1));
 8901                CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1_cont));
 8902                CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
 8903                CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
 8904                CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
 8905                CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
 8906                CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
 8907                CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne11));
 8908                CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne12));
 8909                CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne10)); // stride_a
 8910                CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne10)); // stride_b
 8911                CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne01)); // stride_d
 8912                CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &batch_stride_a));
 8913                CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &batch_stride_b));
 8914                CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &batch_stride_d));
 8915                CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &r2));
 8916                CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &r3));
 8917
 8918                // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
 8919                size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
 8920                size_t local_work_size[] = {(size_t)nth0, 1, 1};
 8921
 8922                backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 8923                return;
 8924            }
 8925            case GGML_TYPE_Q8_0: {
 8926                if (ne11 < 32) {
 8927                    break;
 8928                }
 8929                if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) {
 8930                    break;
 8931                }
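                // preconditions: contiguous operands and ne11 >= 32;
                // otherwise fall through to the matrix-vector kernels below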
 8932
 8933                kernel = backend_ctx->kernel_mul_mm_q8_0_f32_l4_lm;
 8934                nth0 = 128; // calculated as (BM*BN)/(TM*TN)
 8935
 8936                int batch_stride_a = ne00*ne01;
 8937                int batch_stride_b = ne10*ne11;
 8938                int batch_stride_d = ne0*ne1;
 8939
 8940                CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q8_0->q));
 8941                CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q8_0->d));
 8942                CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
 8943                CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
 8944                CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
 8945                CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
 8946                CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
 8947                CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
 8948                CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
 8949                CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne11));
 8950                CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne12));
 8951                CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne10)); // stride_a
 8952                CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne10)); // stride_b
 8953                CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne01)); // stride_d
 8954                CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &batch_stride_a));
 8955                CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &batch_stride_b));
 8956                CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &batch_stride_d));
 8957                CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &r2));
 8958                CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &r3));
 8959
 8960                // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
 8961                size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
 8962                size_t local_work_size[] = {(size_t)nth0, 1, 1};
 8963
 8964                backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 8965                return;
 8966            }
 8967            case GGML_TYPE_Q6_K: {
 8968                if (ne11 < 32) {
 8969                    break;
 8970                }
 8971                if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) {
 8972                    break;
 8973                }
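                // same preconditions as the Q8_0 case above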
 8974
 8975                kernel = backend_ctx->kernel_mul_mm_q6_k_f32_l4_lm;
 8976                nth0 = 128; // calculated as (BM*BN)/(TM*TN)
 8977
 8978                int batch_stride_a = ne00*ne01;
 8979                int batch_stride_b = ne10*ne11;
 8980                int batch_stride_d = ne0*ne1;
 8981
 8982                CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q6_K->ql));
 8983                CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q6_K->qh));
 8984                CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra0_q6_K->s));
 8985                CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_mem),   &extra0_q6_K->d));
 8986                CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extra1->data_device));
 8987                CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offset1));
 8988                CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_mem),   &extrad->data_device));
 8989                CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &offsetd));
 8990                CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne00));
 8991                CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne01));
 8992                CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne02));
 8993                CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne11));
 8994                CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne12));
 8995                CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne10)); // stride_a
 8996                CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &ne10)); // stride_b
 8997                CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne01)); // stride_d
 8998                CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &batch_stride_a));
 8999                CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &batch_stride_b));
 9000                CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &batch_stride_d));
 9001                CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int),      &r2));
 9002                CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int),      &r3));
 9003
 9004                // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
 9005                size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
 9006                size_t local_work_size[] = {(size_t)nth0, 1, 1};
 9007
 9008                backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 9009                return;
 9010            }
 9011            default:
 9012                break;
 9013        }
 9014    }
 9015
 9016    if (src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32 &&
 9017        src0->ne[1] > 32 &&   // M > 32
 9018        src1->ne[1] > 32 &&   // N > 32
 9019        src0->ne[0] > 32 &&   // K > 32
 9020        src0->ne[2] == 1 && src0->ne[3] == 1 &&
 9021        src1->ne[2] == 1 && src1->ne[3] == 1 &&
 9022        ggml_is_contiguous(src0) && ggml_is_contiguous(src1) &&
 9023        backend_ctx->kernel_mul_mat_f16_f32_tiled != NULL) {
 9024        ggml_cl_mul_mat_f16_f32_tiled(backend, src0, src1, dst);
 9025        return;
 9026    }
 9027
 9028    if (!ggml_is_transposed(src0) &&
 9029        !ggml_is_transposed(src1) &&
 9030        src1t == GGML_TYPE_F32 &&
 9031        ne00%32 == 0 &&
 9032        ne11 > 2) {
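        // Batched Q4_0 x F32 path using the flattened (SOA) 1d kernels;
        // requires non-transposed operands and ne00 % 32 == 0.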
 9033#ifdef GGML_OPENCL_SOA_Q
 9034        // Set up kernel.
 9035        switch(src0t) {
 9036            case GGML_TYPE_Q4_0:
 9037                // This should have been satisfied.
 9038                GGML_ASSERT(ne11 == ne1);
 9039                GGML_ASSERT(ne01 == ne0);
 9040
 9041                if (backend_ctx->gpu_family == INTEL) {
 9042                    nth0 = 16;
 9043                    nth1 = 1;
 9044
 9045                    kernel = backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat;
 9046                } else if (backend_ctx->gpu_family == ADRENO) {
 9047                    nth0 = 64;
 9048                    nth1 = 1;
 9049
 9050                    kernel = backend_ctx->kernel_mul_mat_q4_0_f32_1d_8x_flat;
 9051                } else {
 9052                    GGML_ASSERT(false && "TODO: Unknown GPU");
 9053                }
 9054
 9055                CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q4_0->q));
 9056                CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q4_0->d));
 9057                CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
 9058                CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
 9059                CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
 9060                CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
 9061                CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
 9062                CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
 9063                CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
 9064                CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne10));
 9065                CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne12));
 9066                CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne0));
 9067                CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne1));
 9068                CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &r2));
 9069                CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &r3));
 9070                break;
 9071            default:
 9072                break;
 9073        }
 9074
 9075        // Launch kernel.
 9076        if (src0t == GGML_TYPE_Q4_0) {
 9077            size_t global_work_size[] = {(size_t)(ne01 + 7)/8*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
 9078            size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
 9079
 9080            if (backend_ctx->gpu_family == INTEL) {
 9081                // Set global size for Intel. It uses 16x output values.
 9082                global_work_size[0] = (size_t)(ne01 + 15)/16*nth0;
 9083                global_work_size[1] = (size_t)ne11*nth1;
 9084                global_work_size[2] = (size_t)ne12*ne13;
 9085            }
 9086
 9087            backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 9088            return;
 9089        }
 9090#else // GGML_OPENCL_SOA_Q
 9091        // TODO: add block_q4_0 variant.
 9092#endif // GGML_OPENCL_SOA_Q
 9093    }
 9094
 9095    // use custom matrix x vector kernel
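    // (nth0 x nth1 is the workgroup shape: subgroup size by number of
    // subgroups; ndst is the number of output values produced per workgroup;
    // nrows is the number of src1 rows handled at once in the F32/F16 cases)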
 9096    switch (src0t) {
 9097        case GGML_TYPE_F32:
 9098            //GGML_ASSERT(ne02 == ne12);
 9099            GGML_ASSERT(src1t == GGML_TYPE_F32);
 9100            kernel = backend_ctx->kernel_mul_mat_f32_f32;
 9101            nrows = 4;
 9102
 9103            if (backend_ctx->gpu_family == INTEL) {
 9104                nth0 = 32;
 9105                nth1 = 1;
 9106            } else if (backend_ctx->gpu_family == ADRENO) {
 9107                nth0 = 64;
 9108                nth1 = 1;
 9109            } else {
 9110                GGML_ASSERT(false && "TODO: Unknown GPU");
 9111            }
 9112
 9113            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
 9114            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
 9115            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
 9116            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
 9117            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
 9118            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
 9119            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
 9120            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
 9121            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
 9122            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb00));
 9123            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
 9124            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
 9125            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb03));
 9126            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne10));
 9127            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &ne11));
 9128            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne12));
 9129            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb10));
 9130            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11));
 9131            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
 9132            CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
 9133            CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int),      &ne0));
 9134            CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int),      &ne1));
 9135            CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int),      &r2));
 9136            CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int),      &r3));
 9137            break;
 9138        case GGML_TYPE_F16:
 9139            //GGML_ASSERT(ne02 == ne12);
 9140            if (backend_ctx->gpu_family == INTEL) {
 9141                nth0 = 32;
 9142                nth1 = 1;
 9143            } else if (backend_ctx->gpu_family == ADRENO) {
 9144                nth0 = 64;
 9145                nth1 = 1;
 9146            } else {
 9147                GGML_ASSERT(false && "TODO: Unknown GPU");
 9148            }
 9149
 9150            if (src1t == GGML_TYPE_F32) {
 9151                if (ne11 * ne12 < 4) {
 9152                    kernel = backend_ctx->kernel_mul_mat_f16_f32_1row;
 9153                } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
 9154                    kernel = backend_ctx->kernel_mul_mat_f16_f32_l4;
 9155                    nrows = ne11;
 9156                } else {
 9157                    kernel = backend_ctx->kernel_mul_mat_f16_f32;
 9158                    nrows = 4;
 9159                }
 9160            } else {
 9161                kernel = backend_ctx->kernel_mul_mat_f16_f16;
 9162                nrows = 4;
 9163            }
 9164
 9165            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
 9166            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
 9167            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
 9168            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
 9169            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
 9170            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
 9171            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
 9172            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
 9173            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
 9174            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb00));
 9175            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
 9176            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
 9177            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb03));
 9178            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne10));
 9179            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &ne11));
 9180            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne12));
 9181            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb10));
 9182            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11));
 9183            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
 9184            CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));
 9185            CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int),      &ne0));
 9186            CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int),      &ne1));
 9187            CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int),      &r2));
 9188            CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int),      &r3));
 9189            break;
 9190        case GGML_TYPE_Q4_0:
 9191            // This should have been satisfied.
 9192            GGML_ASSERT(ne11 == ne1);
 9193            GGML_ASSERT(ne01 == ne0);
 9194
 9195#ifdef GGML_OPENCL_SOA_Q
 9196            if (backend_ctx->gpu_family == INTEL) {
 9197                nth0 = 16;
 9198                nth1 = 1;
 9199
 9200                kernel = backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat;
 9201                ndst = 8;
 9202            } else if (backend_ctx->gpu_family == ADRENO) {
 9203                nth0 = 64;
 9204                nth1 = 1;
 9205
 9206                kernel = backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat;
9207                ndst = 8;
 9208            } else {
 9209                GGML_ASSERT(false && "TODO: Unknown GPU");
 9210            }
 9211
 9212            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q4_0->q));
 9213            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q4_0->d));
 9214            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
 9215            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
 9216            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
 9217            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
 9218            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
 9219            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
 9220            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
 9221            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne10));
 9222            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne12));
 9223            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne0));
 9224            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne1));
 9225            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &r2));
 9226            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &r3));
 9227#else // GGML_OPENCL_SOA_Q
 9228            if (backend_ctx->gpu_family == INTEL) {
 9229                // Use 1D local size. Each workgroup is a SIMD group. Each SIMD
 9230                // group produces N_DST (4 for Q4_0 kernel) values in the result.
 9231                // The number of workgroups on dim 0 (the leading dimension) is
 9232                // the nearest multiple of 4 that covers ne0 (equals ne01).
 9233                nth0 = 16;
 9234                nth1 = 1;
 9235
 9236                kernel = backend_ctx->kernel_mul_mat_q4_0_f32;
 9237                ndst = 4;
 9238            } else if (backend_ctx->gpu_family == ADRENO) {
 9239                nth0 = 64;
 9240                nth1 = 1;
 9241
 9242                kernel = backend_ctx->kernel_mul_mat_q4_0_f32_v;
 9243                ndst = 4;
 9244            } else {
 9245                GGML_ASSERT(false && "TODO: Unknown GPU");
 9246            }
 9247
 9248            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
 9249            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
 9250            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
 9251            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
 9252            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
 9253            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
 9254            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
 9255            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
 9256            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
 9257            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne10));
 9258            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne12));
 9259            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne0));
 9260            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne1));
 9261            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &r2));
 9262            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &r3));
 9263#endif // GGML_OPENCL_SOA_Q
 9264            break;
 9265        case GGML_TYPE_Q4_1:
 9266        case GGML_TYPE_Q8_0: {
 9267#ifdef GGML_OPENCL_SOA_Q
 9268            kernel = backend_ctx->kernel_mul_mv_q8_0_f32_flat;
 9269
 9270            // nth0 - subgroup size
 9271            // nth1 - number of subgroups per workgroup
 9272            // ndst - number of output values per workgroup = output per subgroup * number of subgroups
 9273            if (backend_ctx->gpu_family == INTEL) {
 9274                nth0 = 16;
 9275                nth1 = 2;
 9276                ndst = nth1*4;
 9277            } else if (backend_ctx->gpu_family == ADRENO) {
 9278                nth0 = 64;
 9279                nth1 = 2;
 9280                ndst = nth1*4;
 9281            } else {
 9282                GGML_ASSERT(false && "TODO: Unknown GPU");
 9283            }
 9284
 9285            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q8_0->q));
 9286            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q8_0->d));
 9287            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
 9288            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
 9289            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
 9290            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
 9291            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
 9292            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
 9293            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb01));
 9294            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb02));
 9295            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
 9296            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne12));
 9297            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
 9298            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
 9299            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
 9300            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne0));
 9301            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &ne1));
 9302            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &r2));
 9303            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &r3));
 9304#else
 9305            kernel = backend_ctx->kernel_mul_mv_q8_0_f32;
 9306
 9307            // nth0 - subgroup size
 9308            // nth1 - number of subgroups per workgroup
 9309            // ndst - number of output values per workgroup = output per subgroup * number of subgroups
 9310            if (backend_ctx->gpu_family == INTEL) {
 9311                nth0 = 16;
 9312                nth1 = 2;
 9313                ndst = nth1*4;
 9314            } else if (backend_ctx->gpu_family == ADRENO) {
 9315                nth0 = 64;
 9316                nth1 = 2;
 9317                ndst = nth1*4;
 9318            } else {
 9319                GGML_ASSERT(false && "TODO: Unknown GPU");
 9320            }
 9321
 9322            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
 9323            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
 9324            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
 9325            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
 9326            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
 9327            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
 9328            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
 9329            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
 9330            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb01));
 9331            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb02));
 9332            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
 9333            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne12));
 9334            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
 9335            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
 9336            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
 9337            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne0));
 9338            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &ne1));
 9339            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &r2));
 9340            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &r3));
 9341#endif // GGML_OPENCL_SOA_Q
 9342            break;
 9343        }
 9344        case GGML_TYPE_Q2_K:
 9345        case GGML_TYPE_Q3_K:
 9346        case GGML_TYPE_Q4_K: {
 9347            kernel = backend_ctx->kernel_mul_mv_q4_K_f32;
 9348
 9349            if (backend_ctx->gpu_family == INTEL) {
 9350                nth0 = 16;
 9351                nth1 = 1;
 9352                ndst = 4;
 9353            } else if (backend_ctx->gpu_family == ADRENO) {
 9354                nth0 = 64;
 9355                nth1 = 1;
 9356                ndst = 4;
 9357            } else {
 9358                GGML_ASSERT(false && "TODO: Unknown GPU");
 9359            }
 9360
9361            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),     &extra0->data_device));
9362            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong),   &offset0));
9363            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),     &extra1->data_device));
9364            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong),   &offset1));
9365            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),     &extrad->data_device));
9366            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong),   &offsetd));
 9367            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),        &ne00));
 9368            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),        &ne01));
 9369            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong),   &nb01));
 9370            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong),   &nb02));
 9371            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),   &nb03));
 9372            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),        &ne12));
 9373            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong),   &nb11));
 9374            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong),   &nb12));
 9375            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong),   &nb13));
 9376            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),        &ne0));
 9377            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),        &ne1));
 9378            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),        &r2));
 9379            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),        &r3));
 9380            break;
 9381        }
 9382        case GGML_TYPE_Q5_K:
 9383        case GGML_TYPE_Q6_K:
 9384#ifdef GGML_OPENCL_SOA_Q
 9385            kernel = backend_ctx->kernel_mul_mv_q6_K_f32_flat;
 9386
 9387            if (backend_ctx->gpu_family == INTEL) {
 9388                nth0 = 16;
 9389                nth1 = 2;
 9390                ndst = 4;
 9391            } else if (backend_ctx->gpu_family == ADRENO) {
 9392                nth0 = 64;
 9393                nth1 = 2;
 9394                ndst = 4;
 9395            } else {
 9396                GGML_ASSERT(false && "TODO: Unknown GPU");
 9397            }
 9398
 9399            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q6_K->ql));
 9400            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q6_K->qh));
 9401            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra0_q6_K->s));
 9402            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_mem),   &extra0_q6_K->d));
 9403            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extra1->data_device));
 9404            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offset1));
 9405            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_mem),   &extrad->data_device));
 9406            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &offsetd));
 9407            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne00));
 9408            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne01));
 9409            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne02));
 9410            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne10));
 9411            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne12));
 9412            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne0));
 9413            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &ne1));
 9414            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &r2));
 9415            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &r3));
 9416#else
 9417            kernel = backend_ctx->kernel_mul_mv_q6_K_f32;
 9418
 9419            if (backend_ctx->gpu_family == INTEL) {
 9420                nth0 = 16;
 9421                nth1 = 2;
 9422                ndst = 1;
 9423            } else if (backend_ctx->gpu_family == ADRENO) {
 9424                nth0 = 64;
 9425                nth1 = 2;
 9426                ndst = 1;
 9427            } else {
 9428                GGML_ASSERT(false && "TODO: Unknown GPU");
 9429            }
 9430
 9431            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
 9432            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
 9433            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
 9434            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
 9435            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
 9436            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
 9437            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
 9438            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne10));
            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne12));
            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne0));
            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne1));
            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &r2));
            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &r3));
#endif // GGML_OPENCL_SOA_Q
            break;
        case GGML_TYPE_MXFP4: {
#ifdef GGML_OPENCL_SOA_Q
            kernel = backend_ctx->kernel_mul_mv_mxfp4_f32_flat;

            cl_mem q;
            if (backend_ctx->gpu_family == INTEL) {
                nth0 = 16;
                nth1 = 2;
                ndst = nth1*2;

                q = extra0_mxfp4->q;
            } else if (backend_ctx->gpu_family == ADRENO) {
                nth0 = 64;
                nth1 = 2;
                ndst = nth1*2;

                q = extra0_mxfp4->q_img;
            } else {
                GGML_ASSERT(false && "TODO: Unknown GPU");
            }

            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &q));
            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_mxfp4->e));
            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &nb01));
            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb02));
            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb03));
            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne12));
            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb11));
            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb12));
            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb13));
            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &ne0));
            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne1));
            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &r2));
            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &r3));
#else
            kernel = backend_ctx->kernel_mul_mv_mxfp4_f32;

            if (backend_ctx->gpu_family == INTEL) {
                nth0 = 16;
                nth1 = 2;
                ndst = nth1*2;
            } else if (backend_ctx->gpu_family == ADRENO) {
                nth0 = 64;
                nth1 = 2;
                ndst = nth1*2;
            } else {
                GGML_ASSERT(false && "TODO: Unknown GPU");
            }

            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne00));
            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &nb01));
            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb02));
            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb03));
            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne12));
            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb11));
            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb12));
            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb13));
            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &ne0));
            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne1));
            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &r2));
            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &r3));
            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(float)*nth0, nullptr));
#endif
            break;
        }
        default:
            GGML_ASSERT(false && "not implemented");
    }

    if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_MXFP4 ||
        src0t == GGML_TYPE_Q4_1 ||
        src0t == GGML_TYPE_Q8_0 ||
        src0t == GGML_TYPE_Q2_K) {
        // Each SIMD group produces N_DST values in the result. Assuming each
        // workgroup has N_SIMDGROUP SIMD groups, each workgroup produces
        // N_DST*N_SIMDGROUP values in the result. Hence, the grid size
        // (number of workgroups) is rounded up to the nearest multiple of
        // N_DST*N_SIMDGROUP covering the size of the dimension. Below, ndst
        // plays the role of N_DST*N_SIMDGROUP (see the kernel for Q4_0 matmul).
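        // Illustrative sizing (hypothetical numbers): with ne01 = 100 rows,
        // ndst = 8 and nth0 = 64, dim 0 launches (100 + 8 - 1)/8 = 13 workgroups,
        // i.e. 13*64 = 832 work-items covering 13*8 = 104 >= 100 rows; rows past
        // ne01 are assumed to be skipped inside the kernel.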
        size_t global_work_size[] = {(size_t)(ne01 + ndst-1)/ndst*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
        size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};

        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
    } else if (src0t == GGML_TYPE_Q4_K) {
        size_t global_work_size[] = {(size_t)(ne01+ndst*nth1-1)/(ndst*nth1)*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
        size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};

        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
    } else if (src0t == GGML_TYPE_Q3_K) {
        GGML_ASSERT(false && "not implemented");
    } else if (src0t == GGML_TYPE_Q5_K) {
        GGML_ASSERT(false && "not implemented");
    } else if (src0t == GGML_TYPE_Q6_K) {
        size_t global_work_size[] = {(size_t)(ne01+ndst*nth1-1)/(ndst*nth1)*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
        size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};

        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
    } else {
        int64_t ny = (ne11 + nrows - 1)/nrows;

        size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
        size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};

        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
    }
}

static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(src1);
    GGML_ASSERT(src1->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    const ggml_tensor * src2 = dst->src[2];
    GGML_ASSERT(src2);
    GGML_ASSERT(src2->extra);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
    ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offset1 = extra1->offset + src1->view_offs;
    cl_ulong offset2 = extra2->offset + src2->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    GGML_UNUSED(offset0);

#ifdef GGML_OPENCL_SOA_Q
    ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
    ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra;
    ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
#endif

    const int ne00 = src0->ne[0];
    const int ne01 = src0->ne[1];
    const int ne02 = src0->ne[2];
    const int ne03 = src0->ne[3];

    const cl_ulong nb00 = src0->nb[0];
    const cl_ulong nb01 = src0->nb[1];
    const cl_ulong nb02 = src0->nb[2];
    const cl_ulong nb03 = src0->nb[3];

    const int ne10 = src1->ne[0];
    const int ne11 = src1->ne[1];
    const int ne12 = src1->ne[2];
    const int ne13 = src1->ne[3];

    const cl_ulong nb11 = src1->nb[1];
    const cl_ulong nb12 = src1->nb[2];
    const cl_ulong nb13 = src1->nb[3];

    const int ne20 = src2->ne[0];
    const int ne21 = src2->ne[1];

    const cl_ulong nb21 = src2->nb[1];
    const cl_ulong nb20 = src2->nb[0];

    UNUSED(nb20);

    const int ne0 = dst->ne[0];
    const int ne1 = dst->ne[1];

    const int r2 = ne12/ne02;
    const int r3 = ne13/ne03;
    const int dst_rows = ne20*ne21; // ne20 = n_used_experts, ne21 = n_rows
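    // e.g. (hypothetical) 4 experts used per row and 32 rows gives dst_rows = 128,
    // which becomes the z-dimension of the launch grid below, one slice per
    // (expert, row) pair.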

    GGML_ASSERT(ne00 == ne10);

    int sgs   = 32; // subgroup size
    int nsg   = 1;  // number of subgroups
    int nrows = 1;  // number of rows in src1
    int ndst  = 4;  // number of values produced by each subgroup

    cl_kernel kernel;

    // subgroup mat vec
    switch (src0->type) {
        case GGML_TYPE_Q4_0: {
            kernel = backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat;

            if (backend_ctx->gpu_family == INTEL) {
                sgs  = 16;
                nsg  = 1;
                ndst = 8;
            } else if (backend_ctx->gpu_family == ADRENO) {
                sgs  = 64;
                nsg  = 1;
                ndst = 8;
            } else {
                GGML_ASSERT(false && "TODO: Unknown GPU");
            }

            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q4_0->q));
            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q4_0->d));
            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extra2->data_device));
            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offset2));
            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_mem),   &extrad->data_device));
            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &offsetd));
            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne00));
            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne01));
            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne02));
            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb00));
            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne10));
            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &ne11));
            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne12));
            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb11));
            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb12));
            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &ne20));
            CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int),      &ne21));
            CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb21));
            CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int),      &ne0));
            CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int),      &ne1));
            CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int),      &r2));
            CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int),      &r3));

            break;
        }
        case GGML_TYPE_Q8_0: {
#ifdef GGML_OPENCL_SOA_Q
            kernel = backend_ctx->kernel_mul_mv_id_q8_0_f32_flat;

            if (backend_ctx->gpu_family == INTEL) {
                sgs  = 16;
                nsg  = 2;
                ndst = 4;
            } else if (backend_ctx->gpu_family == ADRENO) {
                sgs  = 64;
                nsg  = 2;
                ndst = 4;
            } else {
                GGML_ASSERT(false && "TODO: Unknown GPU");
            }

            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q8_0->q));
            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q8_0->d));
            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extra2->data_device));
            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offset2));
            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_mem),   &extrad->data_device));
            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &offsetd));
            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne00));
            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne01));
            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne11));
            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne12));
            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11));
            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12));
            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &ne20));
            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &ne21));
            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb21));
            CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int),      &ne0));
            CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int),      &ne1));
#else
            kernel = backend_ctx->kernel_mul_mv_id_q8_0_f32;

            if (backend_ctx->gpu_family == INTEL) {
                sgs  = 16;
                nsg  = 2;
                ndst = 4;
            } else if (backend_ctx->gpu_family == ADRENO) {
                sgs  = 64;
                nsg  = 2;
                ndst = 4;
            } else {
                GGML_ASSERT(false && "TODO: Unknown GPU");
            }

            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extra2->data_device));
            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offset2));
            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_mem),   &extrad->data_device));
            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &offsetd));
            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne00));
            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne01));
            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne11));
            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne12));
            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11));
            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12));
            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &ne20));
            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &ne21));
            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb21));
            CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int),      &ne0));
            CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int),      &ne1));
#endif // GGML_OPENCL_SOA_Q
            break;
        }
        case GGML_TYPE_MXFP4: {
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
            if (use_adreno_moe_kernels(backend_ctx, src0)) {
                cl_int status;

                size_t local_size[3] = {64, 2, 1};
                size_t global_size[3] = {64, 2, 1};

                cl_mem src1_sub_buffer, buf_src1_image, buf_src2;

                int tile_size = 320;
                if (ne12 == 1) { // for gemv
                    kernel = backend_ctx->kernel_gemv_moe_mxfp4_f32;

                    // create a sub_buffer for src2
                    cl_buffer_region region;
                    region.origin = offset2;
                    region.size = ne20 * ne21 * sizeof(int);
                    buf_src2 = clCreateSubBuffer(extra2->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
                    CL_CHECK(status);

                    // set thread grid
                    global_size[0] = static_cast<size_t>(ne01);
                    global_size[1] = 4;
                    global_size[2] = static_cast<size_t>(ne20);
                    local_size[1] = 4;
                } else { // for gemm
                    kernel = backend_ctx->kernel_gemm_moe_mxfp4_f32;

                    // preprocess router table
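                    // Each (expert, row) match emits one 4-short tuple per tile:
                    // {expert id, src1 row index, dst row index, tile index}. E.g.
                    // (hypothetical) with 2 tiles per expert, a row routed to
                    // expert 5 produces two tuples differing only in the tile index.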
                    int num_tiles_per_expert = (ne01 + tile_size - 1) / tile_size;
                    void * host_src2_reorder = malloc(ne20 * ne21 * 4 * num_tiles_per_expert * sizeof(short));
                    void * host_src2 = malloc(ne21 * nb21);
                    CL_CHECK(clEnqueueReadBuffer(backend_ctx->queue, extra2->data_device, CL_TRUE, offset2, ne21 * nb21, host_src2, 0, NULL, NULL));
                    int total_experts = nb21 / nb20;
                    int out_idx = 0;
                    for (int i_expert = 0; i_expert < ne02; i_expert++) {
                        for (int i_tile = 0; i_tile < num_tiles_per_expert; i_tile++) {
                            for (int j = 0; j < ne21; j++) {
                                for (int i = 0; i < ne20; i++) {
                                    int expert = ((int *)host_src2)[j * total_experts + i];
                                    if (i_expert == expert) {
                                        ((short *)host_src2_reorder)[out_idx] = static_cast<short>(expert);
                                        ((short *)host_src2_reorder)[out_idx + 1] = static_cast<short>(j * ne11 + (i % ne11));
                                        ((short *)host_src2_reorder)[out_idx + 2] = static_cast<short>(j * ne20 + i);
                                        ((short *)host_src2_reorder)[out_idx + 3] = static_cast<short>(i_tile);
                                        out_idx += 4;
                                    }
                                }
                            }
                        }
                    }
                    buf_src2 = clCreateBuffer(backend_ctx->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, ne20 * ne21 * 4 * num_tiles_per_expert * sizeof(short), host_src2_reorder, &status);
                    CL_CHECK(status);
                    // CL_MEM_COPY_HOST_PTR copies the data at buffer creation, so the
                    // host staging allocations can be released right away.
                    free(host_src2_reorder);
                    free(host_src2);

                    // set thread grid
                    global_size[0] = static_cast<size_t>(tile_size);
                    global_size[2] = static_cast<size_t>(ne20 * ne21 * num_tiles_per_expert);
                }

                // create a sub_buffer for src1
                cl_buffer_region region;
                region.origin = offset1;
                region.size = ne10 * ne11 * ne12 * sizeof(float);
                src1_sub_buffer = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
                CL_CHECK(status);

                // create image for src1
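                // A 1D image buffer with CL_RGBA/CL_FLOAT packs 4 floats per texel,
                // hence a width of ne10*ne11*ne12/4 texels over the float src1 data.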
                cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT};
                cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne10 * ne11 * ne12 / 4), 0,0,0,0,0,0,0, {src1_sub_buffer}};
                buf_src1_image = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status);
                CL_CHECK(status);

                // Set kernel args
                int arg_idx = 0;
                CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_mxfp4->q));
                CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extra0_mxfp4->e));
                CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &buf_src1_image));
                CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &buf_src2));
                CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem),    &extrad->data_device));
                CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_ulong),  &offsetd));
                CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int),       &ne00));
                CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int),       &ne01));
                if (ne12 == 1) {
                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int),       &ne11));
                } else {
                    CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int),       &tile_size));
                }

                // launch kernel
                backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst);

                // deallocate sub buffers and images
                CL_CHECK(clReleaseMemObject(src1_sub_buffer));
                CL_CHECK(clReleaseMemObject(buf_src1_image));
                CL_CHECK(clReleaseMemObject(buf_src2));
                return;
            } // else fall back to the generic kernel
#endif // GGML_OPENCL_USE_ADRENO_KERNELS

#ifdef GGML_OPENCL_SOA_Q
            kernel = backend_ctx->kernel_mul_mv_id_mxfp4_f32_flat;

            cl_mem q;
            if (backend_ctx->gpu_family == INTEL) {
                sgs  = 16;
                nsg  = 2;
                ndst = 2;

                q = extra0_mxfp4->q;
            } else if (backend_ctx->gpu_family == ADRENO) {
                sgs  = 64;
                nsg  = 1;
                ndst = 4;

                q = extra0_mxfp4->q_img;
            } else {
                GGML_ASSERT(false && "TODO: Unknown GPU");
            }

            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &q));
            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_mxfp4->e));
            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extra2->data_device));
            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offset2));
            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_mem),   &extrad->data_device));
            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &offsetd));
            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne00));
            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb01));
            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne11));
            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne12));
            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11));
            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12));
            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb13));
            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &ne20));
            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &ne21));
            CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb21));
            CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int),      &ne0));
            CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int),      &ne1));
            CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int),      &r2));
            CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int),      &r3));
#else // GGML_OPENCL_SOA_Q
            kernel = backend_ctx->kernel_mul_mv_id_mxfp4_f32;

            if (backend_ctx->gpu_family == INTEL) {
                sgs  = 16;
                nsg  = 2;
                ndst = 2;
            } else if (backend_ctx->gpu_family == ADRENO) {
                sgs  = 64;
                nsg  = 2;
                ndst = 2;
            } else {
                GGML_ASSERT(false && "TODO: Unknown GPU");
            }

            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extra2->data_device));
            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offset2));
            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_mem),   &extrad->data_device));
            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &offsetd));
            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne00));
            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb01));
            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne11));
            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne12));
            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11));
            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12));
            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb13));
            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &ne20));
            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &ne21));
            CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb21));
            CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int),      &ne0));
            CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int),      &ne1));
            CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int),      &r2));
            CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int),      &r3));
            CL_CHECK(clSetKernelArg(kernel, 24, sizeof(float)*sgs, nullptr));
#endif // GGML_OPENCL_SOA_Q
            break;
        }
        default:
            GGML_ASSERT(false && "not implemented");
    }

    int _ne1 = 1;
    int ne123 = dst_rows;

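    // Grid sizing: dim 0 covers ne01 rows rounded up to ndst*nsg rows per
    // workgroup (sgs work-items per subgroup, nsg subgroups per workgroup).
    // E.g. (hypothetical) ne01 = 2880 with ndst = 2, nsg = 2, sgs = 16 launches
    // (2880 + 3)/4 = 720 workgroups of 16x2 work-items, each producing 4 rows.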
    size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123};
    size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}

static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);
    GGML_UNUSED(src1);

    GGML_ASSERT(ggml_is_contiguous(src0));

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    float scale;
    float bias;
    memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(float));
    memcpy(&bias,  ((int32_t *) dst->op_params) + 1, sizeof(float));

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    cl_kernel kernel;

    int n = ggml_nelements(dst);

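    // Prefer the float4-vectorized kernel when the element count is divisible
    // by 4; e.g. n = 1000 elements runs as 250 float4 work-items.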
    if (n % 4 == 0) {
        kernel = backend_ctx->kernel_scale_f32_4;
        n /= 4;
    } else {
        kernel = backend_ctx->kernel_scale_f32;
    }

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float),    &scale));
    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(float),    &bias));

    size_t global_work_size[] = {(size_t)n, 1, 1};
    size_t local_work_size[] = {64, 1, 1};

    size_t * local_work_size_ptr = local_work_size;
    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
        local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
    }

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
}

static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(src1);
    GGML_ASSERT(src1->extra);

    // GGML_OP_CPY happens between src0 and src1.
    // GGML_OP_DUP and GGML_OP_CONT happen between src0 and dst.
    UNUSED(dst);

    const int ne00 = src0 ? src0->ne[0] : 0;
    const int ne01 = src0 ? src0->ne[1] : 0;
    const int ne02 = src0 ? src0->ne[2] : 0;
    const int ne03 = src0 ? src0->ne[3] : 0;

    const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
    const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
    const cl_ulong nb03 = src0 ? src0->nb[3] : 0;

    const int ne10 = src1 ? src1->ne[0] : 0;
    const int ne11 = src1 ? src1->ne[1] : 0;
    const int ne12 = src1 ? src1->ne[2] : 0;
    const int ne13 = src1 ? src1->ne[3] : 0;

    const cl_ulong nb10 = src1 ? src1->nb[0] : 0;
    const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
    const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
    const cl_ulong nb13 = src1 ? src1->nb[3] : 0;

    const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
    const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offset1 = extra1->offset + src1->view_offs;

    cl_kernel kernel;

    switch (src0t) {
        case GGML_TYPE_F32:
            switch (src1t) {
                case GGML_TYPE_F16:
                    kernel = backend_ctx->kernel_cpy_f32_f16;
                    break;
                case GGML_TYPE_F32:
                    kernel = backend_ctx->kernel_cpy_f32_f32;
                    break;
                default:
                    GGML_ASSERT(false && "not implemented");
            }
            break;
        case GGML_TYPE_F16:
            switch (src1t) {
                case GGML_TYPE_F16:
                    kernel = backend_ctx->kernel_cpy_f16_f16;
                    break;
                case GGML_TYPE_F32:
                    kernel = backend_ctx->kernel_cpy_f16_f32;
                    break;
                default:
                    GGML_ASSERT(false && "not implemented");
            }
            break;
        default:
            GGML_ASSERT(false && "not implemented");
    }

    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(int),      &ne00));
    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(int),      &ne01));
    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne02));
    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne03));
    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb00));
    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb01));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne10));
    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne11));
    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &ne12));
    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &ne13));
    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb10));
    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11));
    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12));
    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13));

    const int nth = MIN(64, ne00);

    size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
    size_t local_work_size[] = {(size_t)nth, 1, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
}

static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cl_cpy(backend, src0, dst, nullptr);
    UNUSED(src1);
}

static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    UNUSED(src1);

    int n_past = ((int32_t *)(dst->op_params))[0];

    const int  ne00 = src0 ? src0->ne[0] : 0;
    const int  ne01 = src0 ? src0->ne[1] : 0;
    const int  ne02 = src0 ? src0->ne[2] : 0;

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    cl_kernel kernel;

    if (ne00%8 == 0) {
        kernel = backend_ctx->kernel_diag_mask_inf_8;

        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),      &ne00));
        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int),      &ne01));
        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &n_past));

        size_t global_work_size[] = {(size_t)ne00*ne01*ne02/8, 1, 1};
        size_t local_work_size[] = {64, 1, 1};

        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
    } else {
        kernel = backend_ctx->kernel_diag_mask_inf;

        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),      &ne00));
        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int),      &ne01));
        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &n_past));

        size_t global_work_size[] = {(size_t)ne00, (size_t)ne01, (size_t)ne02};
        size_t local_work_size[] = {64, 1, 1};

        size_t * local_work_size_ptr = local_work_size;
        if (ne00 % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
            local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes.
        }

        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
    }
}

static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    // Softmax can now fuse the KQ mask and KQ scale, which used to be two
    // additional ops before softmax. It also fuses alibi when `max_bias > 0`;
    // llama does not use alibi, but some other models do.
    // KQ_mask
    if (src1) {
        GGML_ASSERT(src1->extra);
    }

    const ggml_tensor * src2 = dst->src[2];
    if (src2) {
        GGML_ASSERT(src2->extra);
    }

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    ggml_tensor_extra_cl * extra1 = src1 ? (ggml_tensor_extra_cl *)src1->extra : nullptr;
    ggml_tensor_extra_cl * extra2 = src2 ? (ggml_tensor_extra_cl *)src2->extra : nullptr;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    cl_ulong offset1 = extra1 ? extra1->offset + src1->view_offs : offset0;
    cl_ulong offset2 = extra2 ? extra2->offset + src2->view_offs : offset0;

    const int ne00 = src0->ne[0];
    const int ne01 = src0->ne[1];
    const int ne02 = src0->ne[2];
    const int ne03 = src0->ne[3];

    const cl_ulong nb01 = src0->nb[1];
    const cl_ulong nb02 = src0->nb[2];
    const cl_ulong nb03 = src0->nb[3];

    const int ne12 = src1 ? src1->ne[2] : 0;
    const int ne13 = src1 ? src1->ne[3] : 0;

    const cl_ulong nb11 = src1 ? src1->nb[1] : 0;
    const cl_ulong nb12 = src1 ? src1->nb[2] : 0;
    const cl_ulong nb13 = src1 ? src1->nb[3] : 0;

    const cl_ulong nb1 = dst->nb[1];
    const cl_ulong nb2 = dst->nb[2];
    const cl_ulong nb3 = dst->nb[3];

    float scale, max_bias;
    memcpy(&scale,    dst->op_params + 0, sizeof(float));
    memcpy(&max_bias, dst->op_params + 1, sizeof(float));

    const int n_head      = src0->ne[2];
    const int n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));

    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
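    // Worked example (hypothetical values): n_head = 8 gives n_head_log2 = 8;
    // with max_bias = 8.0f, m0 = 2^(-8/8) = 0.5f and m1 = 2^(-4/8) ~ 0.7071f.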

    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);

    // Local size must be the wave size. Each workgroup is a wave, working on a row,
    // where a row corresponds to the leading dimension.
    int nth = MIN(32, ne00);

    if (backend_ctx->gpu_family == INTEL) {
        // This is the same as the initial value.
        nth = MIN(32, ne00);
    }
    else if (backend_ctx->gpu_family == ADRENO) {
        nth = 64;
    } else {
        GGML_ASSERT(false && "TODO: Unknown GPU");
    }

    cl_kernel kernel;

    if (ne00%4 == 0) {
        if (use_f16) {
            kernel = backend_ctx->kernel_soft_max_4_f16;
        } else {
            kernel = backend_ctx->kernel_soft_max_4;
        }
    } else {
        if (use_f16) {
            kernel = backend_ctx->kernel_soft_max_f16;
        } else {
            kernel = backend_ctx->kernel_soft_max;
        }
    }

    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   extra1 ? &extra1->data_device : &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   extra2 ? &extra2->data_device : &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offset2));
    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne00));
    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb01));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne12));
    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne13));
    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11));
    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12));
    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb13));
    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb1));
    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb2));
    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb3));
    CL_CHECK(clSetKernelArg(kernel, 20, sizeof(float),    &scale));
    CL_CHECK(clSetKernelArg(kernel, 21, sizeof(float),    &max_bias));
    CL_CHECK(clSetKernelArg(kernel, 22, sizeof(float),    &m0));
    CL_CHECK(clSetKernelArg(kernel, 23, sizeof(float),    &m1));
    CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int),      &n_head_log2));

    size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
    size_t local_work_size[] = {(size_t)nth, 1, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}

static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(src1);
    GGML_ASSERT(src1->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offset1 = extra1->offset + src1->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    ggml_tensor * src2 = dst->src[2];
    ggml_tensor_extra_cl * extra2 = src2 ? (ggml_tensor_extra_cl *)src2->extra : nullptr;

    cl_ulong offset2 = extra2 ? extra2->offset + src2->view_offs : offset0;

    const int  ne00 = src0 ? src0->ne[0] : 0;
    const int  ne01 = src0 ? src0->ne[1] : 0;
    const int  ne02 = src0 ? src0->ne[2] : 0;
    const int  ne03 = src0 ? src0->ne[3] : 0;

    const cl_ulong  nb00 = src0 ? src0->nb[0] : 0;
    const cl_ulong  nb01 = src0 ? src0->nb[1] : 0;
    const cl_ulong  nb02 = src0 ? src0->nb[2] : 0;
    const cl_ulong  nb03 = src0 ? src0->nb[3] : 0;

    const int ne10 = src1 ? src1->ne[0] : 0;
    const int ne11 = src1 ? src1->ne[1] : 0; UNUSED(ne11);
    const int ne12 = src1 ? src1->ne[2] : 0; UNUSED(ne12);
    const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);

    const int  ne0 = dst ? dst->ne[0] : 0;
    const int  ne1 = dst ? dst->ne[1] : 0;
    const int  ne2 = dst ? dst->ne[2] : 0;
    const int  ne3 = dst ? dst->ne[3] : 0;

    const cl_ulong  nb0 = dst ? dst->nb[0] : 0;
    const cl_ulong  nb1 = dst ? dst->nb[1] : 0;
    const cl_ulong  nb2 = dst ? dst->nb[2] : 0;
    const cl_ulong  nb3 = dst ? dst->nb[3] : 0;

    GGML_ASSERT(ne10 % ne02 == 0);
    GGML_ASSERT(ne10 >= ne02);

    int nth = MIN(64, ne00);

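    // op_params layout (int32 slots): [0] n_past, [1] n_dims, [2] mode,
    // [4] n_ctx_orig; slots [5..10] hold the six float parameters read below
    // and [11..14] hold the four mrope/vision section sizes.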
    const int n_past     = ((int *) dst->op_params)[0];
    const int n_dims     = ((int *) dst->op_params)[1];
    const int mode       = ((int *) dst->op_params)[2];
    const int n_ctx_orig = ((int32_t *) dst->op_params)[4];

    float freq_base;
    float freq_scale;
    float ext_factor;
    float attn_factor;
    float beta_fast;
    float beta_slow;
    int32_t sections[4];

    memcpy(&freq_base,   (int32_t *) dst->op_params + 5, sizeof(float));
    memcpy(&freq_scale,  (int32_t *) dst->op_params + 6, sizeof(float));
    memcpy(&ext_factor,  (int32_t *) dst->op_params + 7, sizeof(float));
    memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
    memcpy(&beta_fast,   (int32_t *) dst->op_params + 9, sizeof(float));
    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
    memcpy(&sections,    (int32_t *) dst->op_params + 11, sizeof(int32_t)*4);

    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
    const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
    const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
    const int  is_imrope = mode == GGML_ROPE_TYPE_IMROPE;

    if (is_mrope) {
        GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
    }

    if (is_vision) {
        GGML_ASSERT(n_dims == ne00/2);
    }

    cl_kernel kernel;

    if (is_neox) {
        switch (src0->type) {
            case GGML_TYPE_F32:
                kernel = backend_ctx->kernel_rope_neox_f32;
                break;
            case GGML_TYPE_F16:
                kernel = backend_ctx->kernel_rope_neox_f16;
                break;
            default:
                GGML_ASSERT(false);
        }
    } else if (is_mrope && !is_vision) {
        switch (src0->type) {
            case GGML_TYPE_F32:
                kernel = backend_ctx->kernel_rope_multi_f32;
                break;
            case GGML_TYPE_F16:
                kernel = backend_ctx->kernel_rope_multi_f16;
                break;
            default:
                GGML_ASSERT(false);
        }
    } else if (is_vision) {
        switch (src0->type) {
            case GGML_TYPE_F32:
                kernel = backend_ctx->kernel_rope_vision_f32;
                break;
            case GGML_TYPE_F16:
                kernel = backend_ctx->kernel_rope_vision_f16;
                break;
            default:
                GGML_ASSERT(false);
        }
    } else {
        switch (src0->type) {
            case GGML_TYPE_F32:
                kernel = backend_ctx->kernel_rope_norm_f32;
                break;
            case GGML_TYPE_F16:
                kernel = backend_ctx->kernel_rope_norm_f16;
                break;
            default:
                GGML_ASSERT(false);
        }
    }

    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   extra2 ? &extra2->data_device : &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offset2));
    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne00));
    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne01));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne02));
    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne03));
    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb00));
    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb01));
    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb02));
    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb03));
    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &ne0));
    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &ne1));
    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &ne2));
    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int),      &ne3));
    CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb0));
    CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb1));
    CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb2));
    CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_ulong), &nb3));
    CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int),      &n_past));
    CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int),      &n_dims));
    CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int),      &n_ctx_orig));
    CL_CHECK(clSetKernelArg(kernel, 27, sizeof(float),    &freq_base));
    CL_CHECK(clSetKernelArg(kernel, 28, sizeof(float),    &freq_scale));
    CL_CHECK(clSetKernelArg(kernel, 29, sizeof(float),    &ext_factor));
    CL_CHECK(clSetKernelArg(kernel, 30, sizeof(float),    &attn_factor));
    CL_CHECK(clSetKernelArg(kernel, 31, sizeof(float),    &beta_fast));
    CL_CHECK(clSetKernelArg(kernel, 32, sizeof(float),    &beta_slow));
    // both mrope and vision kernels have sections
    if (is_mrope || is_vision) {
        CL_CHECK(clSetKernelArg(kernel, 33, sizeof(int32_t)*4, &sections));
    }
    // only mrope has is_imrope
    if (is_mrope && !is_vision) {
        CL_CHECK(clSetKernelArg(kernel, 34, sizeof(int), &is_imrope));
    }

    size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
    size_t local_work_size[] = {(size_t)nth, 1, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}

static void ggml_cl_solve_tri(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(src1);
    GGML_ASSERT(src1->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offset1 = extra1->offset + src1->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    cl_kernel kernel = backend_ctx->kernel_solve_tri_f32;
    GGML_ASSERT(kernel != nullptr);

    const int n = src0->ne[0];
    const int k = src1->ne[0];

    const cl_ulong nb00 = src0->nb[0];
    const cl_ulong nb01 = src0->nb[1];
    const cl_ulong nb02 = src0->nb[2];
    const cl_ulong nb03 = src0->nb[3];

    const cl_ulong nb10 = src1->nb[0];
    const cl_ulong nb11 = src1->nb[1];
    const cl_ulong nb12 = src1->nb[2];
    const cl_ulong nb13 = src1->nb[3];

    const cl_ulong nb0 = dst->nb[0];
    const cl_ulong nb1 = dst->nb[1];
    const cl_ulong nb2 = dst->nb[2];
    const cl_ulong nb3 = dst->nb[3];

    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &n));
    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &k));
    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb00));
    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb01));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb10));
    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb11));
    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb12));
    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb13));
    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb0));
    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb1));
    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb2));
    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb3));

    size_t global_work_size[3] = {(size_t)k, (size_t)dst->ne[2], (size_t)dst->ne[3]};
    size_t local_work_size[] = {16, 4, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}

static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src1);
    GGML_ASSERT(src1->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    // src0 - filter, src1 - input
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset1 = extra1->offset + src1->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];

    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;

    const cl_long IC = src1->ne[is_2D ? 2 : 1];
    const cl_long IH = is_2D ? src1->ne[1] : 1;
    const cl_long IW =         src1->ne[0];

    const cl_long KH = is_2D ? src0->ne[1] : 1;
    const cl_long KW =         src0->ne[0];

    const cl_long OH = is_2D ? dst->ne[2] : 1;
    const cl_long OW =         dst->ne[1];

    // nb is byte offset, src is type float32
    const cl_ulong delta_offset = src1->nb[is_2D ? 2 : 1]/4;
    const cl_long  batch        = src1->ne[is_2D ? 3 : 2];
    const cl_ulong batch_offset = src1->nb[is_2D ? 3 : 2]/4;
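    // E.g. (hypothetical) a contiguous F32 input with ne = {IW, IH, IC, N} has
    // nb[2] = IW*IH*4 bytes, so delta_offset = IW*IH elements per channel and
    // batch_offset = IW*IH*IC elements per image.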
10600
10601    const cl_long pelements = OW*KW*KH;
10602    const cl_long CHW       = IC*KH*KW;
10603
    cl_kernel kernel;

    if (dst->type == GGML_TYPE_F16) {
        kernel = backend_ctx->kernel_im2col_f16;
    } else {
        kernel = backend_ctx->kernel_im2col_f32;
    }

    CL_CHECK(clSetKernelArg(kernel,   0, sizeof(cl_mem),   &extra1->data_device));
    CL_CHECK(clSetKernelArg(kernel,   1, sizeof(cl_ulong), &offset1));
    CL_CHECK(clSetKernelArg(kernel,   2, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel,   3, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel,   4, sizeof(cl_ulong), &batch_offset));
    CL_CHECK(clSetKernelArg(kernel,   5, sizeof(cl_ulong), &delta_offset));
    CL_CHECK(clSetKernelArg(kernel,   6, sizeof(cl_long),  &IW));
    CL_CHECK(clSetKernelArg(kernel,   7, sizeof(cl_long),  &IH));
    CL_CHECK(clSetKernelArg(kernel,   8, sizeof(cl_long),  &IC));
    CL_CHECK(clSetKernelArg(kernel,   9, sizeof(cl_long),  &OW));
    CL_CHECK(clSetKernelArg(kernel,  10, sizeof(cl_long),  &OH));
    CL_CHECK(clSetKernelArg(kernel,  11, sizeof(cl_long),  &KW));
    CL_CHECK(clSetKernelArg(kernel,  12, sizeof(cl_long),  &KH));
    CL_CHECK(clSetKernelArg(kernel,  13, sizeof(cl_long),  &pelements));
    CL_CHECK(clSetKernelArg(kernel,  14, sizeof(cl_long),  &CHW));
    CL_CHECK(clSetKernelArg(kernel,  15, sizeof(int),      &s0));
    CL_CHECK(clSetKernelArg(kernel,  16, sizeof(int),      &s1));
    CL_CHECK(clSetKernelArg(kernel,  17, sizeof(int),      &p0));
    CL_CHECK(clSetKernelArg(kernel,  18, sizeof(int),      &p1));
    CL_CHECK(clSetKernelArg(kernel,  19, sizeof(int),      &d0));
    CL_CHECK(clSetKernelArg(kernel,  20, sizeof(int),      &d1));

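    // Pad the per-row element count up to a multiple of the 256-wide
    // work-group; out-of-range work-items are expected to be masked off in the
    // kernel via pelements.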
    const int num_blocks = CEIL_DIV(pelements, 256);
    size_t global_work_size[] = {(size_t)num_blocks*256, (size_t)OH, (size_t)batch*IC};
    size_t local_work_size[] = {256, 1, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}

static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);
    GGML_UNUSED(src1);

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_I32);
    GGML_ASSERT(ggml_is_contiguous(src0));

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    const int ne00  = src0->ne[0];
    const int nrows = ggml_nrows(src0);

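    // Round the row width up to the next power of two: the sorting network in
    // the kernel (presumably bitonic) only handles power-of-two widths.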
    int ne00_padded = 1;
    while (ne00_padded < ne00) {
        ne00_padded *= 2;
    }

    int order = (enum ggml_sort_order) dst->op_params[0];

    cl_kernel kernel = backend_ctx->kernel_argsort_f32_i32;

    CL_CHECK(clSetKernelArg(kernel,   0, sizeof(cl_mem),            &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel,   1, sizeof(cl_ulong),          &offset0));
    CL_CHECK(clSetKernelArg(kernel,   2, sizeof(cl_mem),            &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel,   3, sizeof(cl_ulong),          &offsetd));
    CL_CHECK(clSetKernelArg(kernel,   4, sizeof(int),               &ne00));
    CL_CHECK(clSetKernelArg(kernel,   5, sizeof(int),               &ne00_padded));
    CL_CHECK(clSetKernelArg(kernel,   6, sizeof(int),               &order));
    // arg 7 is local memory: scratch space for one padded row of indices
    CL_CHECK(clSetKernelArg(kernel,   7, ne00_padded*sizeof(int),   NULL));

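    // One work-group per row; the local size equals the padded row width, which
    // is assumed to fit within the device's maximum work-group size.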
    size_t global_work_size[] = {(size_t)ne00_padded, (size_t)nrows, (size_t)1};
    size_t local_work_size[] = {(size_t)ne00_padded, 1, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}

static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);
    GGML_UNUSED(src1);

    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
    GGML_ASSERT(ggml_is_contiguous(src0));

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    const int ne00 = src0->ne[0];
    const int ne01 = src0->ne[1];
    const int ne02 = src0->ne[2];
    const int ne03 = src0->ne[3];

    const cl_ulong nb01 = src0->nb[1];
    const cl_ulong nb02 = src0->nb[2];
    const cl_ulong nb03 = src0->nb[3];

    const cl_ulong nb1  = dst->nb[1];
    const cl_ulong nb2  = dst->nb[2];
    const cl_ulong nb3  = dst->nb[3];

    cl_kernel kernel = backend_ctx->kernel_sum_rows_f32;

    CL_CHECK(clSetKernelArg(kernel,   0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel,   1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel,   2, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel,   3, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel,   4, sizeof(int),      &ne00));
    CL_CHECK(clSetKernelArg(kernel,   5, sizeof(int),      &ne01));
    CL_CHECK(clSetKernelArg(kernel,   6, sizeof(int),      &ne02));
    CL_CHECK(clSetKernelArg(kernel,   7, sizeof(int),      &ne03));
    CL_CHECK(clSetKernelArg(kernel,   8, sizeof(cl_ulong), &nb01));
    CL_CHECK(clSetKernelArg(kernel,   9, sizeof(cl_ulong), &nb02));
    CL_CHECK(clSetKernelArg(kernel,  10, sizeof(cl_ulong), &nb03));
    CL_CHECK(clSetKernelArg(kernel,  11, sizeof(cl_ulong), &nb1));
    CL_CHECK(clSetKernelArg(kernel,  12, sizeof(cl_ulong), &nb2));
    CL_CHECK(clSetKernelArg(kernel,  13, sizeof(cl_ulong), &nb3));

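    // Launch one work-item per (i1, i2, i3) row; each work-item presumably
    // reduces its own ne00 elements serially, with the 64-wide local size only
    // shaping the launch.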
    size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
    size_t local_work_size[] = {(size_t)64, 1, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}

static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    GGML_ASSERT(ggml_is_contiguous_1(src0));

    if (src1) {
        GGML_ASSERT(src1->extra);
        GGML_ASSERT(ggml_are_same_shape(src0, src1));
    }

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;

    cl_kernel kernel;
    switch (ggml_get_glu_op(dst)) {
        case GGML_GLU_OP_GEGLU:
            if (dst->type == GGML_TYPE_F32) {
                kernel = backend_ctx->kernel_geglu;
            } else {
                kernel = backend_ctx->kernel_geglu_f16;
            }
            break;
        case GGML_GLU_OP_REGLU:
            if (dst->type == GGML_TYPE_F32) {
                kernel = backend_ctx->kernel_reglu;
            } else {
                kernel = backend_ctx->kernel_reglu_f16;
            }
            break;
        case GGML_GLU_OP_SWIGLU:
            if (dst->type == GGML_TYPE_F32) {
                kernel = backend_ctx->kernel_swiglu;
            } else {
                kernel = backend_ctx->kernel_swiglu_f16;
            }
            break;
        case GGML_GLU_OP_SWIGLU_OAI:
            kernel = backend_ctx->kernel_swiglu_oai;
            break;
        case GGML_GLU_OP_GEGLU_ERF:
            if (dst->type == GGML_TYPE_F32) {
                kernel = backend_ctx->kernel_geglu_erf;
            } else {
                kernel = backend_ctx->kernel_geglu_erf_f16;
            }
            break;
        case GGML_GLU_OP_GEGLU_QUICK:
            if (dst->type == GGML_TYPE_F32) {
                kernel = backend_ctx->kernel_geglu_quick;
            } else {
                kernel = backend_ctx->kernel_geglu_quick_f16;
            }
            break;
        default:
            GGML_ABORT("Unsupported glu op");
    }

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    ggml_tensor_extra_cl * extra1 = src1 ? (ggml_tensor_extra_cl *)src1->extra : nullptr;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    cl_ulong offset1 = extra1 ? extra1->offset + src1->view_offs : offset0;

    const int ne0       = dst->ne[0];

    const cl_ulong nb01 = src0->nb[1];
    const cl_ulong nb11 = src1 ? src1->nb[1] : nb01;

    const cl_ulong nb1  = dst->nb[1];

    const int   swp   = ggml_get_op_params_i32(dst, 1);
    const float alpha = ggml_get_op_params_f32(dst, 2);
    const float limit = ggml_get_op_params_f32(dst, 3);

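    // Without a separate gate tensor, src0 packs both halves of each row side
    // by side; swp selects which half is the gate. ne00_off/ne10_off are the
    // element offsets of the two halves within the fused row.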
    const int ne00_off = src1 ? 0 : (swp ? ne0 : 0);
    const int ne10_off = src1 ? 0 : (swp ? 0 : ne0);

    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   src1 ? &extra1->data_device : &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offsetd));
    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_ulong), &nb01));
    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &nb11));
    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne0));
    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb1));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne00_off));
    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne10_off));

    if (ggml_get_glu_op(dst) == GGML_GLU_OP_SWIGLU_OAI) {
        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &limit));
        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(float), &alpha));
    }

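    // One work-group of nth work-items per row of src0.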
    const size_t nrows = ggml_nrows(src0);
    size_t nth = 512;
    size_t global_work_size[] = {nrows*nth, 1, 1};
    size_t local_work_size[] = {nth, 1, 1};

    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
}

//------------------------------------------------------------------------------
// Op offloading
//------------------------------------------------------------------------------

typedef void (*ggml_cl_func_t)(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);

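// Dispatch a single tensor op to its OpenCL implementation. Returns false if
// the op is unsupported or none of the operands are resident on the device.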
bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor) {
    ggml_cl_func_t func = nullptr;

    ggml_tensor * src0 = tensor->src[0];
    ggml_tensor * src1 = tensor->src[1];

    const bool any_on_device = tensor->extra
        || (src0 != nullptr && src0->extra)
        || (src1 != nullptr && src1->extra);

    switch (tensor->op) {
        case GGML_OP_GET_ROWS:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_get_rows;
            break;
        case GGML_OP_SET_ROWS:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_set_rows;
            break;
        case GGML_OP_CPY:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_cpy;
            break;
        case GGML_OP_DUP:
        case GGML_OP_CONT:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_dup;
            break;
        case GGML_OP_ADD:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_add;
            break;
        case GGML_OP_ADD_ID:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_add_id;
            break;
        case GGML_OP_MUL:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_mul;
            break;
        case GGML_OP_DIV:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_div;
            break;
        case GGML_OP_SUB:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_sub;
            break;
        case GGML_OP_SQR:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_sqr;
            break;
        case GGML_OP_SQRT:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_sqrt;
            break;
        case GGML_OP_MEAN:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_mean;
            break;
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(tensor)) {
                case GGML_UNARY_OP_GELU:
                    if (!any_on_device) {
                        return false;
                    }
                    func = ggml_cl_gelu;
                    break;
                case GGML_UNARY_OP_GELU_ERF:
                    if (!any_on_device) {
                        return false;
                    }
                    func = ggml_cl_gelu_erf;
                    break;
                case GGML_UNARY_OP_GELU_QUICK:
                    if (!any_on_device) {
                        return false;
                    }
                    func = ggml_cl_gelu_quick;
                    break;
                case GGML_UNARY_OP_SILU:
                    if (!any_on_device) {
                        return false;
                    }
                    func = ggml_cl_silu;
                    break;
                case GGML_UNARY_OP_RELU:
                    if (!any_on_device) {
                        return false;
                    }
                    func = ggml_cl_relu;
                    break;
                case GGML_UNARY_OP_SIGMOID:
                    if (!any_on_device) {
                        return false;
                    }
                    func = ggml_cl_sigmoid;
                    break;
                case GGML_UNARY_OP_TANH:
                    if (!any_on_device) {
                        return false;
                    }
                    func = ggml_cl_tanh;
                    break;
                case GGML_UNARY_OP_EXPM1:
                    if (!any_on_device) {
                        return false;
                    }
                    func = ggml_cl_expm1;
                    break;
                case GGML_UNARY_OP_SOFTPLUS:
                    if (!any_on_device) {
                        return false;
                    }
                    func = ggml_cl_softplus;
                    break;
                default:
                    return false;
            } break;
        case GGML_OP_GLU:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_glu;
            break;
        case GGML_OP_TRI:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_tri;
            break;
        case GGML_OP_FILL:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_fill;
            break;
        case GGML_OP_CLAMP:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_clamp;
            break;
        case GGML_OP_NORM:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_norm;
            break;
        case GGML_OP_RMS_NORM:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_rms_norm;
            break;
        case GGML_OP_GROUP_NORM:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_group_norm;
            break;
        case GGML_OP_REPEAT:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_repeat;
            break;
        case GGML_OP_PAD:
            if (!any_on_device) {
                return false;
            }
            ggml_cl_pad(backend, tensor->src[0], tensor);
            return true;
        case GGML_OP_UPSCALE:
            if (!any_on_device) {
                return false;
            }
            ggml_cl_upscale(backend, tensor->src[0], tensor);
            return true;
        case GGML_OP_CONV_2D:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_conv_2d;
            break;
        case GGML_OP_SSM_CONV:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_ssm_conv;
            break;
        case GGML_OP_CONCAT:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_concat;
            break;
        case GGML_OP_TIMESTEP_EMBEDDING:
            if (!any_on_device) {
                return false;
            }
            ggml_cl_timestep_embedding(backend, tensor->src[0], tensor);
            return true;
        case GGML_OP_MUL_MAT:
            if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
                return false;
            }
            func = ggml_cl_mul_mat;
            break;
        case GGML_OP_MUL_MAT_ID:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_mul_mat_id;
            break;
        case GGML_OP_SCALE:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_scale;
            break;
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_nop;
            break;
        case GGML_OP_DIAG_MASK_INF:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_diag_mask_inf;
            break;
        case GGML_OP_SOFT_MAX:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_soft_max;
            break;
        case GGML_OP_ROPE:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_rope;
            break;
        case GGML_OP_SOLVE_TRI:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_solve_tri;
            break;
        case GGML_OP_IM2COL:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_im2col;
            break;
        case GGML_OP_ARGSORT:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_argsort;
            break;
        case GGML_OP_SUM_ROWS:
            if (!any_on_device) {
                return false;
            }
            func = ggml_cl_sum_rows;
            break;
        case GGML_OP_FLASH_ATTN_EXT:
            if (!any_on_device) {
                return false;
            }
            ggml_cl_flash_attn(backend, tensor->src[0], tensor->src[1], tensor);
            return true;
        default:
            return false;
    }

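    // All ops dispatched through func share the (backend, src0, src1, dst)
    // signature; ops with different call shapes returned early above.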
    func(backend, tensor->src[0], tensor->src[1], tensor);
    return true;
}