#include "common.hpp"
#include "ggml-sycl/presets.hpp"
#include "ggml.h"
#include "element_wise.hpp"

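// Helper macros: SYCL_GLOBAL_ID_LOOP is a grid-stride loop over K elements of a 1D nd_range
// (used below as: SYCL_GLOBAL_ID_LOOP(k, item_ct1) { dst[i] = ...; });
// SYCL_LOCAL_ID_CALC computes the flattened global index for dimension IDX.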
#define SYCL_GLOBAL_ID_LOOP(K, ITEM) \
    for (auto i = ITEM.get_global_id(0); i < (size_t)K; i += ITEM.get_global_range(0))

#define SYCL_LOCAL_ID_CALC(ITEM, IDX) \
    (ITEM.get_local_range(IDX) * ITEM.get_group(IDX) + ITEM.get_local_id(IDX))

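// acc: dst = src0, with a view of src1 (ne10 x ne11 x ne12, strides nb1/nb2 in elements)
// added starting at the given element offset.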
static void acc_f32(const float * x, const float * y, float * dst, const int ne,
    const int ne10, const int ne11, const int ne12,
    const int nb1, const int nb2, int offset, const sycl::nd_item<1> &item_ct1) {
    const int i = SYCL_LOCAL_ID_CALC(item_ct1, 0);
    if (i >= ne) {
        return;
    }
    int src1_idx = i - offset;
    int oz = src1_idx / nb2;
    int oy = (src1_idx - (oz * nb2)) / nb1;
    int ox = src1_idx % nb1;
    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
    } else {
        dst[i] = x[i];
    }
}

/* Unary OP funcs */
template<typename T>
static __dpct_inline__ T op_sgn(T x) {
    return x > static_cast<T>(0.f) ? static_cast<T>(1.f) : ((x < static_cast<T>(0.f) ? static_cast<T>(-1.f) : static_cast<T>(0.f)));
}

template<typename T>
static __dpct_inline__ T op_abs(T x) {
    return sycl::fabs(x);
}

template<typename T>
static __dpct_inline__ T op_elu(T x) {
    return (x > static_cast<T>(0.f)) ? x : sycl::expm1(x);
}

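// GELU, tanh approximation: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))).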
template<typename T>
static __dpct_inline__ T op_gelu(T x) {
    const T GELU_COEF_A    = static_cast<T>(0.044715f);
    const T SQRT_2_OVER_PI = static_cast<T>(0.79788456080286535587989211986876f);
    return static_cast<T>(0.5f) * x *
           (static_cast<T>(1.0f) +
            sycl::tanh(SQRT_2_OVER_PI * x * (static_cast<T>(1.0f) + GELU_COEF_A * x * x)));
}

template<typename T>
static __dpct_inline__ T op_silu(T x) {
    return x / (static_cast<T>(1.0f) + sycl::native::exp(-x));
}

template<typename T>
static __dpct_inline__ T op_gelu_quick(T x) {
    const T GELU_QUICK_COEF_LOCAL = static_cast<T>(-1.702f);
    return x * (static_cast<T>(1.0f) / (static_cast<T>(1.0f) + sycl::native::exp(GELU_QUICK_COEF_LOCAL * x)));
}

template<typename T>
static __dpct_inline__ T op_gelu_erf(T x) {
    const T SQRT_2_INV = static_cast<T>(0.70710678118654752440084436210484f);
    return static_cast<T>(0.5f) * x * (static_cast<T>(1.0f) + sycl::erf(x * SQRT_2_INV));
}

template<typename T>
static __dpct_inline__ T op_tanh(T x) {
    return sycl::tanh(x);
}

template<typename T>
static __dpct_inline__ T op_relu(T x) {
    return sycl::fmax(x, static_cast<T>(0));
}

template<typename T>
static __dpct_inline__ T op_sigmoid(T x) {
    return static_cast<T>(1.0f) / (static_cast<T>(1.0f) + sycl::native::exp(-x));
}

template<typename T>
static __dpct_inline__ T op_sqrt(T x) {
    return sycl::sqrt(x);
}

template<typename T>
static __dpct_inline__ T op_sin(T x) {
    return sycl::sin(x);
}

template<typename T>
static __dpct_inline__ T op_cos(T x) {
    return sycl::cos(x);
}

template<typename T>
static __dpct_inline__ T op_hardsigmoid(T x) {
    return sycl::fmin(static_cast<T>(1.0f), sycl::fmax(static_cast<T>(0.0f), (x + static_cast<T>(3.0f)) / static_cast<T>(6.0f)));
}

template<typename T>
static __dpct_inline__ T op_hardswish(T x) {
    return x * sycl::fmin(static_cast<T>(1.0f), sycl::fmax(static_cast<T>(0.0f), (x + static_cast<T>(3.0f)) / static_cast<T>(6.0f)));
}

template<typename T>
static __dpct_inline__ T op_exp(T x) {
    return sycl::exp(x);
}

template<typename T>
static __dpct_inline__ T op_log(T x) {
    if (x <= static_cast<T>(0)) {
        return neg_infinity<T>();
    }
    return sycl::log(x);
}

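// Numerically stable softplus: log(1 + exp(x)) computed as max(x, 0) + log1p(exp(-|x|)).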
template<typename T>
static __dpct_inline__ T op_softplus(T x) {
    const float xf = (float) x;
    const float ax = sycl::fabs(xf);
    const float m  = sycl::fmax(xf, 0.0f);
    const float y  = m + sycl::log1p(sycl::exp(-ax));
    return (T) y;
}

template<typename T>
static __dpct_inline__ T op_neg(T x) {
    return -x;
}

template<typename T>
static __dpct_inline__ T op_step(T x) {
    return (x > static_cast<T>(0.0f)) ? static_cast<T>(1.0f) : static_cast<T>(0.0f);
}

template<typename T>
static __dpct_inline__ T op_leaky_relu(T x, float negative_slope) {
    T neg_slope_T = static_cast<T>(negative_slope);
    return sycl::fmax(x, static_cast<T>(0)) +
           sycl::fmin(x, static_cast<T>(0.0f)) * neg_slope_T;
}

template<typename T>
static __dpct_inline__ T op_sqr(T x) {
    return x * x;
}

template<typename T>
static __dpct_inline__ T op_clamp(T x, float min_val, float max_val) {
    return x < static_cast<T>(min_val) ? static_cast<T>(min_val) : (x > static_cast<T>(max_val) ? static_cast<T>(max_val) : x);
}

template<typename T>
static __dpct_inline__ T op_floor(T x) {
    return sycl::floor(x);
}

template<typename T>
static __dpct_inline__ T op_ceil(T x) {
    return sycl::ceil(x);
}

template<typename T>
static __dpct_inline__ T op_round(T x) {
    return sycl::round(x);
}

template<typename T>
static __dpct_inline__ T op_trunc(T x) {
    return sycl::trunc(x);
}

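// Generic unary kernel: decodes the flat index i into (i0, i1, i2, i3) and applies func
// element-wise, using byte strides so non-contiguous src/dst layouts are supported.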
template<typename T, typename F>
static void unary_op_generic_kernel(
        const T * x,
        T * dst,
        const int k,
        const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3,
        const size_t nb0,  const size_t nb1,  const size_t nb2,  const size_t nb3,
        const size_t nbd0, const size_t nbd1, const size_t nbd2, const size_t nbd3,
        const sycl::nd_item<1> & item_ct1,
        F func) {

    (void) ne3;
    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
        const int64_t i0 =  i % ne0;
        const int64_t i1 = (i / ne0)        % ne1;
        const int64_t i2 = (i / (ne0*ne1))  % ne2;
        const int64_t i3 =  i / (ne0*ne1*ne2);

        const char * src_base = (const char *) x;
        char       * dst_base = (char *) dst;

        const T * srcp = (const T *)(src_base + i0*nb0  + i1*nb1  + i2*nb2  + i3*nb3);
        T *       dstp = (T *)(dst_base + i0*nbd0 + i1*nbd1 + i2*nbd2 + i3*nbd3);

        *dstp = func(*srcp);
    }
}

template<typename T>
static void unary_op_sqrt_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
        dst[i] = op_sqrt(x[i]);
    }
}

template<typename T>
static void unary_op_sin_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
        dst[i] = op_sin(x[i]);
    }
}

template<typename T>
static void unary_op_cos_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
        dst[i] = op_cos(x[i]);
    }
}

template<typename T>
static void unary_op_log_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
        dst[i] = op_log(x[i]);
    }
}

template<typename T>
static void unary_op_leaky_relu_kernel(const T * x, T * dst, const int k, float negative_slope, const sycl::nd_item<1> &item_ct1) {
    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
        dst[i] = op_leaky_relu(x[i], negative_slope);
    }
}

template<typename T>
static void unary_op_sqr_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
        dst[i] = op_sqr(x[i]);
    }
}

template<typename T>
static void unary_op_clamp_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1, float min_val, float max_val) {
    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
        dst[i] = op_clamp(x[i], min_val, max_val);
    }
}

template<typename T>
static void unary_op_floor_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
        dst[i] = op_floor(x[i]);
    }
}

template<typename T>
static void unary_op_ceil_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
        dst[i] = op_ceil(x[i]);
    }
}

template<typename T>
static void unary_op_round_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
        dst[i] = op_round(x[i]);
    }
}

template<typename T>
static void unary_op_trunc_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
        dst[i] = op_trunc(x[i]);
    }
}

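// Nearest-neighbour upscale: each output element (i10, i11, i12, i13) maps back to the
// source element (i10/sf0, i11/sf1, i12/sf2, i13/sf3) addressed via byte strides nb00..nb03.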
template<typename T>
static void upscale(const T *x, T *dst, const int nb00, const int nb01,
                        const int nb02, const int nb03, const int ne10, const int ne11,
                        const int ne12, const int ne13, const float sf0, const float sf1,
                        const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) {
    int index = item_ct1.get_local_id(0) +
                item_ct1.get_group(0) * item_ct1.get_local_range(0);
    if (index >= ne10 * ne11 * ne12 * ne13) {
        return;
    }
    // operation
    int i10 = index % ne10;
    int i11 = (index / ne10) % ne11;
    int i12 = (index / (ne10 * ne11)) % ne12;
    int i13 = (index / (ne10 * ne11 * ne12)) % ne13;

    int i00 = static_cast<int>(i10 / sf0);
    int i01 = static_cast<int>(i11 / sf1);
    int i02 = static_cast<int>(i12 / sf2);
    int i03 = static_cast<int>(i13 / sf3);

    dst[index] = *(const T *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
}

template<typename T>
static void clamp(const T * x, T * dst, const float min, const float max, const int k,
                      const sycl::nd_item<1> &item_ct1) {
    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
        dst[i] = x[i] < static_cast<T>(min) ? static_cast<T>(min) : (x[i] > static_cast<T>(max) ? static_cast<T>(max) : x[i]);
    }
}

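// Fused GLU kernels: x and g point at the value and gate halves of each row (o0/o1 are
// their row strides in elements); dst[i] = activation(x[j0]) * g[j1] for k total elements.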
template<typename T>
static void gated_op_fused_geglu(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) {
    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
        const int64_t j0 = (i / n) * o0 + (i % n);
        const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
        dst[i] = op_gelu(x[j0]) * g[j1];
    }
}

template<typename T>
static void gated_op_fused_reglu(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) {
    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
        const int64_t j0 = (i / n) * o0 + (i % n);
        const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
        dst[i] = op_relu(x[j0]) * g[j1];
    }
}

template<typename T>
static void gated_op_fused_swiglu(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) {
    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
        const int64_t j0 = (i / n) * o0 + (i % n);
        const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
        dst[i] = op_silu(x[j0]) * g[j1];
    }
}

template<typename T>
static void gated_op_fused_geglu_erf(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) {
    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
        const int64_t j0 = (i / n) * o0 + (i % n);
        const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
        dst[i] = op_gelu_erf(x[j0]) * g[j1];
    }
}

template<typename T>
static void gated_op_fused_geglu_quick(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) {
    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
        const int64_t j0 = (i / n) * o0 + (i % n);
        const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
        dst[i] = op_gelu_quick(x[j0]) * g[j1];
    }
}

namespace ggml_sycl_detail {
static void acc_f32_sycl(const float *x, const float *y, float *dst,
                         const int n_elements, const int ne10, const int ne11,
                         const int ne12, const int nb1, const int nb2,
                         const int offset, queue_ptr stream) {
    int num_blocks = ceil_div(n_elements, SYCL_ACC_BLOCK_SIZE);
    stream->parallel_for(
        sycl::nd_range<1>(sycl::range<1>(num_blocks) *
                              sycl::range<1>(SYCL_ACC_BLOCK_SIZE),
                          sycl::range<1>(SYCL_ACC_BLOCK_SIZE)),
        [=](sycl::nd_item<1> item_ct1) {
            acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset,
                    item_ct1);
        });
}

template<typename T>
static void arange_kernel(T * dst, const int k, T start, T step,
                         const sycl::nd_item<1> &item_ct1) {
    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
        dst[i] = start + static_cast<T>(i) * step;
    }
}

template<typename T>
static void upscale_sycl(const T *x, T *dst, const int nb00, const int nb01,
                             const int nb02, const int nb03, const int ne10, const int ne11,
                             const int ne12, const int ne13, const float sf0, const float sf1,
                             const float sf2, const float sf3, queue_ptr stream) {
    int dst_size = ne10 * ne11 * ne12 * ne13;
    int num_blocks = ceil_div(dst_size, SYCL_UPSCALE_BLOCK_SIZE);
    sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE);
    stream->parallel_for(
        sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
            upscale(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1);
        });
}

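// Dispatches a unary kernel invoker on dst->type: F32 always, plus F16 when built with
// GGML_SYCL_F16. The invoker receives typed src/dst pointers, the element count and the queue.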
template<typename KernelInvoker, typename... Args>
static inline void dispatch_ggml_sycl_op_unary(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
#if defined (GGML_SYCL_F16)
    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
#else
    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
#endif
    GGML_ASSERT(dst->src[0]->type == dst->type);
    dpct::queue_ptr main_stream = ctx.stream();
    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
    switch (dst->type) {
#if defined (GGML_SYCL_F16)
        case GGML_TYPE_F16:
            {
                auto data_pts = cast_data<sycl::half>(dst);
                kernel_invoker(data_pts.src, data_pts.dst, (int)ggml_nelements(dst->src[0]), main_stream, std::forward<Args>(args)...);
                break;
            }
#endif
        case GGML_TYPE_F32:
            {
                auto data_pts = cast_data<float>(dst);
                kernel_invoker(data_pts.src, data_pts.dst, (int)ggml_nelements(dst->src[0]), main_stream, std::forward<Args>(args)...);
                break;
            }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
    }
}

template<typename KernelInvoker, typename... Args>
static inline void dispatch_ggml_sycl_op_fused_glu(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
#if defined (GGML_SYCL_F16)
    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
#else
    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
#endif
    GGML_ASSERT(dst->src[0]->type == dst->type);
    dpct::queue_ptr main_stream = ctx.stream();
    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
    const int64_t nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
    GGML_ASSERT(dst->ne[0] == nc);
    GGML_ASSERT(ggml_is_contiguous_1(dst->src[0]));
    GGML_ASSERT(ggml_is_contiguous(dst));
    const int32_t swapped = ((const int32_t *) dst->op_params)[1];
    void * src0_d = src0->data;
    void * src1_d = src1 ? src1->data : src0->data;
    const int64_t src0_o = src0->nb[1];
    const int64_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
    void * dst_d = dst->data;
    if (src1) {
        GGML_ASSERT(ggml_is_contiguous_1(src1));
        GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));
        GGML_ASSERT(src1->ne[0] == nc);
        GGML_ASSERT(src0->type == src1->type);
    }
    switch (dst->type) {
#if defined (GGML_SYCL_F16)
        case GGML_TYPE_F16:
            {
                sycl::half * src0_p = (sycl::half *) src0_d;
                sycl::half * src1_p = (sycl::half *) src1_d;

                if (!src1) {
                    src0_p += swapped ? nc : 0;
                    src1_p += swapped ? 0 : nc;
                }
                kernel_invoker(src0_p,
                               src1_p,
                               (sycl::half *) dst_d,
                               ggml_nelements(dst),
                               nc,
                               src0_o / sizeof(sycl::half),
                               src1_o / sizeof(sycl::half),
                               main_stream,
                               std::forward<Args>(args)...);
                break;
            }
#endif
        case GGML_TYPE_F32:
            {
                float * src0_p = (float *) src0_d;
                float * src1_p = (float *) src1_d;

                if (!src1) {
                    src0_p += swapped ? nc : 0;
                    src1_p += swapped ? 0 : nc;
                }

                kernel_invoker(src0_p,
                               src1_p,
                               (float *) dst_d,
                               ggml_nelements(dst),
                               nc,
                               src0_o / sizeof(float),
                               src1_o / sizeof(float),
                               main_stream,
                               std::forward<Args>(args)...);
                break;
            }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
    }
}

template<typename KernelInvoker, typename... Args>
static inline void dispatch_ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
#if defined (GGML_SYCL_F16)
    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
#else
    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
#endif
    GGML_ASSERT(dst->src[0]->type == dst->type);

    dpct::queue_ptr main_stream = ctx.stream();
    SYCL_CHECK(ggml_sycl_set_device(ctx.device));

    const float sf0 = (float) dst->ne[0] / dst->src[0]->ne[0];
    const float sf1 = (float) dst->ne[1] / dst->src[0]->ne[1];
    const float sf2 = (float) dst->ne[2] / dst->src[0]->ne[2];
    const float sf3 = (float) dst->ne[3] / dst->src[0]->ne[3];
    switch (dst->type) {
#if defined (GGML_SYCL_F16)
        case GGML_TYPE_F16:
            {
                auto data_pts = cast_data<sycl::half>(dst);
                kernel_invoker(data_pts.src, data_pts.dst, (int)dst->src[0]->nb[0], (int)dst->src[0]->nb[1], (int)dst->src[0]->nb[2],
                               (int)dst->src[0]->nb[3], (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], sf0, sf1, sf2, sf3,
                               main_stream, std::forward<Args>(args)...);
                break;
            }
#endif
        case GGML_TYPE_F32:
            {
                auto data_pts = cast_data<float>(dst);
                kernel_invoker(data_pts.src, data_pts.dst, (int)dst->src[0]->nb[0], (int)dst->src[0]->nb[1], (int)dst->src[0]->nb[2],
                               (int)dst->src[0]->nb[3], (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], sf0, sf1, sf2, sf3,
                               main_stream, std::forward<Args>(args)...);
                break;
            }
        default:
            GGML_ABORT("GGML tensor type not supported!\n");
    }
}

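// Launches unary_op_generic_kernel with 256-thread work-groups, forwarding the source and
// destination byte strides so the op also works on non-contiguous tensors.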
template<typename F>
static inline void ggml_sycl_op_unary(
        ggml_backend_sycl_context & ctx, ggml_tensor * dst, F func) {

    ggml_tensor * src0 = dst->src[0];

    const int64_t ne0  = dst->ne[0];
    const int64_t ne1  = dst->ne[1];
    const int64_t ne2  = dst->ne[2];
    const int64_t ne3  = dst->ne[3];

    const size_t  nb0  = src0->nb[0];
    const size_t  nb1  = src0->nb[1];
    const size_t  nb2  = src0->nb[2];
    const size_t  nb3  = src0->nb[3];

    const size_t  nbd0 = dst->nb[0];
    const size_t  nbd1 = dst->nb[1];
    const size_t  nbd2 = dst->nb[2];
    const size_t  nbd3 = dst->nb[3];

    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [=](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {

            const int num_blocks = ceil_div(k_elements, 256);

            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
                                  sycl::range<1>(256)),
                [=](sycl::nd_item<1> item_ct1) {
                    unary_op_generic_kernel(
                        src, dst_ptr, k_elements,
                        ne0, ne1, ne2, ne3,
                        nb0, nb1, nb2, nb3,
                        nbd0, nbd1, nbd2, nbd3,
                        item_ct1,
                        func
                    );
                });
        });
}

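// ARANGE: fills dst with start + i * step; start/stop/step are read from op_params
// (stop is read for completeness, the element count of dst already encodes it).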
static inline void ggml_sycl_op_arange(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
    float start, stop, step;
    memcpy(&start, dst->op_params, sizeof(float));
    memcpy(&stop, (float *) dst->op_params + 1, sizeof(float));
    memcpy(&step, (float *) dst->op_params + 2, sizeof(float));
    dpct::queue_ptr stream = ctx.stream();
    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
    float * dst_ptr = (float *)dst->data;
    const int k = (int)ggml_nelements(dst);
    const int num_blocks = ceil_div(k, SYCL_ARANGE_BLOCK_SIZE);
    stream->parallel_for(
        sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_ARANGE_BLOCK_SIZE),
                          sycl::range<1>(SYCL_ARANGE_BLOCK_SIZE)),
        [=](sycl::nd_item<1> item_ct1) {
            arange_kernel(dst_ptr, k, start, step, item_ct1);
        });
}

} // namespace ggml_sycl_detail

static inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
        return op_sgn(x);
    });
}

static inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
        return op_abs(x);
    });
}

static inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
        return op_elu(x);
    });
}

static inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
        return op_silu(x);
    });
}

static inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
        return op_gelu(x);
    });
}

static inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
        return op_gelu_quick(x);
    });
}

static inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
        return op_gelu_erf(x);
    });
}

static inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
        return op_tanh(x);
    });
}

static inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
        return op_relu(x);
    });
}

static inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
        return op_hardsigmoid(x);
    });
}

static inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
        return op_hardswish(x);
    });
}

static inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
        return op_exp(x);
    });
}

static inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_EXP_BLOCK_SIZE); // Using EXP block size
            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_EXP_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_EXP_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
                    unary_op_log_kernel(src, dst_ptr, k_elements, item_ct1);
                });
        });
}

static inline void ggml_sycl_op_softplus(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
        return op_softplus(x);
    });
}

static inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
        return op_neg(x);
    });
}

static inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
        return op_step(x);
    });
}

static inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
        return op_sigmoid(x);
    });
}

static inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_SQRT_BLOCK_SIZE);
            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SQRT_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_SQRT_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
                    unary_op_sqrt_kernel(src, dst_ptr, k_elements, item_ct1);
                });
        });
}

static inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_SIN_BLOCK_SIZE);
            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIN_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_SIN_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
                    unary_op_sin_kernel(src, dst_ptr, k_elements, item_ct1);
                });
        });
}

static inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_SIN_BLOCK_SIZE); // Using SIN block size
            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIN_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_SIN_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
                    unary_op_cos_kernel(src, dst_ptr, k_elements, item_ct1);
                });
        });
}

static inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    float negative_slope;
    memcpy(&negative_slope, dst->op_params, sizeof(float));
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream, float slope) {
            const int num_blocks = ceil_div(k_elements, SYCL_RELU_BLOCK_SIZE);
            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_RELU_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_RELU_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
                    unary_op_leaky_relu_kernel(src, dst_ptr, k_elements, slope, item_ct1);
                });
        }, negative_slope);
}

static inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_SQR_BLOCK_SIZE);
            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SQR_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_SQR_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
                    unary_op_sqr_kernel(src, dst_ptr, k_elements, item_ct1);
                });
        });
}

static inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_upscale(ctx, dst,
        [](const auto* src, auto* dst_ptr, int nb00, int nb01, int nb02, int nb03,
           int ne10, int ne11, int ne12, int ne13, float sf0, float sf1, float sf2, float sf3,
           queue_ptr stream) {
            ggml_sycl_detail::upscale_sycl(src, dst_ptr, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, stream);
        });
}

static inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    float min_val;
    float max_val;
    memcpy(&min_val, dst->op_params, sizeof(float));
    memcpy(&max_val, (float *) dst->op_params + 1, sizeof(float));
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream, float min_arg, float max_arg) {
            const int num_blocks = ceil_div(k_elements, SYCL_CLAMP_BLOCK_SIZE);
            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_CLAMP_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_CLAMP_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
                    clamp(src, dst_ptr, min_arg, max_arg, k_elements, item_ct1);
                });
        }, min_val, max_val);
}

static inline void ggml_sycl_op_floor(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, 256);
            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
                                  sycl::range<1>(256)),
                [=](sycl::nd_item<1> item_ct1) {
                    unary_op_floor_kernel(src, dst_ptr, k_elements, item_ct1);
                });
        });
}

static inline void ggml_sycl_op_ceil(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) {
        return op_ceil(x);
    });
}

static inline void ggml_sycl_op_round(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, 256);
            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
                                  sycl::range<1>(256)),
                [=](sycl::nd_item<1> item_ct1) {
                    unary_op_round_kernel(src, dst_ptr, k_elements, item_ct1);
                });
        });
}

static inline void ggml_sycl_op_trunc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, 256);
            stream->parallel_for(
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
                                  sycl::range<1>(256)),
                [=](sycl::nd_item<1> item_ct1) {
                    unary_op_trunc_kernel(src, dst_ptr, k_elements, item_ct1);
                });
        });
}

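// ACC: dst = src0 with src1 accumulated into a sub-range; op_params carry the destination
// row/plane strides and the start offset in bytes, converted to float element counts below.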
static inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->src[1]->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
    dpct::queue_ptr main_stream = ctx.stream();
    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
    const float * src1_dd = static_cast<const float*>(dst->src[1]->data);
    float *       dst_dd  = static_cast<float *>(dst->data);

    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
    int offset = dst->op_params[3] / 4; // byte offset converted to a float32 element offset

    ggml_sycl_detail::acc_f32_sycl(src0_dd, src1_dd, dst_dd, (int)ggml_nelements(dst), (int)dst->src[1]->ne[0], (int)dst->src[1]->ne[1], (int)dst->src[1]->ne[2], nb1, nb2, offset, main_stream);
}

static inline void ggml_sycl_op_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
        [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
            const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
            main_stream->parallel_for(
                    sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
                gated_op_fused_geglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
            });
        });
}

static inline void ggml_sycl_op_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
        [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
            const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_RELU_BLOCK_SIZE); // Using RELU block size for reglu
            main_stream->parallel_for(
                    sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
                gated_op_fused_reglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
            });
        });
}

static inline void ggml_sycl_op_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
        [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
            const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_SILU_BLOCK_SIZE); // Using SILU block size for swiglu
            main_stream->parallel_for(
                    sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
                gated_op_fused_swiglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
            });
        });
}

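// swiglu_oai: gated SiLU with clamping, out = (x / (1 + exp(-alpha * x))) * (1 + g),
// where x is clamped to at most `limit` and g to [-limit, limit].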
__dpct_inline__ float ggml_sycl_op_swiglu_oai_single(float x, float g, float alpha = 1.702f, float limit = 7.0f) {
    x = sycl::fmin(x, limit);
    g = sycl::fmax(sycl::fmin(g, limit), -limit);

    float out_glu = x / (1.0f + sycl::native::exp(-x * alpha));
    out_glu = out_glu * (1.0f + g);
    return out_glu;
}

template <typename T>
static void swiglu_oai_kernel(const T * x, const T * g, T * dst, const int64_t k,
                              const int64_t n, const int64_t o0, const int64_t o1,
                              float alpha, float limit, sycl::nd_item<3> item_ct1) {
    const int64_t i = int64_t(item_ct1.get_local_range(2)) * item_ct1.get_group(2) + item_ct1.get_local_id(2);

    if (i >= k) {
        return;
    }

    const int64_t j0 = (i / n) * o0 + (i % n);
    const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);

    float xi = x[j0];
    float gi = g[j1];

    dst[i] = ggml_sycl_op_swiglu_oai_single(xi, gi, alpha, limit);
}

template <typename T>
static void swiglu_oai_sycl(const T *       x,
                            const T *       g,
                            T *             dst,
                            const int64_t   k,
                            const int64_t   n,
                            const int64_t   o0,
                            const int64_t   o1,
                            const float     alpha,
                            const float     limit,
                            dpct::queue_ptr stream) {
    const int64_t num_blocks = (k + SYCL_GLU_BLOCK_SIZE - 1) / SYCL_GLU_BLOCK_SIZE;
    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_GLU_BLOCK_SIZE),
                                           sycl::range<3>(1, 1, SYCL_GLU_BLOCK_SIZE)),
                         [=](sycl::nd_item<3> item_ct1) {
                             swiglu_oai_kernel(x, g, dst, k, n, o0, o1, alpha, limit, item_ct1);
                         });
}

void ggml_sycl_op_swiglu_oai(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
    void * src0_d = src0->data;
    void * src1_d = src1 ? src1->data : src0->data;
    const int64_t src0_o = src0->nb[1];
    const int64_t src1_o = src1 ? src1->nb[1] : src0->nb[1];
    void * dst_d = dst->data;
    const int64_t nc = src1 ? src0->ne[0] : src0->ne[0] / 2;
    dpct::queue_ptr stream = ctx.stream();

    GGML_ASSERT(ggml_is_contiguous_1(src0));
    GGML_ASSERT(src0->nb[0] == ggml_element_size(src0));
    GGML_ASSERT(ggml_is_contiguous(dst));

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    GGML_ASSERT(src0->type == dst->type);
    GGML_ASSERT(dst->ne[0] == nc);
    GGML_ASSERT(ggml_nrows(dst) == ggml_nrows(src0));

    if (src1) {
        GGML_ASSERT(ggml_is_contiguous_1(src1));
        GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));
        GGML_ASSERT(src1->ne[0] == nc);
        GGML_ASSERT(src0->type == src1->type);
    }

    const int32_t swapped = ggml_get_op_params_i32(dst, 1);
    const float alpha = ggml_get_op_params_f32(dst, 2);
    const float limit = ggml_get_op_params_f32(dst, 3);

    float * src0_p = (float *) src0_d;
    float * src1_p = (float *) src1_d;

    if (!src1) {
        src0_p += swapped ? nc : 0;
        src1_p += swapped ? 0 : nc;
    }

    swiglu_oai_sycl(src0_p, src1_p, (float *)dst_d, ggml_nelements(dst), nc, src0_o / sizeof(float), src1_o / sizeof(float), alpha, limit, stream);
}

static inline void ggml_sycl_op_geglu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
        [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
            const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
            main_stream->parallel_for(
                    sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
                gated_op_fused_geglu_erf(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
            });
        });
}

static inline void ggml_sycl_op_geglu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
        [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
            const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
            main_stream->parallel_for(
                    sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
                gated_op_fused_geglu_quick(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
            });
        });
}

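// Public entry points: thin wrappers that emit a debug-scope trace and forward to the
// corresponding ggml_sycl_op_* implementation above.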
void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_sqrt(ctx, dst);
}

void ggml_sycl_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_sin(ctx, dst);
}

void ggml_sycl_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_cos(ctx, dst);
}

void ggml_sycl_acc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
    ggml_sycl_op_acc(ctx, dst);
}

void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_gelu(ctx, dst);
}

void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_silu(ctx, dst);
}

void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_gelu_quick(ctx, dst);
}

void ggml_sycl_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_gelu_erf(ctx, dst);
}

void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_tanh(ctx, dst);
}

void ggml_sycl_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_relu(ctx, dst);
}

void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_sigmoid(ctx, dst);
}

void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_hardsigmoid(ctx, dst);
}

void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_hardswish(ctx, dst);
}

void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_exp(ctx, dst);
}

void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_log(ctx, dst);
}

void ggml_sycl_softplus(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_softplus(ctx, dst);
}

void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_neg(ctx, dst);
}

void ggml_sycl_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_step(ctx, dst);
}

void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_leaky_relu(ctx, dst);
}

void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_sqr(ctx, dst);
}

void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_upscale(ctx, dst);
}

void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_clamp(ctx, dst);
}

void ggml_sycl_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_sgn(ctx, dst);
}

void ggml_sycl_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_abs(ctx, dst);
}

void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_elu(ctx, dst);
}

void ggml_sycl_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_geglu(ctx, dst);
}

void ggml_sycl_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_reglu(ctx, dst);
}

void ggml_sycl_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_swiglu(ctx, dst);
}

void ggml_sycl_swiglu_oai(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_swiglu_oai(ctx, dst);
}

void ggml_sycl_geglu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_geglu_erf(ctx, dst);
}

void ggml_sycl_geglu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_geglu_quick(ctx, dst);
}

void ggml_sycl_arange(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/0);
    ggml_sycl_detail::ggml_sycl_op_arange(ctx, dst);
}

void ggml_sycl_floor(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_floor(ctx, dst);
}

void ggml_sycl_ceil(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_ceil(ctx, dst);
}

void ggml_sycl_round(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_round(ctx, dst);
}

void ggml_sycl_trunc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
    ggml_sycl_op_trunc(ctx, dst);
}