1//
   2// MIT license
   3// Copyright (C) 2025 Intel Corporation
   4// SPDX-License-Identifier: MIT
   5//
   6
   7//
   8// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   9// See https://llvm.org/LICENSE.txt for license information.
  10// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  11//
  12
  13#ifndef GGML_SYCL_VECDOTQ_HPP
  14#define GGML_SYCL_VECDOTQ_HPP
  15
  16#include "dpct/helper.hpp"
  17#include "ggml.h"
  18#include "quants.hpp"
  19
  20typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1,
  21                                  const int & iqs);
  22
  23static __dpct_inline__ int get_int_b1(const void * x, const int & i32) {
  24    const uint8_t * x8 = (const uint8_t *) x;
  25
  26    int x32  = x8[4*i32 + 0] <<  0;
  27    x32     |= x8[4*i32 + 1] <<  8;
  28    x32     |= x8[4*i32 + 2] << 16;
  29    x32     |= x8[4*i32 + 3] << 24;
  30
  31    return x32;
  32}
  33
  34
  35static __dpct_inline__ int get_int_from_int8(const int8_t* x8, const int& i32) {
  36  const uint16_t* x16 =
  37      (const uint16_t*)(x8 + sizeof(int) * i32); // assume at least 2 byte
  38                                                 // alignment
  39
  40  int x32 = 0;
  41  x32 |= x16[0] << 0;
  42  x32 |= x16[1] << 16;
  43
  44  return x32;
  45}
  46
  47static __dpct_inline__ int get_int_from_uint8(
  48    const uint8_t* x8,
  49    const int& i32) {
  50  const uint16_t* x16 =
  51      (const uint16_t*)(x8 + sizeof(int) * i32); // assume at least 2 byte
  52                                                 // alignment
  53
  54  int x32 = 0;
  55  x32 |= x16[0] << 0;
  56  x32 |= x16[1] << 16;
  57
  58  return x32;
  59}
  60
  61static __dpct_inline__ int get_int_from_int8_aligned(
  62    const int8_t* x8,
  63    const int& i32) {
  64  return *(
  65      (const int*)(x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
  66}
  67
  68static __dpct_inline__ int get_int_from_uint8_aligned(
  69    const uint8_t* x8,
  70    const int& i32) {
  71  return *(
  72      (const int*)(x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
  73}
  74
  75static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
  76                                                  const uint8_t *values,
  77                                                  int &val1, int &val2) {
  78
  79    uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32;
  80    aux32 = q4 & 0x0f0f0f0f;
  81    uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8);
  82    uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8);
  83    val1 = v1 | (v2 << 16);
  84    aux32 = (q4 >> 4) & 0x0f0f0f0f;
  85    v1 = values[q8[0]] | (values[q8[1]] << 8);
  86    v2 = values[q8[2]] | (values[q8[3]] << 8);
  87    val2 = v1 | (v2 << 16);
  88}
  89
  90static __dpct_inline__ sycl::int2 get_int_from_table_16(
  91    const int& q4, const int8_t* table) {
  92  const uint32_t* table32 = (const uint32_t*)table;
  93  uint32_t tmp[2];
  94  const uint32_t low_high_selection_indices =
  95      (0x32103210 | ((q4 & 0x88888888) >> 1));
  96#pragma unroll
  97  for (uint32_t i = 0; i < 2; ++i) {
  98    const uint32_t shift = 16 * i;
  99
 100    const uint32_t low =
 101        dpct::byte_level_permute(table32[0], table32[1], q4 >> shift);
 102    const uint32_t high =
 103        dpct::byte_level_permute(table32[2], table32[3], q4 >> shift);
 104    tmp[i] = dpct::byte_level_permute(
 105        low, high, low_high_selection_indices >> shift);
 106  }
 107  return sycl::int2(
 108      dpct::byte_level_permute(tmp[0], tmp[1], 0x6420),
 109      dpct::byte_level_permute(tmp[0], tmp[1], 0x7531));
 110}
 111
 112#define VDR_Q2_K_Q8_1_MMVQ 1
 113
 114// contiguous v/x values
 115static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmvq(
 116    const int &v, const int *__restrict__ u, const uint8_t *__restrict__ scales,
 117    const sycl::half2 &dm2, const float *__restrict__ d8) {
 118
 119    float sumf_d = 0.0f;
 120    float sumf_m = 0.0f;
 121
 122#pragma unroll
 123    for (int i = 0; i < QR2_K; ++i) {
 124        const int sc = scales[2*i];
 125
 126        const int vi = (v >> (2*i)) & 0x03030303;
 127
 128        sumf_d +=
 129            d8[i] * (dpct::dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
 130
 131        // fill int with 4x m
 132        int m = sc >> 4;
 133        m |= m <<  8;
 134        m |= m << 16;
 135        sumf_m += d8[i] *
 136                  dpct::dp4a(
 137                      m, u[i],
 138                      0); // multiply constant q2_K part with sum of q8_1 values
 139    }
 140
 141    const sycl::float2 dm2f =
 142        dm2.convert<float, sycl::rounding_mode::automatic>();
 143
 144    return dm2f.x() * sumf_d - dm2f.y() * sumf_m;
 145}
 146
 147
 148#define VDR_Q3_K_Q8_1_MMVQ 1
 149
 150// contiguous v/x values
 151static __dpct_inline__ float vec_dot_q3_K_q8_1_impl_mmvq(
 152    const int &vl, const int &vh, const int *__restrict__ u,
 153    const uint8_t *__restrict__ scales, const int &scale_offset,
 154    const float &d3, const float *__restrict__ d8) {
 155
 156    float sumf = 0.0f;
 157
 158#pragma unroll
 159    for (int i = 0; i < QR3_K; ++i) {
 160        const int isc = scale_offset + 2*i;
 161
 162        const int isc_low = isc % (QK_K/32);
 163        const int sc_shift_low = 4 * (isc / (QK_K/32));
 164        const int sc_low  = (scales[isc_low] >> sc_shift_low) & 0xF;
 165
 166        const int isc_high = isc % (QK_K/64);
 167        const int sc_shift_high = 2 * (isc / (QK_K/64));
 168        const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
 169
 170        const int sc = (sc_low | sc_high) - 32;
 171
 172        const int vil = (vl >> (2*i)) & 0x03030303;
 173
 174        const int vih = ((vh >> i) << 2) & 0x04040404;
 175
 176        const int vi =
 177            dpct::vectorized_binary<sycl::char4>(vil, vih, dpct::sub_sat());
 178
 179        sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product
 180    }
 181
 182    return d3 * sumf;
 183}
 184
 185#define VDR_Q4_K_Q8_1_MMVQ 2
 186
 187// contiguous v/x values
 188static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_vmmq(
 189    const int *__restrict__ v, const int *__restrict__ u,
 190    const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m,
 191    const sycl::half2 &dm4, const float *__restrict__ d8) {
 192
 193    float sumf_d = 0.0f;
 194    float sumf_m = 0.0f;
 195
 196#pragma unroll
 197    for (int i = 0; i < QR4_K; ++i) {
 198        const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
 199        const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
 200
 201        const int dot1 =
 202            dpct::dp4a(v1i, u[2 * i + 1],
 203                       dpct::dp4a(v0i, u[2 * i + 0], 0)); // SIMD dot product
 204        const int dot2 =
 205            dpct::dp4a(0x01010101, u[2 * i + 1],
 206                       dpct::dp4a(0x01010101, u[2 * i + 0], 0)); // sum of u
 207
 208        sumf_d += d8[i] * (dot1 * sc[i]);
 209        sumf_m += d8[i] * (dot2 * m[i]);  // multiply constant part of q4_K with sum of q8_1 values
 210    }
 211
 212    const sycl::float2 dm4f =
 213        dm4.convert<float, sycl::rounding_mode::automatic>();
 214
 215    return dm4f.x() * sumf_d - dm4f.y() * sumf_m;
 216}
 217
 218
 219#define VDR_Q5_K_Q8_1_MMVQ 2
 220
 221// contiguous v/x values
 222static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_vmmq(
 223    const int *__restrict__ vl, const int *__restrict__ vh,
 224    const int *__restrict__ u, const uint8_t *__restrict__ sc,
 225    const uint8_t *__restrict__ m, const sycl::half2 &dm5,
 226    const float *__restrict__ d8) {
 227
 228    float sumf_d = 0.0f;
 229    float sumf_m = 0.0f;
 230
 231#pragma unroll
 232    for (int i = 0; i < QR5_K; ++i) {
 233        const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
 234        const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
 235
 236        const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
 237        const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
 238
 239        const int v0i = vl0i | vh0i;
 240        const int v1i = vl1i | vh1i;
 241
 242        const int dot1 =
 243            dpct::dp4a(v0i, u[2 * i + 0],
 244                       dpct::dp4a(v1i, u[2 * i + 1], 0)); // SIMD dot product
 245        const int dot2 =
 246            dpct::dp4a(0x01010101, u[2 * i + 0],
 247                       dpct::dp4a(0x01010101, u[2 * i + 1], 0)); // sum of u
 248
 249        sumf_d += d8[i] * (dot1 * sc[i]);
 250        sumf_m += d8[i] * (dot2 * m[i]);
 251
 252    }
 253
 254    const sycl::float2 dm5f =
 255        dm5.convert<float, sycl::rounding_mode::automatic>();
 256
 257    return dm5f.x() * sumf_d - dm5f.y() * sumf_m;
 258}
 259
 260
 261#define VDR_Q6_K_Q8_1_MMVQ 1
 262
 263// contiguous v/x values
 264static __dpct_inline__ float
 265vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh,
 266                            const int *__restrict__ u,
 267                            const int8_t *__restrict__ scales, const float &d,
 268                            const float *__restrict__ d8) {
 269
 270    float sumf = 0.0f;
 271
 272#pragma unroll
 273    for (int i = 0; i < QR6_K; ++i) {
 274        const int sc = scales[4*i];
 275
 276        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
 277
 278        const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
 279
 280        const int vi = dpct::vectorized_binary<sycl::char4>(
 281            (vil | vih), 0x20202020, dpct::sub_sat()); // vi = (vil | vih) - 32
 282
 283        sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product
 284    }
 285
 286    return d*sumf;
 287}
 288
 289// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
 290// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
 291
// Primary template of the reorder vec-dot functor. It is only used for ggml types
// that have no explicit specialization. The assertion condition depends on T, so it
// is checked only when this unspecialized template is instantiated - and then it
// always fails (T != T is false), reporting the message below.
template <ggml_type T> struct reorder_vec_dot_q_sycl {
    static_assert(T != T, "ggml_type for reorder vecdot not implemented");
};
 295
 296template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0> {
 297    static constexpr ggml_type gtype = GGML_TYPE_Q4_0;
 298
 299    using q4_0_block  = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q4_0>;
 300    using q4_0_traits = typename q4_0_block::traits;
 301
 302    __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int * v, const int * u, const float & d4, const sycl::half2 & ds8) {
 303        int sumi = 0;
 304
 305#pragma unroll
 306        for (size_t i = 0; i < q4_0_traits::vdr_mmvq; ++i) {
 307            const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
 308            const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
 309
 310            // SIMD dot product of quantized values
 311            sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi);
 312            sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi);
 313        }
 314
 315        const sycl::float2 ds8f = ds8.convert<float, sycl::rounding_mode::automatic>();
 316
 317        // second part effectively subtracts 8 from each quant value
 318        return d4 * (sumi * ds8f.x() - (8 * q4_0_traits::vdr_mmvq / q4_0_traits::qi) * ds8f.y());
 319    }
 320
 321    __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
 322                                     const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
 323                                     const sycl::half2 * q8_1_ds, const int & iqs) {
 324        const uint8_t * bq4_0 = static_cast<const uint8_t *>(vbq) + ibx_offset.first;
 325        const ggml_half d = *(reinterpret_cast<const ggml_half *>(static_cast<const uint8_t *>(vbq) + d_offset.first));
 326        int             v[q4_0_traits::vdr_mmvq];
 327        int             u[2 * q4_0_traits::vdr_mmvq];
 328
 329
 330#pragma unroll
 331        for (size_t i = 0; i < q4_0_traits::vdr_mmvq; ++i) {
 332            v[i]         = get_int_from_uint8(bq4_0, iqs + i);
 333            u[2 * i + 0] = get_int_from_int8_aligned(q8_1_quant_ptr, iqs + i);
 334            u[2 * i + 1] = get_int_from_int8_aligned(q8_1_quant_ptr, iqs + i + q4_0_traits::qi);
 335        }
 336
 337        return vec_dot_q4_0_q8_1_impl(v, u, d, *q8_1_ds);
 338    };
 339};
 340
 341static inline float vec_dot_q4_K_q8_1_common(const int * __restrict__ q4, const uint16_t * __restrict__ scales,
 342                                             const ggml_half2 & dm, const block_q8_1 * __restrict__ bq8_1,
 343                                             const int &        iqs) {
 344    int   v[2];
 345    int   u[2 * QR4_K];
 346    float d8[QR4_K];
 347
 348    v[0] = q4[0];
 349    v[1] = q4[4];
 350
 351    uint16_t  aux[2];
 352    const int j = (QR4_K * ((iqs / 2) / (QI8_1 / 2))) / 2;
 353    if (j < 2) {
 354        aux[0] = scales[j + 0] & 0x3f3f;
 355        aux[1] = scales[j + 2] & 0x3f3f;
 356    } else {
 357        aux[0] = ((scales[j + 2] >> 0) & 0x0f0f) | ((scales[j - 2] & 0xc0c0) >> 2);
 358        aux[1] = ((scales[j + 2] >> 4) & 0x0f0f) | ((scales[j - 0] & 0xc0c0) >> 2);
 359    }
 360
 361    const uint8_t * sc = (const uint8_t *) aux;
 362    const uint8_t * m  = sc + 2;
 363
 364    const int bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2));
 365
 366    for (int i = 0; i < QR4_K; ++i) {
 367        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
 368        d8[i]                   = bq8i->ds[0];
 369
 370        const int * q8 = (const int *) bq8i->qs + ((iqs / 2) % 4);
 371        u[2 * i + 0]   = q8[0];
 372        u[2 * i + 1]   = q8[4];
 373    }
 374
 375    return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, dm, d8);
 376}
 377
 378template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K> {
 379    static constexpr ggml_type gtype = GGML_TYPE_Q4_K;
 380
 381    using q4_k_block  = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q4_K>;
 382    using q4_k_traits = typename q4_k_block::traits;
 383
 384    __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
 385                                     const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
 386                                     const sycl::half2 * q8_1_ds, const int & iqs) {
 387        const uint8_t *    base           = static_cast<const uint8_t *>(vbq);
 388        const uint8_t *    qs             = base + ibx_offset.first;
 389        const uint8_t *    scs            = base + d_offset.first;
 390        const ggml_half2 * dms            = reinterpret_cast<const ggml_half2 *>(base + d_offset.second);
 391
 392        const int        bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2));
 393        const int *      q4         = (const int *) (qs + 16 * bq8_offset + 4 * ((iqs / 2) % 4));
 394        const uint16_t * scales     = (const uint16_t *) scs;
 395
 396        int   v[2];
 397        int   u[2 * QR4_K];
 398        float d8[QR4_K];
 399
 400        v[0] = q4[0];
 401        v[1] = q4[4];
 402
 403        uint16_t  aux[2];
 404        const int j = (QR4_K * ((iqs / 2) / (QI8_1 / 2))) / 2;
 405        if (j < 2) {
 406            aux[0] = scales[j + 0] & 0x3f3f;
 407            aux[1] = scales[j + 2] & 0x3f3f;
 408        } else {
 409            aux[0] = ((scales[j + 2] >> 0) & 0x0f0f) | ((scales[j - 2] & 0xc0c0) >> 2);
 410            aux[1] = ((scales[j + 2] >> 4) & 0x0f0f) | ((scales[j - 0] & 0xc0c0) >> 2);
 411        }
 412
 413        const uint8_t * sc = (const uint8_t *) aux;
 414        const uint8_t * m  = sc + 2;
 415
 416        for (int i = 0; i < QR4_K; ++i) {
 417            const int8_t* quant_base_ptr = q8_1_quant_ptr + (bq8_offset + i) * QK8_1;
 418            sycl::half2 ds_values = *(q8_1_ds + bq8_offset + i);
 419
 420            d8[i]                   = ds_values[0];
 421
 422            const int * q8 = (const int *) quant_base_ptr + ((iqs / 2) % 4);
 423            u[2 * i + 0]   = q8[0];
 424            u[2 * i + 1]   = q8[4];
 425        }
 426
 427        return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, *dms, d8);
 428    }
 429};
 430
 431template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K> {
 432    static constexpr ggml_type gtype = GGML_TYPE_Q6_K;
 433
 434    using q6_k_block  = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q6_K>;
 435    using q6_k_traits = typename q6_k_block::traits;
 436
 437    __dpct_inline__ float vec_dot_q6_K_q8_1_impl_mmvq(const int vl, const int vh, const int * __restrict__ u,
 438                                                      const int8_t * __restrict__ scales, const float d,
 439                                                      const float * __restrict__ d8) {
 440        float sumf = 0.0f;
 441
 442#pragma unroll
 443        for (int i = 0; i < QR6_K; ++i) {
 444            const int sc = scales[4 * i];
 445
 446            const int vil = (vl >> (4 * i)) & 0x0F0F0F0F;
 447
 448            const int vih = ((vh >> (4 * i)) << 4) & 0x30303030;
 449
 450            const int vi = dpct::vectorized_binary<sycl::char4>((vil | vih), 0x20202020,
 451                                                                dpct::sub_sat());  // vi = (vil | vih) - 32
 452
 453            sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc);                        // SIMD dot product
 454        }
 455
 456        return d * sumf;
 457    }
 458
 459    __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
 460                     const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr, const sycl::half2 * q8_1_ds,
 461                     const int iqs) {
 462        const uint8_t *   base   = static_cast<const uint8_t *>(vbq);
 463        const uint8_t *   ql     = base + ibx_offset.first;
 464        const uint8_t *   qh     = base + ibx_offset.second;
 465        const int8_t *    scales = reinterpret_cast<const int8_t *>(base + d_offset.first);
 466        const ggml_half * d      = (const ggml_half *) (base + d_offset.second);
 467
 468        const int bq8_offset   = 2 * QR6_K * (iqs / (QI6_K / 2)) + (iqs % (QI6_K / 2)) / (QI6_K / 4);
 469        const int scale_offset = (QI6_K / 4) * (iqs / (QI6_K / 2)) + (iqs % (QI6_K / 2)) / (QI6_K / 8);
 470        const int vh_shift     = 2 * ((iqs % (QI6_K / 2)) / (QI6_K / 4));
 471
 472        const int vl = get_int_from_uint8(ql, iqs);
 473        const int vh = get_int_from_uint8(qh, (QI6_K / 4) * (iqs / (QI6_K / 2)) + iqs % (QI6_K / 4)) >> vh_shift;
 474
 475        const int8_t * scs = scales + scale_offset;
 476
 477        int   u[QR6_K];
 478        float d8[QR6_K];
 479
 480#pragma unroll
 481        for (int i = 0; i < QR6_K; ++i) {
 482            u[i] = get_int_from_int8_aligned(q8_1_quant_ptr + (bq8_offset + 2 * i) * QK8_1, iqs % QI8_1);
 483            const sycl::half2 ds_values = *(q8_1_ds + bq8_offset + 2 * i);
 484            d8[i]                       = ds_values[0];
 485        }
 486        return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scs, *d, d8);
 487    }
 488};
 489#define VDR_Q4_0_Q8_1_MMVQ 2
 490#define VDR_Q4_0_Q8_1_MMQ  4
 491
 492template <int vdr>
 493static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int * v, const int * u, const float & d4,
 494                                                    const sycl::half2 & ds8) {
 495    int sumi = 0;
 496#pragma unroll
 497    for (int i = 0; i < vdr; ++i) {
 498        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
 499        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
 500
 501        // SIMD dot product of quantized values
 502        sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi);
 503        sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi);
 504    }
 505
 506    const sycl::float2 ds8f = ds8.convert<float, sycl::rounding_mode::automatic>();
 507
 508    // second part effectively subtracts 8 from each quant value
 509    return d4 * (sumi * ds8f.x() - (8 * vdr / QI4_0) * ds8f.y());
 510}
 511
 512#define VDR_Q4_1_Q8_1_MMVQ 2
 513#define VDR_Q4_1_Q8_1_MMQ  4
 514
 515template <int vdr>
 516static __dpct_inline__ float vec_dot_q4_1_q8_1_impl(const int *v, const int *u,
 517                                                    const sycl::half2 &dm4,
 518                                                    const sycl::half2 &ds8) {
 519
 520    int sumi = 0;
 521
 522#pragma unroll
 523    for (int i = 0; i < vdr; ++i) {
 524        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
 525        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
 526
 527        // SIMD dot product of quantized values
 528        sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi);
 529        sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi);
 530    }
 531
 532#ifdef GGML_SYCL_F16
 533    const sycl::float2 tmp =
 534        (dm4 * ds8).convert<float, sycl::rounding_mode::automatic>();
 535    const float d4d8 = tmp.x();
 536    const float m4s8 = tmp.y();
 537#else
 538    const sycl::float2 dm4f =
 539        dm4.convert<float, sycl::rounding_mode::automatic>();
 540    const sycl::float2 ds8f =
 541        ds8.convert<float, sycl::rounding_mode::automatic>();
 542    const float d4d8 = dm4f.x() * ds8f.x();
 543    const float m4s8 = dm4f.y() * ds8f.y();
 544#endif // GGML_SYCL_F16
 545
 546    // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
 547    return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
 548}
 549
 550#define VDR_Q5_0_Q8_1_MMVQ 2
 551#define VDR_Q5_0_Q8_1_MMQ  4
 552
 553template <int vdr>
 554static __dpct_inline__ float
 555vec_dot_q5_0_q8_1_impl(const int *vl, const int *vh, const int *u,
 556                       const float &d5, const sycl::half2 &ds8) {
 557    int sumi = 0;
 558
 559#pragma unroll
 560    for (int i = 0; i < vdr; ++i) {
 561        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
 562        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
 563        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
 564        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
 565        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
 566        sumi = dpct::dp4a(vi0, u[2 * i + 0],
 567                          sumi); // SIMD dot product of quantized values
 568
 569        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
 570        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
 571        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
 572        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
 573        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
 574        sumi = dpct::dp4a(vi1, u[2 * i + 1],
 575                          sumi); // SIMD dot product of quantized values
 576    }
 577
 578    const sycl::float2 ds8f =
 579        ds8.convert<float, sycl::rounding_mode::automatic>();
 580
 581    // second part effectively subtracts 16 from each quant value
 582    return d5 * (sumi * ds8f.x() - (16 * vdr / QI5_0) * ds8f.y());
 583}
 584
 585#define VDR_Q5_1_Q8_1_MMVQ 2
 586#define VDR_Q5_1_Q8_1_MMQ  4
 587
 588template <int vdr>
 589static __dpct_inline__ float
 590vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u,
 591                       const sycl::half2 &dm5, const sycl::half2 &ds8) {
 592
 593    int sumi = 0;
 594
 595#pragma unroll
 596    for (int i = 0; i < vdr; ++i) {
 597        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
 598        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
 599        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
 600        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
 601        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
 602        sumi = dpct::dp4a(vi0, u[2 * i + 0],
 603                          sumi); // SIMD dot product of quantized values
 604
 605        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
 606        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
 607        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
 608        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
 609        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
 610        sumi = dpct::dp4a(vi1, u[2 * i + 1],
 611                          sumi); // SIMD dot product of quantized values
 612    }
 613
 614#ifdef GGML_SYCL_F16
 615     const sycl::float2 tmp =
 616        (dm5 * ds8).convert<float, sycl::rounding_mode::automatic>();
 617    const float d5d8 = tmp.x();
 618    const float m5s8 = tmp.y();
 619
 620
 621#else
 622    const sycl::float2 dm5f =
 623        dm5.convert<float, sycl::rounding_mode::automatic>();
 624    const sycl::float2 ds8f =
 625        ds8.convert<float, sycl::rounding_mode::automatic>();
 626    const float d5d8 = dm5f.x() * ds8f.x();
 627    const float m5s8 = dm5f.y() * ds8f.y();
 628#endif // GGML_SYCL_F16
 629
 630    // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
 631    return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
 632}
 633
 634#define VDR_Q8_0_Q8_1_MMVQ 2
 635#define VDR_Q8_0_Q8_1_MMQ 8
 636
 637template <int vdr>
 638static __dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int *v, const int *u,
 639                                                    const float &d8_0,
 640                                                    const float &d8_1) {
 641
 642    int sumi = 0;
 643
 644#pragma unroll
 645    for (int i = 0; i < vdr; ++i) {
 646        // SIMD dot product of quantized values
 647        sumi = dpct::dp4a(v[i], u[i], sumi);
 648    }
 649
 650    return d8_0*d8_1 * sumi;
 651}
 652
 653template <int vdr>
 654static __dpct_inline__ float vec_dot_q8_1_q8_1_impl(const int *v, const int *u,
 655                                                    const sycl::half2 &dm8,
 656                                                    const sycl::half2 &ds8) {
 657
 658    int sumi = 0;
 659
 660#pragma unroll
 661    for (int i = 0; i < vdr; ++i) {
 662        // SIMD dot product of quantized values
 663        sumi = dpct::dp4a(v[i], u[i], sumi);
 664    }
 665
 666#ifdef GGML_SYCL_F16
 667    const sycl::float2 tmp =
 668        (dm8 * ds8).convert<float, sycl::rounding_mode::automatic>();
 669    const float d8d8 = tmp.x();
 670    const float m8s8 = tmp.y();
 671#else
 672    const sycl::float2 dm8f =
 673        dm8.convert<float, sycl::rounding_mode::automatic>();
 674    const sycl::float2 ds8f =
 675        ds8.convert<float, sycl::rounding_mode::automatic>();
 676    const float d8d8 = dm8f.x() * ds8f.x();
 677    const float m8s8 = dm8f.y() * ds8f.y();
 678#endif // GGML_SYCL_F16
 679
 680    // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
 681    return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 682}
 683
 684static __dpct_inline__ float
 685vec_dot_q4_0_q8_1(const void *__restrict__ vbq,
 686                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
 687
 688    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
 689
 690    int v[VDR_Q4_0_Q8_1_MMVQ];
 691    int u[2 * VDR_Q4_0_Q8_1_MMVQ];
 692
 693#pragma unroll
 694    for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
 695        v[i]         = get_int_from_uint8(bq4_0->qs, iqs + i);
 696        u[2 * i + 0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
 697        u[2 * i + 1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
 698    }
 699
 700    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
 701}
 702
 703static __dpct_inline__ float
 704vec_dot_q4_1_q8_1(const void *__restrict__ vbq,
 705                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
 706
 707    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
 708
 709    int v[VDR_Q4_1_Q8_1_MMVQ];
 710    int u[2*VDR_Q4_1_Q8_1_MMVQ];
 711
 712#pragma unroll
 713    for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
 714        v[i]    = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
 715        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
 716        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
 717    }
 718
 719    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
 720}
 721
 722#define VDR_MXFP4_Q8_1_MMVQ 2
 723#define VDR_MXFP4_Q8_1_MMQ  4
 724
 725static __dpct_inline__ float vec_dot_mxfp4_q8_1(const void * __restrict__ vbq,
 726                                                const block_q8_1 * __restrict__ bq8_1,
 727                                                const int & iqs) {
 728    const block_mxfp4 * bq4 = (const block_mxfp4 *) vbq;
 729
 730    const int * q8 = (const int *) bq8_1->qs + iqs;
 731
 732    int sumi = 0;
 733#pragma unroll
 734    for (int l = 0; l < VDR_MXFP4_Q8_1_MMVQ; ++l) {
 735        const int aux_q4 = get_int_b1(bq4->qs, iqs + l);
 736        const sycl::int2 v      = get_int_from_table_16(aux_q4, kvalues_mxfp4);
 737        sumi = ggml_sycl_dp4a(v.x(), q8[l + 0], sumi);
 738        sumi = ggml_sycl_dp4a(v.y(), q8[l + 4], sumi);
 739    }
 740
 741    const float d = ggml_sycl_e8m0_to_fp32(bq4->e) * 0.5f * (bq8_1->ds)[0];
 742    return d * sumi;
 743}
 744
 745
 746static __dpct_inline__ float
 747vec_dot_q5_0_q8_1(const void *__restrict__ vbq,
 748                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
 749
 750    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
 751
 752    int vl[VDR_Q5_0_Q8_1_MMVQ];
 753    int vh[VDR_Q5_0_Q8_1_MMVQ];
 754    int  u[2*VDR_Q5_0_Q8_1_MMVQ];
 755
 756#pragma unroll
 757    for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
 758        vl[i]    = get_int_from_uint8(bq5_0->qs, iqs + i);
 759        vh[i]    = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
 760        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
 761        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
 762    }
 763
 764    return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
 765}
 766
 767static __dpct_inline__ float
 768vec_dot_q5_1_q8_1(const void *__restrict__ vbq,
 769                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
 770
 771    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
 772
 773    int vl[VDR_Q5_1_Q8_1_MMVQ];
 774    int vh[VDR_Q5_1_Q8_1_MMVQ];
 775    int  u[2*VDR_Q5_1_Q8_1_MMVQ];
 776
 777#pragma unroll
 778    for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
 779        vl[i]   = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
 780        vh[i]   = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
 781        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
 782        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
 783    }
 784
 785    return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
 786}
 787
 788static __dpct_inline__ float
 789vec_dot_q8_0_q8_1(const void *__restrict__ vbq,
 790                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
 791
 792    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
 793
 794    int v[VDR_Q8_0_Q8_1_MMVQ];
 795    int u[VDR_Q8_0_Q8_1_MMVQ];
 796
 797#pragma unroll
 798    for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
 799        v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
 800        u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
 801    }
 802
 803    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d,
 804                                                      bq8_1->ds[0]);
 805}
 806
 807static __dpct_inline__ float
 808vec_dot_q2_K_q8_1(const void *__restrict__ vbq,
 809                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
 810
 811    const block_q2_K * bq2_K = (const block_q2_K *) vbq;
 812
 813    const int bq8_offset = QR2_K * (iqs / QI8_1);
 814    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
 815
 816    const uint8_t * scales = bq2_K->scales + scale_offset;
 817
 818    const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
 819    int    u[QR2_K];
 820    float d8[QR2_K];
 821
 822#pragma unroll
 823    for (int i = 0; i < QR2_K; ++ i) {
 824        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
 825        d8[i] = bq8_1[bq8_offset + i].ds[0];
 826    }
 827
 828    return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
 829}
 830
 831static __dpct_inline__ float
 832vec_dot_q3_K_q8_1(const void *__restrict__ vbq,
 833                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
 834
 835    const block_q3_K * bq3_K = (const block_q3_K *) vbq;
 836
 837    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
 838    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
 839
 840    const float d = bq3_K->d;
 841
 842    const int vl = get_int_from_uint8(bq3_K->qs, iqs);
 843
 844    // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
 845    const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
 846
 847    int    u[QR3_K];
 848    float d8[QR3_K];
 849
 850#pragma unroll
 851    for (int i = 0; i < QR3_K; ++i) {
 852        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
 853        d8[i] = bq8_1[bq8_offset + i].ds[0];
 854    }
 855
 856    return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
 857}
 858
 859static __dpct_inline__ float vec_dot_q4_K_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1,
 860                                               const int & iqs) {
 861#ifndef GGML_QKK_64
 862
 863    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
 864
 865    const int        bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2));
 866    const int *      q4         = (const int *) (bq4_K->qs + 16 * bq8_offset + 4 * ((iqs / 2) % 4));
 867    const uint16_t * scales     = (const uint16_t *) bq4_K->scales;
 868
 869    return vec_dot_q4_K_q8_1_common(q4, scales, bq4_K->dm, bq8_1, iqs);
 870
 871#else
 872
 873#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
 874    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
 875
 876    float sumf_d = 0.0f;
 877    float sumf_m = 0.0f;
 878
 879    uint16_t aux16[2];
 880    const uint8_t * s = (const uint8_t *)aux16;
 881
 882    const uint16_t * a = (const uint16_t *)bq4_K->scales;
 883    aux16[0] = a[0] & 0x0f0f;
 884    aux16[1] = (a[0] >> 4) & 0x0f0f;
 885
 886    const float dall = bq4_K->dm[0];
 887    const float dmin = bq4_K->dm[1];
 888
 889    const float d8_1 = bq8_1[0].ds[0];
 890    const float d8_2 = bq8_1[1].ds[1];
 891
 892    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
 893    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
 894    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
 895    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
 896
 897    const int * q4 = (const int *)bq4_K->qs + (iqs/2);
 898    const int v1 = q4[0];
 899    const int v2 = q4[4];
 900
 901    const int dot1 = dpct::dp4a(ui2, v2 & 0x0f0f0f0f, dpct::dp4a(ui1, v1 & 0x0f0f0f0f, 0));
 902    const int dot2 = dpct::dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, dpct::dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
 903    const int dot3 = dpct::dp4a(0x01010101, ui2, dpct::dp4a(0x01010101, ui1, 0));
 904    const int dot4 = dpct::dp4a(0x01010101, ui4, dpct::dp4a(0x01010101, ui3, 0));
 905
 906    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
 907    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
 908
 909    return dall * sumf_d - dmin * sumf_m;
 910
 911#else
 912    bad_arch();
 913#endif // __SYCL_ARCH__ >= VER_4VEC
 914
 915#endif
 916}
 917
 918static __dpct_inline__ float
 919vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
 920                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
 921
 922#ifndef GGML_QKK_64
 923    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
 924
 925    int   vl[2];
 926    int   vh[2];
 927    int    u[2*QR5_K];
 928    float d8[QR5_K];
 929
 930    const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
 931    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
 932    const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));
 933
 934    vl[0] = ql[0];
 935    vl[1] = ql[4];
 936
 937    vh[0] = qh[0] >> bq8_offset;
 938    vh[1] = qh[4] >> bq8_offset;
 939
 940    const uint16_t * scales = (const uint16_t *)bq5_K->scales;
 941    uint16_t aux[2];
 942    const int j = bq8_offset/2;
 943    if (j < 2) {
 944        aux[0] = scales[j+0] & 0x3f3f;
 945        aux[1] = scales[j+2] & 0x3f3f;
 946    } else {
 947        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
 948        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
 949    }
 950    const uint8_t * sc = (const uint8_t *)aux;
 951    const uint8_t * m  = sc + 2;
 952
 953#pragma unroll
 954    for (int i = 0; i < QR5_K; ++i) {
 955        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
 956        d8[i] = bq8i->ds[0];
 957
 958        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
 959        u[2*i+0] = q8[0];
 960        u[2*i+1] = q8[4];
 961    }
 962
 963    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
 964
 965#else
 966
 967#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
 968    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
 969
 970    const int8_t * s = bq5_K->scales;
 971
 972    const float d = bq5_K->d;
 973
 974    const float d8_1 = bq8_1[0].ds[0];
 975    const float d8_2 = bq8_1[1].ds[1];
 976
 977    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
 978    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
 979    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
 980    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
 981
 982    const int * ql = (const int *)bq5_K->qs + (iqs/2);
 983    const int vl1 = ql[0];
 984    const int vl2 = ql[4];
 985
 986    const int step = 4 * (iqs/2); // 0, 4, 8, 12
 987    const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
 988    const int in = step%8; // 0, 4, 0, 4
 989    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
 990
 991    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
 992    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
 993    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
 994    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
 995
 996    const float sumf_d = d8_1 * (dpct::dp4a(ui1, v1, 0) * s[0] + dpct::dp4a(ui2, v2, 0) * s[1])
 997                       + d8_2 * (dpct::dp4a(ui3, v3, 0) * s[2] + dpct::dp4a(ui4, v4, 0) * s[3]);
 998
 999    return d * sumf_d;
1000
1001#else
1002    bad_arch();
1003#endif // __SYCL_ARCH__ >= VER_4VEC
1004
1005#endif
1006}
1007
1008static __dpct_inline__ float
1009vec_dot_q6_K_q8_1(const void *__restrict__ vbq,
1010                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
1011
1012    const block_q6_K * bq6_K = (const block_q6_K *) vbq;
1013
1014    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
1015    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
1016    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
1017
1018    const int vl = get_int_from_uint8(bq6_K->ql, iqs);
1019    const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;
1020
1021    const int8_t * scales = bq6_K->scales + scale_offset;
1022
1023    int    u[QR6_K];
1024    float d8[QR6_K];
1025
1026#pragma unroll
1027    for (int i = 0; i < QR6_K; ++i) {
1028        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
1029        d8[i] = bq8_1[bq8_offset + 2 * i].ds[0];
1030    }
1031
1032    return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
1033}
1034
1035
1036static __dpct_inline__ float
1037vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
1038                     const block_q8_1 *__restrict__ bq8_1, const int &iqs,
1039                     const uint64_t *iq2xxs_grid, const uint8_t *ksigns_iq2xs,
1040                     const uint8_t *kmask_iq2xs) {
1041#if QK_K == 256
1042    const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
1043
1044    const int ib32 = iqs;
1045    const uint16_t * q2 = bq2->qs + 4*ib32;
1046    const uint8_t  * aux8 = (const uint8_t *)q2;
1047    const int8_t   * q8 = bq8_1[ib32].qs;
1048    uint32_t aux32 = q2[2] | (q2[3] << 16);
1049    int sumi = 0;
1050    for (int l = 0; l < 4; ++l) {
1051        const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
1052        const uint8_t  signs = ksigns_iq2xs[aux32 & 127];
1053        for (int j = 0; j < 8; ++j) {
1054            sumi += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
1055        }
1056        q8 += 8;
1057        aux32 >>= 7;
1058    }
1059    const float d = (float)bq2->d * (0.5f + aux32) * bq8_1[ib32].ds[0] * 0.25f;
1060    return d * sumi;
1061#else
1062    assert(false);
1063    return 0.f;
1064#endif
1065}
1066
1067static __dpct_inline__ float
1068vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
1069                    const block_q8_1 *__restrict__ bq8_1, const int &iqs,
1070                    const uint64_t *iq2xs_grid, const uint64_t *ksigns64) {
1071#if DPCT_COMPATIBILITY_TEMP >=                                                 \
1072    MIN_CC_DP4A // lowest compute capability for integer intrinsics
1073#if QK_K == 256
1074    const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
1075
1076    const int ib32 = iqs;
1077    const uint16_t * q2 = bq2->qs + 4*ib32;
1078    const int8_t   * q8 = bq8_1[ib32].qs;
1079    const uint8_t ls1 = bq2->scales[ib32] & 0xf;
1080    const uint8_t ls2 = bq2->scales[ib32] >>  4;
1081    int sumi1 = 0;
1082    for (int l = 0; l < 2; ++l) {
1083        const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
1084        const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
1085        const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
1086            grid[0] ^ signs[0], signs[0], std::minus<>());
1087        const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
1088            grid[1] ^ signs[1], signs[1], std::minus<>());
1089        sumi1 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi1);
1090        sumi1 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi1);
1091        q8 += 8;
1092    }
1093    int sumi2 = 0;
1094    for (int l = 2; l < 4; ++l) {
1095        const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
1096        const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
1097        const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
1098            grid[0] ^ signs[0], signs[0], std::minus<>());
1099        const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
1100            grid[1] ^ signs[1], signs[1], std::minus<>());
1101        sumi2 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi2);
1102        sumi2 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi2);
1103        q8 += 8;
1104    }
1105    const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
1106    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
1107#else
1108    assert(false);
1109    return 0.f;
1110#endif
1111#else
1112    assert(false);
1113    return 0.f;
1114#endif
1115}
1116
1117static __dpct_inline__ float
1118vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
1119                   const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
1120#if QK_K == 256
1121    const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
1122
1123    const int ib32 = iqs;
1124    const int8_t  * q8 = bq8_1[ib32].qs;
1125    const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32;
1126    const uint8_t ls1 = bq2->scales[ib32] & 0xf;
1127    const uint8_t ls2 = bq2->scales[ib32] >>  4;
1128    int sumi1 = 0;
1129    for (int l = 0; l < 2; ++l) {
1130        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
1131        const uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
1132            ((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201,
1133            std::equal_to<>());
1134        const uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
1135            ((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201,
1136            std::equal_to<>());
1137        const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
1138            grid[0] ^ signs0, signs0, std::minus<>());
1139        const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
1140            grid[1] ^ signs1, signs1, std::minus<>());
1141        sumi1 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi1);
1142        sumi1 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi1);
1143        q8 += 8;
1144    }
1145    int sumi2 = 0;
1146    for (int l = 2; l < 4; ++l) {
1147        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
1148        const uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
1149            ((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201,
1150            std::equal_to<>());
1151        const uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
1152            ((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201,
1153            std::equal_to<>());
1154        const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
1155            grid[0] ^ signs0, signs0, std::minus<>());
1156        const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
1157            grid[1] ^ signs1, signs1, std::minus<>());
1158        sumi2 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi2);
1159        sumi2 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi2);
1160        q8 += 8;
1161    }
1162    const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
1163    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
1164#else
1165    assert(false);
1166#endif
1167}
1168
1169static __dpct_inline__ float
1170vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
1171                     const block_q8_1 *__restrict__ bq8_1, const int &iqs,
1172                     const uint32_t *iq3xxs_grid, const uint64_t *ksigns64) {
1173#if DPCT_COMPATIBILITY_TEMP >=                                                 \
1174    MIN_CC_DP4A // lowest compute capability for integer intrinsics
1175#if QK_K == 256
1176    const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
1177
1178    const int ib32 = iqs;
1179    const uint8_t  * q3 = bq2->qs + 8*ib32;
1180    const uint16_t * gas = (const uint16_t *)(bq2->qs + QK_K/4) + 2*ib32;
1181    const int8_t   * q8 = bq8_1[ib32].qs;
1182    uint32_t aux32 = gas[0] | (gas[1] << 16);
1183    int sumi = 0;
1184    for (int l = 0; l < 4; ++l) {
1185        const uint32_t * grid1 = iq3xxs_grid + q3[2*l+0];
1186        const uint32_t * grid2 = iq3xxs_grid + q3[2*l+1];
1187        const uint32_t * signs = (const uint32_t *)(ksigns64 + (aux32 & 127));
1188        const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
1189            grid1[0] ^ signs[0], signs[0], std::minus<>());
1190        const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
1191            grid2[0] ^ signs[1], signs[1], std::minus<>());
1192        sumi = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi);
1193        sumi = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi);
1194        q8 += 8;
1195        aux32 >>= 7;
1196    }
1197    const float d = (float)bq2->d * (0.5f + aux32) * bq8_1[ib32].ds[0] * 0.5f;
1198    return d * sumi;
1199#else
1200    assert(false);
1201    return 0.f;
1202#endif
1203#else
1204    assert(false);
1205    return 0.f;
1206#endif
1207}
1208
1209static __dpct_inline__ float
1210vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
1211                   const block_q8_1 *__restrict__ bq8_1, const int &iqs,
1212                   const uint32_t *iq3s_grid) {
1213#if QK_K == 256
1214    const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
1215
1216    const int ib32 = iqs;
1217    const uint8_t  * qs = bq2->qs + 8*ib32;
1218    const int8_t   * q8 = bq8_1[ib32].qs;
1219    int sumi = 0;
1220    for (int l = 0; l < 4; ++l) {
1221        const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
1222        const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
1223        uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
1224            ((bq2->signs[4 * ib32 + l] & 0xf) * 0x01010101) & 0x08040201,
1225            0x08040201, std::equal_to<>());
1226        uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
1227            ((bq2->signs[4 * ib32 + l] >> 4) * 0x01010101) & 0x08040201,
1228            0x08040201, std::equal_to<>());
1229        const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
1230            grid1[0] ^ signs0, signs0, std::minus<>());
1231        const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
1232            grid2[0] ^ signs1, signs1, std::minus<>());
1233        sumi = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi);
1234        sumi = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi);
1235        q8 += 8;
1236    }
1237    const float d =
1238        (float)bq2->d *
1239        (1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) *
1240        bq8_1[ib32].ds[0];
1241    return d * sumi;
1242#else
1243    assert(false);
1244#endif
1245}
1246
1247static __dpct_inline__ float
1248vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
1249                   const block_q8_1 *__restrict__ bq8_1, const int &iqs,
1250                   const uint32_t *iq1s_grid_gpu) {
1251#if QK_K == 256
1252    const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
1253
1254    const int ib32 = iqs;
1255    int sumi = 0;
1256    const int * q8 = (const int *)bq8_1[ib32].qs;
1257    for (int l = 0; l < 4; ++l) {
1258        const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
1259        int grid0 = grid[0] & 0x0f0f0f0f;
1260        int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
1261        sumi = dpct::dp4a(q8[2 * l + 1], grid1,
1262                          dpct::dp4a(q8[2 * l + 0], grid0, sumi));
1263    }
1264
1265    const float delta = bq1->qh[ib32] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA;
1266    const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1);
1267    const float d = d1q * bq8_1[ib32].ds[0];
1268    const float m = d1q * bq8_1[ib32].ds[1];
1269    return d * sumi + m * delta;
1270#else
1271    assert(false);
1272#endif
1273}
1274
1275static __dpct_inline__ float
1276vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
1277                   const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
1278#if QK_K == 256
1279    const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
1280
1281    const int ib32 = iqs;
1282    int   sumi[2] = {0, 0};
1283    float sumf[2] = {0.f, 0.f};
1284
1285    const int * q8 = (const int *)bq8_1[ib32].qs;
1286    for (int l = 0; l < 4; ++l) {
1287        const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 7) << 8)));
1288        int grid0 = grid[0] & 0x0f0f0f0f;
1289        int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
1290        sumi[l / 2] = dpct::dp4a(q8[2 * l + 1], grid1,
1291                                 dpct::dp4a(q8[2 * l + 0], grid0, sumi[l / 2]));
1292        const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA;
1293        const int sumy = dpct::dp4a(q8[2 * l + 1], 0x01010101,
1294                                    dpct::dp4a(q8[2 * l + 0], 0x01010101, 0));
1295        sumf[l/2] += delta*sumy;
1296    }
1297
1298    iq1m_scale_t scale;
1299    const uint16_t * sc = (const uint16_t *)bq1->scales;
1300    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
1301    const float d = (float)scale.f16 * bq8_1[ib32].ds[0];
1302    return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
1303#else
1304    assert(false);
1305#endif
1306}
1307
1308
1309static __dpct_inline__ float
1310vec_dot_iq4_nl_q8_1(const void *__restrict__ vbq,
1311                    const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
1312
1313    const block_iq4_nl * bq = (const block_iq4_nl *) vbq;
1314
1315    const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs;
1316    const int32_t  * q8 = (const int32_t  *)bq8_1->qs + iqs;
1317
1318    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
1319
1320    int v1, v2;
1321    int sumi1 = 0, sumi2 = 0;
1322    for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
1323        const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16);
1324        get_int_from_table_16(aux, values, v1, v2);
1325        sumi1 = dpct::dp4a(v1, q8[l + 0], sumi1);
1326        sumi2 = dpct::dp4a(v2, q8[l + 4], sumi2);
1327    }
1328
1329    const float d = (float)bq->d * bq8_1->ds[0];
1330    return d * (sumi1 + sumi2);
1331}
1332
1333
1334static __dpct_inline__ float
1335vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
1336                    const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
1337
1338#if QK_K == 256
1339    const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
1340    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
1341
1342    // iqs is 0...7
1343    const int ib32 = iqs;
1344    const int32_t  * q8 = (const int *)bq8_1[ib32].qs;
1345    const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32;
1346    const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
1347    const float d = (float)bq4->d * (ls - 32) * bq8_1[ib32].ds[0];
1348    int v1, v2;
1349    int sumi1 = 0, sumi2 = 0;
1350    for (int j = 0; j < 4; ++j) {
1351        get_int_from_table_16(q4[j], values, v1, v2);
1352        sumi1 = dpct::dp4a(v1, q8[j + 0], sumi1);
1353        sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2);
1354    }
1355    return d * (sumi1 + sumi2);
1356#else
1357    assert(false);
1358#endif
1359}
1360
1361#endif // GGML_SYCL_VECDOTQ_HPP