//
// MIT license
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: MIT
//

//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//

#ifndef GGML_SYCL_VECDOTQ_HPP
#define GGML_SYCL_VECDOTQ_HPP

#include "dpct/helper.hpp"
#include "ggml.h"
#include "quants.hpp"

typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1,
                                  const int & iqs);

static __dpct_inline__ int get_int_b1(const void * x, const int & i32) {
    const uint8_t * x8 = (const uint8_t *) x;

    int x32 = x8[4*i32 + 0] << 0;
    x32 |= x8[4*i32 + 1] << 8;
    x32 |= x8[4*i32 + 2] << 16;
    x32 |= x8[4*i32 + 3] << 24;

    return x32;
}
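
// For example, for bytes x8[0..3] = {0x01, 0x02, 0x03, 0x04} and i32 == 0 this
// assembles 0x04030201. Shifting each byte into place explicitly makes the
// result endianness-independent and imposes no alignment requirement, unlike
// the *_aligned helpers below.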

static __dpct_inline__ int get_int_from_int8(const int8_t* x8, const int& i32) {
    const uint16_t* x16 =
        (const uint16_t*)(x8 + sizeof(int) * i32); // assume at least 2 byte alignment

    int x32 = 0;
    x32 |= x16[0] << 0;
    x32 |= x16[1] << 16;

    return x32;
}

static __dpct_inline__ int get_int_from_uint8(
    const uint8_t* x8,
    const int& i32) {
    const uint16_t* x16 =
        (const uint16_t*)(x8 + sizeof(int) * i32); // assume at least 2 byte alignment

    int x32 = 0;
    x32 |= x16[0] << 0;
    x32 |= x16[1] << 16;

    return x32;
}

static __dpct_inline__ int get_int_from_int8_aligned(
    const int8_t* x8,
    const int& i32) {
    return *(
        (const int*)(x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
}

static __dpct_inline__ int get_int_from_uint8_aligned(
    const uint8_t* x8,
    const int& i32) {
    return *(
        (const int*)(x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
}
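
// The helpers above read the same 4 bytes but differ in their alignment
// contract: the plain variants assemble the int from two 16-bit loads (2-byte
// alignment suffices), while the *_aligned variants use a single 4-byte load
// and therefore require the pointer plus offset to be int-aligned, which holds
// for the qs arrays of block_q8_1.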

static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4,
                                                  const uint8_t *values,
                                                  int &val1, int &val2) {
    uint32_t aux32;
    const uint8_t * q8 = (const uint8_t *)&aux32;

    aux32 = q4 & 0x0f0f0f0f;
    uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8);
    uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8);
    val1 = v1 | (v2 << 16);

    aux32 = (q4 >> 4) & 0x0f0f0f0f;
    v1 = values[q8[0]] | (values[q8[1]] << 8);
    v2 = values[q8[2]] | (values[q8[3]] << 8);
    val2 = v1 | (v2 << 16);
}

static __dpct_inline__ sycl::int2 get_int_from_table_16(
    const int& q4, const int8_t* table) {
    const uint32_t* table32 = (const uint32_t*)table;
    uint32_t tmp[2];
    const uint32_t low_high_selection_indices =
        (0x32103210 | ((q4 & 0x88888888) >> 1));
#pragma unroll
    for (uint32_t i = 0; i < 2; ++i) {
        const uint32_t shift = 16 * i;

        const uint32_t low =
            dpct::byte_level_permute(table32[0], table32[1], q4 >> shift);
        const uint32_t high =
            dpct::byte_level_permute(table32[2], table32[3], q4 >> shift);
        tmp[i] = dpct::byte_level_permute(
            low, high, low_high_selection_indices >> shift);
    }
    return sycl::int2(
        dpct::byte_level_permute(tmp[0], tmp[1], 0x6420),
        dpct::byte_level_permute(tmp[0], tmp[1], 0x7531));
}
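
// Both overloads expand eight packed 4-bit indices into eight bytes looked up
// from a 16-entry table. The first overload walks the table byte by byte; the
// second treats the table as four 32-bit words and uses byte shuffles instead:
// each nibble of q4 selects a byte within a word pair, and bit 3 of the nibble
// (isolated by q4 & 0x88888888) flips the selector to the upper table half via
// low_high_selection_indices.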

#define VDR_Q2_K_Q8_1_MMVQ 1

// contiguous v/x values
static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmvq(
    const int &v, const int *__restrict__ u, const uint8_t *__restrict__ scales,
    const sycl::half2 &dm2, const float *__restrict__ d8) {

    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

#pragma unroll
    for (int i = 0; i < QR2_K; ++i) {
        const int sc = scales[2*i];

        const int vi = (v >> (2*i)) & 0x03030303;

        sumf_d +=
            d8[i] * (dpct::dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product

        // fill int with 4x m
        int m = sc >> 4;
        m |= m << 8;
        m |= m << 16;
        sumf_m += d8[i] *
                  dpct::dp4a(
                      m, u[i],
                      0); // multiply constant q2_K part with sum of q8_1 values
    }

    const sycl::float2 dm2f =
        dm2.convert<float, sycl::rounding_mode::automatic>();

    return dm2f.x() * sumf_d - dm2f.y() * sumf_m;
}
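
// Worked equation: q2_K dequantizes as x = d * sc * q - dmin * m per 16-value
// group, so with the q8_1 values u the dot product splits into
//   sum(x*y) = d * sum_i d8[i] * sc_i * dp4a(q_i, u_i)
//            - dmin * sum_i d8[i] * m_i * sum(u_i),
// which is exactly dm2f.x() * sumf_d - dm2f.y() * sumf_m. The dp4a against the
// byte-broadcast m evaluates m_i * sum(u_i) in a single instruction.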

#define VDR_Q3_K_Q8_1_MMVQ 1

// contiguous v/x values
static __dpct_inline__ float vec_dot_q3_K_q8_1_impl_mmvq(
    const int &vl, const int &vh, const int *__restrict__ u,
    const uint8_t *__restrict__ scales, const int &scale_offset,
    const float &d3, const float *__restrict__ d8) {

    float sumf = 0.0f;

#pragma unroll
    for (int i = 0; i < QR3_K; ++i) {
        const int isc = scale_offset + 2*i;

        const int isc_low = isc % (QK_K/32);
        const int sc_shift_low = 4 * (isc / (QK_K/32));
        const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;

        const int isc_high = isc % (QK_K/64);
        const int sc_shift_high = 2 * (isc / (QK_K/64));
        const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;

        const int sc = (sc_low | sc_high) - 32;

        const int vil = (vl >> (2*i)) & 0x03030303;

        const int vih = ((vh >> i) << 2) & 0x04040404;

        const int vi =
            dpct::vectorized_binary<sycl::char4>(vil, vih, dpct::sub_sat());

        sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product
    }

    return d3 * sumf;
}
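
// q3_K scales are 6-bit: the low 4 bits live in scales[0..7] and the high 2
// bits in scales[8..11]; sc_low/sc_high reassemble them and the -32 recenters
// the value. The high quant bit arrives pre-inverted (see vec_dot_q3_K_q8_1),
// so the saturating subtraction vi = vil - vih maps each 2-bit value plus its
// high bit into the signed range -4..3.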

#define VDR_Q4_K_Q8_1_MMVQ 2

// contiguous v/x values
static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_vmmq(
    const int *__restrict__ v, const int *__restrict__ u,
    const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m,
    const sycl::half2 &dm4, const float *__restrict__ d8) {

    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

#pragma unroll
    for (int i = 0; i < QR4_K; ++i) {
        const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
        const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;

        const int dot1 =
            dpct::dp4a(v1i, u[2 * i + 1],
                       dpct::dp4a(v0i, u[2 * i + 0], 0)); // SIMD dot product
        const int dot2 =
            dpct::dp4a(0x01010101, u[2 * i + 1],
                       dpct::dp4a(0x01010101, u[2 * i + 0], 0)); // sum of u

        sumf_d += d8[i] * (dot1 * sc[i]);
        sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
    }

    const sycl::float2 dm4f =
        dm4.convert<float, sycl::rounding_mode::automatic>();

    return dm4f.x() * sumf_d - dm4f.y() * sumf_m;
}
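
// Same super-block structure as q2_K: q4_K dequantizes as x = d * sc * q
// - dmin * m, giving d * sumf_d - dmin * sumf_m, where dp4a(0x01010101, u, 0)
// computes the byte sum of u needed for the min term in one instruction.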

#define VDR_Q5_K_Q8_1_MMVQ 2

// contiguous v/x values
static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_vmmq(
    const int *__restrict__ vl, const int *__restrict__ vh,
    const int *__restrict__ u, const uint8_t *__restrict__ sc,
    const uint8_t *__restrict__ m, const sycl::half2 &dm5,
    const float *__restrict__ d8) {

    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

#pragma unroll
    for (int i = 0; i < QR5_K; ++i) {
        const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
        const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;

        const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
        const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;

        const int v0i = vl0i | vh0i;
        const int v1i = vl1i | vh1i;

        const int dot1 =
            dpct::dp4a(v0i, u[2 * i + 0],
                       dpct::dp4a(v1i, u[2 * i + 1], 0)); // SIMD dot product
        const int dot2 =
            dpct::dp4a(0x01010101, u[2 * i + 0],
                       dpct::dp4a(0x01010101, u[2 * i + 1], 0)); // sum of u

        sumf_d += d8[i] * (dot1 * sc[i]);
        sumf_m += d8[i] * (dot2 * m[i]);
    }

    const sycl::float2 dm5f =
        dm5.convert<float, sycl::rounding_mode::automatic>();

    return dm5f.x() * sumf_d - dm5f.y() * sumf_m;
}

#define VDR_Q6_K_Q8_1_MMVQ 1

// contiguous v/x values
static __dpct_inline__ float
vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh,
                            const int *__restrict__ u,
                            const int8_t *__restrict__ scales, const float &d,
                            const float *__restrict__ d8) {

    float sumf = 0.0f;

#pragma unroll
    for (int i = 0; i < QR6_K; ++i) {
        const int sc = scales[4*i];

        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;

        const int vih = ((vh >> (4*i)) << 4) & 0x30303030;

        const int vi = dpct::vectorized_binary<sycl::char4>(
            (vil | vih), 0x20202020, dpct::sub_sat()); // vi = (vil | vih) - 32

        sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product
    }

    return d*sumf;
}
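
// q6_K splits each 6-bit quant into 4 low bits (ql) and 2 high bits (qh);
// vil | vih reassembles a value in 0..63 and the saturating subtraction of
// 0x20202020 recenters every byte to -32..31 before the dp4a.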

// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q

template <ggml_type T> struct reorder_vec_dot_q_sycl {
    static_assert(T != T, "ggml_type for reorder vecdot not implemented");
};

template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0> {
    static constexpr ggml_type gtype = GGML_TYPE_Q4_0;

    using q4_0_block = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q4_0>;
    using q4_0_traits = typename q4_0_block::traits;

    __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int * v, const int * u, const float & d4, const sycl::half2 & ds8) {
        int sumi = 0;

#pragma unroll
        for (size_t i = 0; i < q4_0_traits::vdr_mmvq; ++i) {
            const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
            const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;

            // SIMD dot product of quantized values
            sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi);
            sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi);
        }

        const sycl::float2 ds8f = ds8.convert<float, sycl::rounding_mode::automatic>();

        // second part effectively subtracts 8 from each quant value
        return d4 * (sumi * ds8f.x() - (8 * q4_0_traits::vdr_mmvq / q4_0_traits::qi) * ds8f.y());
    }

    __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
                                     const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
                                     const sycl::half2 * q8_1_ds, const int & iqs) {
        const uint8_t * bq4_0 = static_cast<const uint8_t *>(vbq) + ibx_offset.first;
        const ggml_half d = *(reinterpret_cast<const ggml_half *>(static_cast<const uint8_t *>(vbq) + d_offset.first));
        int v[q4_0_traits::vdr_mmvq];
        int u[2 * q4_0_traits::vdr_mmvq];

#pragma unroll
        for (size_t i = 0; i < q4_0_traits::vdr_mmvq; ++i) {
            v[i] = get_int_from_uint8(bq4_0, iqs + i);
            u[2 * i + 0] = get_int_from_int8_aligned(q8_1_quant_ptr, iqs + i);
            u[2 * i + 1] = get_int_from_int8_aligned(q8_1_quant_ptr, iqs + i + q4_0_traits::qi);
        }

        return vec_dot_q4_0_q8_1_impl(v, u, d, *q8_1_ds);
    }
};
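
// Illustrative call shape for the reordered path (the offsets here are
// hypothetical; the real values come from the reordered buffer layout set up
// by the MMVQ kernel):
//
//   reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0> vdot;
//   float partial = vdot(weights_base, {qs_byte_offset, 0},
//                        {d_byte_offset, 0}, q8_quants, q8_ds, iqs);
//
// Unlike the block_q4_0 entry point further down, the functor receives byte
// offsets into one flat buffer, with the quant data and the d scales addressed
// separately through ibx_offset/d_offset.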

static inline float vec_dot_q4_K_q8_1_common(const int * __restrict__ q4, const uint16_t * __restrict__ scales,
                                             const ggml_half2 & dm, const block_q8_1 * __restrict__ bq8_1,
                                             const int & iqs) {
    int v[2];
    int u[2 * QR4_K];
    float d8[QR4_K];

    v[0] = q4[0];
    v[1] = q4[4];

    uint16_t aux[2];
    const int j = (QR4_K * ((iqs / 2) / (QI8_1 / 2))) / 2;
    if (j < 2) {
        aux[0] = scales[j + 0] & 0x3f3f;
        aux[1] = scales[j + 2] & 0x3f3f;
    } else {
        aux[0] = ((scales[j + 2] >> 0) & 0x0f0f) | ((scales[j - 2] & 0xc0c0) >> 2);
        aux[1] = ((scales[j + 2] >> 4) & 0x0f0f) | ((scales[j - 0] & 0xc0c0) >> 2);
    }

    const uint8_t * sc = (const uint8_t *) aux;
    const uint8_t * m = sc + 2;

    const int bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2));

    for (int i = 0; i < QR4_K; ++i) {
        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
        d8[i] = bq8i->ds[0];

        const int * q8 = (const int *) bq8i->qs + ((iqs / 2) % 4);
        u[2 * i + 0] = q8[0];
        u[2 * i + 1] = q8[4];
    }

    return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, dm, d8);
}
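
// The aux[] logic unpacks q4_K's 12-byte scale field, which packs eight 6-bit
// scale/min pairs: for the first half (j < 2) the 6-bit values sit directly in
// the low bits of each byte, while for the second half they are reassembled
// from a 4-bit low part and the 2 high bits recovered via the 0xc0c0 masks.
// Afterwards sc points at two consecutive scales and m at the matching mins.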

template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K> {
    static constexpr ggml_type gtype = GGML_TYPE_Q4_K;

    using q4_k_block = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q4_K>;
    using q4_k_traits = typename q4_k_block::traits;

    __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
                                     const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
                                     const sycl::half2 * q8_1_ds, const int & iqs) {
        const uint8_t * base = static_cast<const uint8_t *>(vbq);
        const uint8_t * qs = base + ibx_offset.first;
        const uint8_t * scs = base + d_offset.first;
        const ggml_half2 * dms = reinterpret_cast<const ggml_half2 *>(base + d_offset.second);

        const int bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2));
        const int * q4 = (const int *) (qs + 16 * bq8_offset + 4 * ((iqs / 2) % 4));
        const uint16_t * scales = (const uint16_t *) scs;

        int v[2];
        int u[2 * QR4_K];
        float d8[QR4_K];

        v[0] = q4[0];
        v[1] = q4[4];

        uint16_t aux[2];
        const int j = (QR4_K * ((iqs / 2) / (QI8_1 / 2))) / 2;
        if (j < 2) {
            aux[0] = scales[j + 0] & 0x3f3f;
            aux[1] = scales[j + 2] & 0x3f3f;
        } else {
            aux[0] = ((scales[j + 2] >> 0) & 0x0f0f) | ((scales[j - 2] & 0xc0c0) >> 2);
            aux[1] = ((scales[j + 2] >> 4) & 0x0f0f) | ((scales[j - 0] & 0xc0c0) >> 2);
        }

        const uint8_t * sc = (const uint8_t *) aux;
        const uint8_t * m = sc + 2;

        for (int i = 0; i < QR4_K; ++i) {
            const int8_t* quant_base_ptr = q8_1_quant_ptr + (bq8_offset + i) * QK8_1;
            sycl::half2 ds_values = *(q8_1_ds + bq8_offset + i);

            d8[i] = ds_values[0];

            const int * q8 = (const int *) quant_base_ptr + ((iqs / 2) % 4);
            u[2 * i + 0] = q8[0];
            u[2 * i + 1] = q8[4];
        }

        return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, *dms, d8);
    }
};

template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K> {
    static constexpr ggml_type gtype = GGML_TYPE_Q6_K;

    using q6_k_block = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q6_K>;
    using q6_k_traits = typename q6_k_block::traits;

    __dpct_inline__ float vec_dot_q6_K_q8_1_impl_mmvq(const int vl, const int vh, const int * __restrict__ u,
                                                      const int8_t * __restrict__ scales, const float d,
                                                      const float * __restrict__ d8) {
        float sumf = 0.0f;

#pragma unroll
        for (int i = 0; i < QR6_K; ++i) {
            const int sc = scales[4 * i];

            const int vil = (vl >> (4 * i)) & 0x0F0F0F0F;

            const int vih = ((vh >> (4 * i)) << 4) & 0x30303030;

            const int vi = dpct::vectorized_binary<sycl::char4>((vil | vih), 0x20202020,
                                                                dpct::sub_sat()); // vi = (vil | vih) - 32

            sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product
        }

        return d * sumf;
    }

    __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
                                     const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
                                     const sycl::half2 * q8_1_ds, const int iqs) {
        const uint8_t * base = static_cast<const uint8_t *>(vbq);
        const uint8_t * ql = base + ibx_offset.first;
        const uint8_t * qh = base + ibx_offset.second;
        const int8_t * scales = reinterpret_cast<const int8_t *>(base + d_offset.first);
        const ggml_half * d = (const ggml_half *) (base + d_offset.second);

        const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K / 2)) + (iqs % (QI6_K / 2)) / (QI6_K / 4);
        const int scale_offset = (QI6_K / 4) * (iqs / (QI6_K / 2)) + (iqs % (QI6_K / 2)) / (QI6_K / 8);
        const int vh_shift = 2 * ((iqs % (QI6_K / 2)) / (QI6_K / 4));

        const int vl = get_int_from_uint8(ql, iqs);
        const int vh = get_int_from_uint8(qh, (QI6_K / 4) * (iqs / (QI6_K / 2)) + iqs % (QI6_K / 4)) >> vh_shift;

        const int8_t * scs = scales + scale_offset;

        int u[QR6_K];
        float d8[QR6_K];

#pragma unroll
        for (int i = 0; i < QR6_K; ++i) {
            u[i] = get_int_from_int8_aligned(q8_1_quant_ptr + (bq8_offset + 2 * i) * QK8_1, iqs % QI8_1);
            const sycl::half2 ds_values = *(q8_1_ds + bq8_offset + 2 * i);
            d8[i] = ds_values[0];
        }
        return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scs, *d, d8);
    }
};

#define VDR_Q4_0_Q8_1_MMVQ 2
#define VDR_Q4_0_Q8_1_MMQ 4

template <int vdr>
static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int * v, const int * u, const float & d4,
                                                    const sycl::half2 & ds8) {
    int sumi = 0;
#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;

        // SIMD dot product of quantized values
        sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi);
        sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi);
    }

    const sycl::float2 ds8f = ds8.convert<float, sycl::rounding_mode::automatic>();

    // second part effectively subtracts 8 from each quant value
    return d4 * (sumi * ds8f.x() - (8 * vdr / QI4_0) * ds8f.y());
}
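
// Derivation of the correction term: q4_0 dequantizes as x = d4 * (q - 8), so
//   sum(x*y) = d4 * d8 * sum(q * q8) - 8 * d4 * (d8 * sum(q8)),
// and block_q8_1 stores ds8 = {d8, d8 * sum(q8)}. Each thread only covers vdr
// of the QI4_0 ints in the block, so it subtracts its vdr/QI4_0 share of the
// total correction, hence the (8 * vdr / QI4_0) * ds8f.y() term.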

#define VDR_Q4_1_Q8_1_MMVQ 2
#define VDR_Q4_1_Q8_1_MMQ 4

template <int vdr>
static __dpct_inline__ float vec_dot_q4_1_q8_1_impl(const int *v, const int *u,
                                                    const sycl::half2 &dm4,
                                                    const sycl::half2 &ds8) {

    int sumi = 0;

#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;

        // SIMD dot product of quantized values
        sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi);
        sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi);
    }

#ifdef GGML_SYCL_F16
    const sycl::float2 tmp =
        (dm4 * ds8).convert<float, sycl::rounding_mode::automatic>();
    const float d4d8 = tmp.x();
    const float m4s8 = tmp.y();
#else
    const sycl::float2 dm4f =
        dm4.convert<float, sycl::rounding_mode::automatic>();
    const sycl::float2 ds8f =
        ds8.convert<float, sycl::rounding_mode::automatic>();
    const float d4d8 = dm4f.x() * ds8f.x();
    const float m4s8 = dm4f.y() * ds8f.y();
#endif // GGML_SYCL_F16

    // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
    return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
}
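
// q4_1 dequantizes as x = d4 * q + m4, so the cross term is
// m4 * d8 * sum(q8) = m4s8. QI8_1 / (vdr * QR4_1) is the number of threads
// contributing to one q8_1 block; dividing by it ensures the constant part is
// counted exactly once across threads.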

#define VDR_Q5_0_Q8_1_MMVQ 2
#define VDR_Q5_0_Q8_1_MMQ 4

template <int vdr>
static __dpct_inline__ float
vec_dot_q5_0_q8_1_impl(const int *vl, const int *vh, const int *u,
                       const float &d5, const sycl::half2 &ds8) {
    int sumi = 0;

#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
        vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
        vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
        vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
        vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
        sumi = dpct::dp4a(vi0, u[2 * i + 0],
                          sumi); // SIMD dot product of quantized values

        int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
        vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
        vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
        vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
        vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
        sumi = dpct::dp4a(vi1, u[2 * i + 1],
                          sumi); // SIMD dot product of quantized values
    }

    const sycl::float2 ds8f =
        ds8.convert<float, sycl::rounding_mode::automatic>();

    // second part effectively subtracts 16 from each quant value
    return d5 * (sumi * ds8f.x() - (16 * vdr / QI5_0) * ds8f.y());
}
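
// The shift/mask cascade scatters the packed 5th bits into bit 4 of each byte
// lane: qh bit n must land at bit 8*n + 4 of the int, so bit 0 needs << 4,
// bit 1 needs << 11, and for the upper nibbles bit 16 needs >> 12, and so on,
// which is what the "n -> pos" comments track. q5_0 values are offset by 16,
// removed through the ds8f.y() term exactly as in the q4_0 derivation.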

#define VDR_Q5_1_Q8_1_MMVQ 2
#define VDR_Q5_1_Q8_1_MMQ 4

template <int vdr>
static __dpct_inline__ float
vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u,
                       const sycl::half2 &dm5, const sycl::half2 &ds8) {

    int sumi = 0;

#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
        vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4
        vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
        vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
        vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
        sumi = dpct::dp4a(vi0, u[2 * i + 0],
                          sumi); // SIMD dot product of quantized values

        int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
        vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
        vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
        vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
        vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
        sumi = dpct::dp4a(vi1, u[2 * i + 1],
                          sumi); // SIMD dot product of quantized values
    }

#ifdef GGML_SYCL_F16
    const sycl::float2 tmp =
        (dm5 * ds8).convert<float, sycl::rounding_mode::automatic>();
    const float d5d8 = tmp.x();
    const float m5s8 = tmp.y();
#else
    const sycl::float2 dm5f =
        dm5.convert<float, sycl::rounding_mode::automatic>();
    const sycl::float2 ds8f =
        ds8.convert<float, sycl::rounding_mode::automatic>();
    const float d5d8 = dm5f.x() * ds8f.x();
    const float m5s8 = dm5f.y() * ds8f.y();
#endif // GGML_SYCL_F16

    // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
    return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
}

#define VDR_Q8_0_Q8_1_MMVQ 2
#define VDR_Q8_0_Q8_1_MMQ 8

template <int vdr>
static __dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int *v, const int *u,
                                                    const float &d8_0,
                                                    const float &d8_1) {

    int sumi = 0;

#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        // SIMD dot product of quantized values
        sumi = dpct::dp4a(v[i], u[i], sumi);
    }

    return d8_0*d8_1 * sumi;
}

template <int vdr>
static __dpct_inline__ float vec_dot_q8_1_q8_1_impl(const int *v, const int *u,
                                                    const sycl::half2 &dm8,
                                                    const sycl::half2 &ds8) {

    int sumi = 0;

#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        // SIMD dot product of quantized values
        sumi = dpct::dp4a(v[i], u[i], sumi);
    }

#ifdef GGML_SYCL_F16
    const sycl::float2 tmp =
        (dm8 * ds8).convert<float, sycl::rounding_mode::automatic>();
    const float d8d8 = tmp.x();
    const float m8s8 = tmp.y();
#else
    const sycl::float2 dm8f =
        dm8.convert<float, sycl::rounding_mode::automatic>();
    const sycl::float2 ds8f =
        ds8.convert<float, sycl::rounding_mode::automatic>();
    const float d8d8 = dm8f.x() * ds8f.x();
    const float m8s8 = dm8f.y() * ds8f.y();
#endif // GGML_SYCL_F16

    // scale second part of sum by QI8_1 / vdr to compensate for multiple threads adding it
    return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
}

static __dpct_inline__ float
vec_dot_q4_0_q8_1(const void *__restrict__ vbq,
                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {

    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;

    int v[VDR_Q4_0_Q8_1_MMVQ];
    int u[2 * VDR_Q4_0_Q8_1_MMVQ];

#pragma unroll
    for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
        v[i] = get_int_from_uint8(bq4_0->qs, iqs + i);
        u[2 * i + 0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
        u[2 * i + 1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
    }

    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
}
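
// All of these block-pointer entry points match vec_dot_q_sycl_t, so the MMVQ
// kernels can dispatch on tensor type through a function pointer. A minimal
// sketch (kernel indexing omitted):
//
//   vec_dot_q_sycl_t vdot = vec_dot_q4_0_q8_1;
//   float partial = vdot(&x_blocks[ibx], &y_blocks[iby], iqs);
//
// where ibx/iby pick the q4_0 and q8_1 blocks and iqs selects which ints
// within the block this work-item processes.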

static __dpct_inline__ float
vec_dot_q4_1_q8_1(const void *__restrict__ vbq,
                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {

    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;

    int v[VDR_Q4_1_Q8_1_MMVQ];
    int u[2*VDR_Q4_1_Q8_1_MMVQ];

#pragma unroll
    for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
        v[i] = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
    }

    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
}

#define VDR_MXFP4_Q8_1_MMVQ 2
#define VDR_MXFP4_Q8_1_MMQ 4

static __dpct_inline__ float vec_dot_mxfp4_q8_1(const void * __restrict__ vbq,
                                                const block_q8_1 * __restrict__ bq8_1,
                                                const int & iqs) {
    const block_mxfp4 * bq4 = (const block_mxfp4 *) vbq;

    const int * q8 = (const int *) bq8_1->qs + iqs;

    int sumi = 0;
#pragma unroll
    for (int l = 0; l < VDR_MXFP4_Q8_1_MMVQ; ++l) {
        const int aux_q4 = get_int_b1(bq4->qs, iqs + l);
        const sycl::int2 v = get_int_from_table_16(aux_q4, kvalues_mxfp4);
        sumi = ggml_sycl_dp4a(v.x(), q8[l + 0], sumi);
        sumi = ggml_sycl_dp4a(v.y(), q8[l + 4], sumi);
    }

    const float d = ggml_sycl_e8m0_to_fp32(bq4->e) * 0.5f * (bq8_1->ds)[0];
    return d * sumi;
}
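
// mxfp4 stores 4-bit FP4 codes plus one shared E8M0 (power-of-two) exponent
// per block. The kvalues_mxfp4 table holds twice the FP4 values so it stays
// integral for dp4a, which is why the block scale is multiplied by 0.5f here.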

static __dpct_inline__ float
vec_dot_q5_0_q8_1(const void *__restrict__ vbq,
                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {

    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;

    int vl[VDR_Q5_0_Q8_1_MMVQ];
    int vh[VDR_Q5_0_Q8_1_MMVQ];
    int u[2*VDR_Q5_0_Q8_1_MMVQ];

#pragma unroll
    for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
        vl[i] = get_int_from_uint8(bq5_0->qs, iqs + i);
        vh[i] = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
    }

    return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
}

static __dpct_inline__ float
vec_dot_q5_1_q8_1(const void *__restrict__ vbq,
                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {

    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;

    int vl[VDR_Q5_1_Q8_1_MMVQ];
    int vh[VDR_Q5_1_Q8_1_MMVQ];
    int u[2*VDR_Q5_1_Q8_1_MMVQ];

#pragma unroll
    for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
        vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
        vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
    }

    return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
}

static __dpct_inline__ float
vec_dot_q8_0_q8_1(const void *__restrict__ vbq,
                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {

    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;

    int v[VDR_Q8_0_Q8_1_MMVQ];
    int u[VDR_Q8_0_Q8_1_MMVQ];

#pragma unroll
    for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
        v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
        u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
    }

    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d,
                                                      bq8_1->ds[0]);
}

static __dpct_inline__ float
vec_dot_q2_K_q8_1(const void *__restrict__ vbq,
                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {

    const block_q2_K * bq2_K = (const block_q2_K *) vbq;

    const int bq8_offset = QR2_K * (iqs / QI8_1);
    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);

    const uint8_t * scales = bq2_K->scales + scale_offset;

    const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
    int u[QR2_K];
    float d8[QR2_K];

#pragma unroll
    for (int i = 0; i < QR2_K; ++i) {
        u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
        d8[i] = bq8_1[bq8_offset + i].ds[0];
    }

    return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
}

static __dpct_inline__ float
vec_dot_q3_K_q8_1(const void *__restrict__ vbq,
                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {

    const block_q3_K * bq3_K = (const block_q3_K *) vbq;

    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);

    const float d = bq3_K->d;

    const int vl = get_int_from_uint8(bq3_K->qs, iqs);

    // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
    const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;

    int u[QR3_K];
    float d8[QR3_K];

#pragma unroll
    for (int i = 0; i < QR3_K; ++i) {
        u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
        d8[i] = bq8_1[bq8_offset + i].ds[0];
    }

    return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
}

static __dpct_inline__ float vec_dot_q4_K_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1,
                                               const int & iqs) {
#ifndef GGML_QKK_64

    const block_q4_K * bq4_K = (const block_q4_K *) vbq;

    const int bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2));
    const int * q4 = (const int *) (bq4_K->qs + 16 * bq8_offset + 4 * ((iqs / 2) % 4));
    const uint16_t * scales = (const uint16_t *) bq4_K->scales;

    return vec_dot_q4_K_q8_1_common(q4, scales, bq4_K->dm, bq8_1, iqs);

#else

#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
    const block_q4_K * bq4_K = (const block_q4_K *) vbq;

    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

    uint16_t aux16[2];
    const uint8_t * s = (const uint8_t *)aux16;

    const uint16_t * a = (const uint16_t *)bq4_K->scales;
    aux16[0] = a[0] & 0x0f0f;
    aux16[1] = (a[0] >> 4) & 0x0f0f;

    const float dall = bq4_K->dm[0];
    const float dmin = bq4_K->dm[1];

    const float d8_1 = bq8_1[0].ds[0];
    const float d8_2 = bq8_1[1].ds[0];

    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);

    const int * q4 = (const int *)bq4_K->qs + (iqs/2);
    const int v1 = q4[0];
    const int v2 = q4[4];

    const int dot1 = dpct::dp4a(ui2, v2 & 0x0f0f0f0f, dpct::dp4a(ui1, v1 & 0x0f0f0f0f, 0));
    const int dot2 = dpct::dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, dpct::dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
    const int dot3 = dpct::dp4a(0x01010101, ui2, dpct::dp4a(0x01010101, ui1, 0));
    const int dot4 = dpct::dp4a(0x01010101, ui4, dpct::dp4a(0x01010101, ui3, 0));

    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);

    return dall * sumf_d - dmin * sumf_m;

#else
    bad_arch();
#endif // __SYCL_ARCH__ >= VER_4VEC

#endif
}

static __dpct_inline__ float
vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {

#ifndef GGML_QKK_64
    const block_q5_K * bq5_K = (const block_q5_K *) vbq;

    int vl[2];
    int vh[2];
    int u[2*QR5_K];
    float d8[QR5_K];

    const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
    const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));

    vl[0] = ql[0];
    vl[1] = ql[4];

    vh[0] = qh[0] >> bq8_offset;
    vh[1] = qh[4] >> bq8_offset;

    const uint16_t * scales = (const uint16_t *)bq5_K->scales;
    uint16_t aux[2];
    const int j = bq8_offset/2;
    if (j < 2) {
        aux[0] = scales[j+0] & 0x3f3f;
        aux[1] = scales[j+2] & 0x3f3f;
    } else {
        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
    }
    const uint8_t * sc = (const uint8_t *)aux;
    const uint8_t * m = sc + 2;

#pragma unroll
    for (int i = 0; i < QR5_K; ++i) {
        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
        d8[i] = bq8i->ds[0];

        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
        u[2*i+0] = q8[0];
        u[2*i+1] = q8[4];
    }

    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);

#else

#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
    const block_q5_K * bq5_K = (const block_q5_K *) vbq;

    const int8_t * s = bq5_K->scales;

    const float d = bq5_K->d;

    const float d8_1 = bq8_1[0].ds[0];
    const float d8_2 = bq8_1[1].ds[0];

    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);

    const int * ql = (const int *)bq5_K->qs + (iqs/2);
    const int vl1 = ql[0];
    const int vl2 = ql[4];

    const int step = 4 * (iqs/2); // 0, 4, 8, 12
    const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
    const int in = step%8; // 0, 4, 0, 4
    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;

    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);

    const float sumf_d = d8_1 * (dpct::dp4a(ui1, v1, 0) * s[0] + dpct::dp4a(ui2, v2, 0) * s[1])
                       + d8_2 * (dpct::dp4a(ui3, v3, 0) * s[2] + dpct::dp4a(ui4, v4, 0) * s[3]);

    return d * sumf_d;

#else
    bad_arch();
#endif // __SYCL_ARCH__ >= VER_4VEC

#endif
}

static __dpct_inline__ float
vec_dot_q6_K_q8_1(const void *__restrict__ vbq,
                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {

    const block_q6_K * bq6_K = (const block_q6_K *) vbq;

    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));

    const int vl = get_int_from_uint8(bq6_K->ql, iqs);
    const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;

    const int8_t * scales = bq6_K->scales + scale_offset;

    int u[QR6_K];
    float d8[QR6_K];

#pragma unroll
    for (int i = 0; i < QR6_K; ++i) {
        u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
        d8[i] = bq8_1[bq8_offset + 2 * i].ds[0];
    }

    return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
}

static __dpct_inline__ float
vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
                     const block_q8_1 *__restrict__ bq8_1, const int &iqs,
                     const uint64_t *iq2xxs_grid, const uint8_t *ksigns_iq2xs,
                     const uint8_t *kmask_iq2xs) {
#if QK_K == 256
    const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;

    const int ib32 = iqs;
    const uint16_t * q2 = bq2->qs + 4*ib32;
    const uint8_t * aux8 = (const uint8_t *)q2;
    const int8_t * q8 = bq8_1[ib32].qs;
    uint32_t aux32 = q2[2] | (q2[3] << 16);
    int sumi = 0;
    for (int l = 0; l < 4; ++l) {
        const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
        const uint8_t signs = ksigns_iq2xs[aux32 & 127];
        for (int j = 0; j < 8; ++j) {
            sumi += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
        }
        q8 += 8;
        aux32 >>= 7;
    }
    const float d = (float)bq2->d * (0.5f + aux32) * bq8_1[ib32].ds[0] * 0.25f;
    return d * sumi;
#else
    assert(false);
    return 0.f;
#endif
}
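
// iq2_xxs packs, per 32 values, four 8-bit codebook indices (aux8) and a
// 32-bit field aux32 whose low 28 bits hold four 7-bit sign-pattern indices;
// after the four shifts by 7 only the top 4 bits remain, which encode the
// sub-block scale and explain the (0.5f + aux32) factor in d.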

static __dpct_inline__ float
vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
                    const block_q8_1 *__restrict__ bq8_1, const int &iqs,
                    const uint64_t *iq2xs_grid, const uint64_t *ksigns64) {
#if DPCT_COMPATIBILITY_TEMP >= \
    MIN_CC_DP4A // lowest compute capability for integer intrinsics
#if QK_K == 256
    const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;

    const int ib32 = iqs;
    const uint16_t * q2 = bq2->qs + 4*ib32;
    const int8_t * q8 = bq8_1[ib32].qs;
    const uint8_t ls1 = bq2->scales[ib32] & 0xf;
    const uint8_t ls2 = bq2->scales[ib32] >> 4;
    int sumi1 = 0;
    for (int l = 0; l < 2; ++l) {
        const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
        const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
        const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
            grid[0] ^ signs[0], signs[0], std::minus<>());
        const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
            grid[1] ^ signs[1], signs[1], std::minus<>());
        sumi1 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi1);
        sumi1 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi1);
        q8 += 8;
    }
    int sumi2 = 0;
    for (int l = 2; l < 4; ++l) {
        const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
        const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
        const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
            grid[0] ^ signs[0], signs[0], std::minus<>());
        const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
            grid[1] ^ signs[1], signs[1], std::minus<>());
        sumi2 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi2);
        sumi2 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi2);
        q8 += 8;
    }
    const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
#else
    assert(false);
    return 0.f;
#endif
#else
    assert(false);
    return 0.f;
#endif
}

static __dpct_inline__ float
vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
                   const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
#if QK_K == 256
    const block_iq2_s * bq2 = (const block_iq2_s *) vbq;

    const int ib32 = iqs;
    const int8_t * q8 = bq8_1[ib32].qs;
    const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32;
    const uint8_t ls1 = bq2->scales[ib32] & 0xf;
    const uint8_t ls2 = bq2->scales[ib32] >> 4;
    int sumi1 = 0;
    for (int l = 0; l < 2; ++l) {
        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
        const uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
            ((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201,
            std::equal_to<>());
        const uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
            ((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201,
            std::equal_to<>());
        const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
            grid[0] ^ signs0, signs0, std::minus<>());
        const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
            grid[1] ^ signs1, signs1, std::minus<>());
        sumi1 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi1);
        sumi1 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi1);
        q8 += 8;
    }
    int sumi2 = 0;
    for (int l = 2; l < 4; ++l) {
        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
        const uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
            ((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201,
            std::equal_to<>());
        const uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
            ((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201,
            std::equal_to<>());
        const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
            grid[0] ^ signs0, signs0, std::minus<>());
        const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
            grid[1] ^ signs1, signs1, std::minus<>());
        sumi2 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi2);
        sumi2 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi2);
        q8 += 8;
    }
    const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
#else
    assert(false);
    return 0.f;
#endif
}

static __dpct_inline__ float
vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
                     const block_q8_1 *__restrict__ bq8_1, const int &iqs,
                     const uint32_t *iq3xxs_grid, const uint64_t *ksigns64) {
#if DPCT_COMPATIBILITY_TEMP >= \
    MIN_CC_DP4A // lowest compute capability for integer intrinsics
#if QK_K == 256
    const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;

    const int ib32 = iqs;
    const uint8_t * q3 = bq2->qs + 8*ib32;
    const uint16_t * gas = (const uint16_t *)(bq2->qs + QK_K/4) + 2*ib32;
    const int8_t * q8 = bq8_1[ib32].qs;
    uint32_t aux32 = gas[0] | (gas[1] << 16);
    int sumi = 0;
    for (int l = 0; l < 4; ++l) {
        const uint32_t * grid1 = iq3xxs_grid + q3[2*l+0];
        const uint32_t * grid2 = iq3xxs_grid + q3[2*l+1];
        const uint32_t * signs = (const uint32_t *)(ksigns64 + (aux32 & 127));
        const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
            grid1[0] ^ signs[0], signs[0], std::minus<>());
        const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
            grid2[0] ^ signs[1], signs[1], std::minus<>());
        sumi = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi);
        sumi = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi);
        q8 += 8;
        aux32 >>= 7;
    }
    const float d = (float)bq2->d * (0.5f + aux32) * bq8_1[ib32].ds[0] * 0.5f;
    return d * sumi;
#else
    assert(false);
    return 0.f;
#endif
#else
    assert(false);
    return 0.f;
#endif
}

static __dpct_inline__ float
vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
                   const block_q8_1 *__restrict__ bq8_1, const int &iqs,
                   const uint32_t *iq3s_grid) {
#if QK_K == 256
    const block_iq3_s * bq2 = (const block_iq3_s *) vbq;

    const int ib32 = iqs;
    const uint8_t * qs = bq2->qs + 8*ib32;
    const int8_t * q8 = bq8_1[ib32].qs;
    int sumi = 0;
    for (int l = 0; l < 4; ++l) {
        const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
        const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
        uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
            ((bq2->signs[4 * ib32 + l] & 0xf) * 0x01010101) & 0x08040201,
            0x08040201, std::equal_to<>());
        uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
            ((bq2->signs[4 * ib32 + l] >> 4) * 0x01010101) & 0x08040201,
            0x08040201, std::equal_to<>());
        const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
            grid1[0] ^ signs0, signs0, std::minus<>());
        const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
            grid2[0] ^ signs1, signs1, std::minus<>());
        sumi = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi);
        sumi = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi);
        q8 += 8;
    }
    const float d =
        (float)bq2->d *
        (1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) *
        bq8_1[ib32].ds[0];
    return d * sumi;
#else
    assert(false);
    return 0.f;
#endif
}

static __dpct_inline__ float
vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
                   const block_q8_1 *__restrict__ bq8_1, const int &iqs,
                   const uint32_t *iq1s_grid_gpu) {
#if QK_K == 256
    const block_iq1_s * bq1 = (const block_iq1_s *) vbq;

    const int ib32 = iqs;
    int sumi = 0;
    const int * q8 = (const int *)bq8_1[ib32].qs;
    for (int l = 0; l < 4; ++l) {
        const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
        int grid0 = grid[0] & 0x0f0f0f0f;
        int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
        sumi = dpct::dp4a(q8[2 * l + 1], grid1,
                          dpct::dp4a(q8[2 * l + 0], grid0, sumi));
    }

    const float delta = bq1->qh[ib32] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA;
    const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1);
    const float d = d1q * bq8_1[ib32].ds[0];
    const float m = d1q * bq8_1[ib32].ds[1];
    return d * sumi + m * delta;
#else
    assert(false);
    return 0.f;
#endif
}
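
// The iq1_s offset trick: every weight carries a group-wide offset of
// -1 +/- IQ1S_DELTA selected by the high bit of qh, and since ds[1] of the
// q8_1 block already holds d8 * sum(q8), the offset's contribution
// delta * sum(y) is folded in as m * delta without a second pass over y.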

static __dpct_inline__ float
vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
                   const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
#if QK_K == 256
    const block_iq1_m * bq1 = (const block_iq1_m *) vbq;

    const int ib32 = iqs;
    int sumi[2] = {0, 0};
    float sumf[2] = {0.f, 0.f};

    const int * q8 = (const int *)bq8_1[ib32].qs;
    for (int l = 0; l < 4; ++l) {
        const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 7) << 8)));
        int grid0 = grid[0] & 0x0f0f0f0f;
        int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
        sumi[l / 2] = dpct::dp4a(q8[2 * l + 1], grid1,
                                 dpct::dp4a(q8[2 * l + 0], grid0, sumi[l / 2]));
        const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA;
        const int sumy = dpct::dp4a(q8[2 * l + 1], 0x01010101,
                                    dpct::dp4a(q8[2 * l + 0], 0x01010101, 0));
        sumf[l/2] += delta*sumy;
    }

    iq1m_scale_t scale;
    const uint16_t * sc = (const uint16_t *)bq1->scales;
    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
    const float d = (float)scale.f16 * bq8_1[ib32].ds[0];
    return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
#else
    assert(false);
    return 0.f;
#endif
}

static __dpct_inline__ float
vec_dot_iq4_nl_q8_1(const void *__restrict__ vbq,
                    const block_q8_1 *__restrict__ bq8_1, const int &iqs) {

    const block_iq4_nl * bq = (const block_iq4_nl *) vbq;

    const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs;
    const int32_t * q8 = (const int32_t *)bq8_1->qs + iqs;

    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;

    int v1, v2;
    int sumi1 = 0, sumi2 = 0;
    for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
        const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16);
        get_int_from_table_16(aux, values, v1, v2);
        sumi1 = dpct::dp4a(v1, q8[l + 0], sumi1);
        sumi2 = dpct::dp4a(v2, q8[l + 4], sumi2);
    }

    const float d = (float)bq->d * bq8_1->ds[0];
    return d * (sumi1 + sumi2);
}

static __dpct_inline__ float
vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
                    const block_q8_1 *__restrict__ bq8_1, const int &iqs) {

#if QK_K == 256
    const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;

    // iqs is 0...7
    const int ib32 = iqs;
    const int32_t * q8 = (const int32_t *)bq8_1[ib32].qs;
    const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32;
    const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
    const float d = (float)bq4->d * (ls - 32) * bq8_1[ib32].ds[0];
    int v1, v2;
    int sumi1 = 0, sumi2 = 0;
    for (int j = 0; j < 4; ++j) {
        get_int_from_table_16(q4[j], values, v1, v2);
        sumi1 = dpct::dp4a(v1, q8[j + 0], sumi1);
        sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2);
    }
    return d * (sumi1 + sumi2);
#else
    assert(false);
    return 0.f;
#endif
}

#endif // GGML_SYCL_VECDOTQ_HPP