1//
2// MIT license
3// Copyright (C) 2024 Intel Corporation
4// SPDX-License-Identifier: MIT
5//
6
7//
8// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
9// See https://llvm.org/LICENSE.txt for license information.
10// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
11//
12
13#ifndef GGML_SYCL_DEQUANTIZE_HPP
14#define GGML_SYCL_DEQUANTIZE_HPP
15
16#include "common.hpp"
17
18typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
19typedef void (*dequantize_kernel_t_reorder)(const void *d, const int64_t ib, const void *qs,
20 const int iqs, dfloat2 &v);
21
22static __dpct_inline__ void dequantize_q4_0(const void *vx, const int64_t ib,
23 const int iqs, dfloat2 &v) {
24 const block_q4_0 * x = (const block_q4_0 *) vx;
25
26 const dfloat d = x[ib].d;
27
28 const int vui = x[ib].qs[iqs];
29
30 v.x() = vui & 0xF;
31 v.y() = vui >> 4;
32
33#ifdef GGML_SYCL_F16
34 // v = v - {8.0f, 8.0f};
35 // v = v * {d, d};
36 v.s0() = (v.s0() - 8.0f) * d;
37 v.s1() = (v.s1() - 8.0f) * d;
38
39#else
40 v.x() = (v.x() - 8.0f) * d;
41 v.y() = (v.y() - 8.0f) * d;
42#endif // GGML_SYCL_F16
43}
44
45static __dpct_inline__ void dequantize_q4_0_reorder(const void *d_ptr, const int64_t ib, const void *qs,
46 const int iqs, dfloat2 &v) {
47 // const block_q4_0 * x = (const block_q4_0 *) vx;
48
49 const dfloat d = (const dfloat)*((const sycl::half*)d_ptr+ib);
50
51 const int vui = *((const uint8_t *)qs+iqs);
52
53 v.x() = vui & 0xF;
54 v.y() = vui >> 4;
55
56#ifdef GGML_SYCL_F16
57 // v = v - {8.0f, 8.0f};
58 // v = v * {d, d};
59 v.s0() = (v.s0() - 8.0f) * d;
60 v.s1() = (v.s1() - 8.0f) * d;
61
62#else
63 v.x() = (v.x() - 8.0f) * d;
64 v.y() = (v.y() - 8.0f) * d;
65#endif // GGML_SYCL_F16
66}
67
68static __dpct_inline__ void dequantize_q4_1(const void *vx, const int64_t ib,
69 const int iqs, dfloat2 &v) {
70 const block_q4_1 * x = (const block_q4_1 *) vx;
71
72 const dfloat d = x[ib].dm[0];
73 const dfloat m = x[ib].dm[1];
74
75 const int vui = x[ib].qs[iqs];
76
77 v.x() = vui & 0xF;
78 v.y() = vui >> 4;
79
80#ifdef GGML_SYCL_F16
81 // v = v * {d, d};
82 // v = v + {m, m};
83 v.s0() = sycl::fma(v.s0(), d, m);
84 v.s1() = sycl::fma(v.s1(), d, m);
85
86#else
87 v.x() = sycl::fma(v.x(), d, m);
88 v.y() = sycl::fma(v.y(), d, m);
89#endif // GGML_SYCL_F16
90}
91
92static __dpct_inline__ void dequantize_q5_0(const void *vx, const int64_t ib,
93 const int iqs, dfloat2 &v) {
94 const block_q5_0 * x = (const block_q5_0 *) vx;
95
96 const dfloat d = x[ib].d;
97
98 uint32_t qh;
99 memcpy(&qh, x[ib].qh, sizeof(qh));
100
101 const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
102 const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
103
104 v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0);
105 v.y() = ((x[ib].qs[iqs] >> 4) | xh_1);
106
107#ifdef GGML_SYCL_F16
108 // v = v - {16.0f, 16.0f};
109 // v = v * {d, d};
110 v.s0() = (v.s0() - 16.0f) * d;
111 v.s1() = (v.s1() - 16.0f) * d;
112
113#else
114 v.x() = (v.x() - 16.0f) * d;
115 v.y() = (v.y() - 16.0f) * d;
116#endif // GGML_SYCL_F16
117}
118
119static __dpct_inline__ void dequantize_q5_1(const void *vx, const int64_t ib,
120 const int iqs, dfloat2 &v) {
121 const block_q5_1 * x = (const block_q5_1 *) vx;
122
123 const dfloat d = x[ib].dm[0];
124 const dfloat m = x[ib].dm[1];
125
126 uint32_t qh;
127 memcpy(&qh, x[ib].qh, sizeof(qh));
128
129 const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
130 const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
131
132 v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0);
133 v.y() = ((x[ib].qs[iqs] >> 4) | xh_1);
134
135#ifdef GGML_SYCL_F16
136 // v = v * {d, d};
137 // v = v + {m, m};
138 v.s0() = sycl::fma(v.s0(), d, m);
139 v.s1() = sycl::fma(v.s1(), d, m);
140#else
141 v.x() = sycl::fma(v.x(), d, m);
142 v.y() = sycl::fma(v.y(), d, m);
143#endif // GGML_SYCL_F16
144}
145
146static __dpct_inline__ void dequantize_q8_0(const void *vx, const int64_t ib,
147 const int iqs, dfloat2 &v) {
148 const block_q8_0 * x = (const block_q8_0 *) vx;
149
150 const dfloat d = x[ib].d;
151
152 v.x() = x[ib].qs[iqs + 0];
153 v.y() = x[ib].qs[iqs + 1];
154
155#ifdef GGML_SYCL_F16
156 // v = v * {d, d};
157 v.s0() *= d;
158 v.s1() *= d;
159#else
160 v.x() *= d;
161 v.y() *= d;
162#endif // GGML_SYCL_F16
163}
164
165template<typename dst_t>
166static void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t nb32,
167 const sycl::nd_item<3> &item_ct1) {
168
169 const int64_t i = item_ct1.get_group(2);
170
171 // assume 32 threads
172 const int64_t tid = item_ct1.get_local_id(2);
173 const int64_t il = tid/8;
174 const int64_t ir = tid%8;
175 const int64_t ib = 8*i + ir;
176 if (ib >= nb32) {
177 return;
178 }
179
180 dst_t * y = yy + 256*i + 32*ir + 4*il;
181
182 const block_q4_0 * x = (const block_q4_0 *)vx + ib;
183 const float d = sycl::vec<sycl::half, 1>(x->d)
184 .convert<float, sycl::rounding_mode::automatic>()[0];
185 const float dm = -8*d;
186
187 const uint8_t * q = x->qs + 4*il;
188
189 for (int l = 0; l < 4; ++l) {
190 y[l+ 0] = d * (q[l] & 0xF) + dm;
191 y[l+16] = d * (q[l] >> 4) + dm;
192 }
193}
194
195template<typename dst_t>
196static void dequantize_block_q4_0_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t nb32,
197 const sycl::nd_item<3> &item_ct1) {
198
199 const int64_t i = item_ct1.get_group(2);
200 auto k=nb32;
201 // assume 32 threads
202 const int64_t tid = item_ct1.get_local_id(2);
203 const int lane_ib = i * WARP_SIZE + tid;
204
205 if (lane_ib >= k / QK4_0) {
206 return;
207 }
208
209 dst_t * y_ptr = yy + lane_ib * QK4_0;
210
211 auto qs = (const uint8_t*)vx + lane_ib * QK4_0 / 2;
212 auto s_ptr = (const sycl::half*)((const uint8_t*)vx + k / 2) + lane_ib;
213
214 const float d = float(*s_ptr);
215
216#pragma unroll
217 for (int l = 0; l < QK4_0 / 2; ++l) {
218 int vq = qs[l];
219 y_ptr[l + 0] = d * ((vq & 0xF) - 8);
220 y_ptr[l + 16] = d * ((vq >> 4) - 8);
221 }
222
223}
224
225template<typename dst_t>
226static void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t nb32,
227 const sycl::nd_item<3> &item_ct1) {
228
229 const int64_t i = item_ct1.get_group(2);
230
231 // assume 32 threads
232 const int64_t tid = item_ct1.get_local_id(2);
233 const int64_t il = tid/8;
234 const int64_t ir = tid%8;
235 const int64_t ib = 8*i + ir;
236 if (ib >= nb32) {
237 return;
238 }
239
240 dst_t * y = yy + 256*i + 32*ir + 4*il;
241
242 const block_q4_1 * x = (const block_q4_1 *)vx + ib;
243 const sycl::float2 d =
244 x->dm.convert<float, sycl::rounding_mode::automatic>();
245
246 const uint8_t * q = x->qs + 4*il;
247
248 for (int l = 0; l < 4; ++l) {
249 y[l + 0] = d.x() * (q[l] & 0xF) + d.y();
250 y[l + 16] = d.x() * (q[l] >> 4) + d.y();
251 }
252}
253
254
255//================================== k-quants
256
// Kernel: dequantize one q2_K super-block per work-group.
// q2_K packs 2-bit quants with per-sub-block 4-bit scale (low nibble) and
// 4-bit min (high nibble), plus a per-super-block (dall, dmin) pair in dm.
template<typename dst_t>
static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
                                  const sycl::nd_item<3> &item_ct1) {

    const int64_t i = item_ct1.get_group(2);            // super-block index
    const block_q2_K * x = (const block_q2_K *) vx;

    const int64_t tid = item_ct1.get_local_id(2);
#if QK_K == 256
    const int64_t n = tid/32;            // half of the super-block (0 or 1)
    const int64_t l = tid - 32*n;        // position within that half (0..31)
    const int64_t is = 8*n + l/16;       // base scale index

    // One byte of qs carries four 2-bit quants, one per 32-value stripe.
    const uint8_t q = x[i].qs[32*n + l];
    dst_t * y = yy + i*QK_K + 128*n;

    float dall = x[i].dm[0];             // multiplier for the 4-bit scales
    float dmin = x[i].dm[1];             // multiplier for the 4-bit mins
    y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
    y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
    y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
    y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
#else
    // QK_K == 64 variant: 32 threads, two outputs per thread.
    const int64_t is = tid/16; // 0 or 1
    const int64_t il = tid%16; // 0...15
    const uint8_t q = x[i].qs[il] >> (2*is);
    dst_t * y = yy + i*QK_K + 16*is + il;

    float dall = x[i].dm[0];
    float dmin = x[i].dm[1];
    y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
    y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
#endif

}
292
// Kernel: dequantize one q3_K super-block per work-group.
// q3_K stores 2 low bits per quant in qs, the 3rd (sign-extension) bit in
// hmask, and 6-bit scales packed across the 12-byte scales array.
template<typename dst_t>
static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
                                  const sycl::nd_item<3> &item_ct1) {

    const int64_t i = item_ct1.get_group(2);            // super-block index
    const block_q3_K * x = (const block_q3_K *) vx;

#if QK_K == 256
    // Decompose the thread id into (half n, scale pair j, half-of-pair is0,
    // starting lane l0) — each thread produces 4 consecutive outputs.
    const int64_t r = item_ct1.get_local_id(2) / 4;
    const int64_t tid = r/2;
    const int64_t is0 = r%2;
    const int64_t l0 = 16 * is0 + 4 * (item_ct1.get_local_id(2) % 4);
    const int64_t n = tid / 4;
    const int64_t j = tid - 4*n;

    uint8_t m = 1 << (4*n + j);          // hmask bit selecting the high bit
    int64_t is = 8*n + 2*j + is0;        // 6-bit scale index (0..15)
    int shift = 2*j;                     // shift selecting the 2-bit quant

    // Reassemble the 6-bit scale `is` from the packed 12-byte scales array:
    // low 4 bits and high 2 bits live in different bytes depending on range.
    int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
                is < 8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
                is < 12 ? (x[i].scales[is-8] >> 4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
                (x[i].scales[is-8] >> 4) | (((x[i].scales[is-4] >> 6) & 3) << 4);
    float d_all = x[i].d;
    float dl = d_all * (us - 32);        // scales are stored with a +32 offset

    dst_t * y = yy + i*QK_K + 128*n + 32*j;
    const uint8_t * q = x[i].qs + 32*n;
    const uint8_t * hm = x[i].hmask;

    // A clear hmask bit subtracts 4 (i.e. supplies the missing high bit).
    for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
#else
    // QK_K == 64 variant: 32 threads, two outputs per thread.
    const int64_t tid = item_ct1.get_local_id(2);
    const int64_t is = tid/16; // 0 or 1
    const int64_t il = tid%16; // 0...15
    const int64_t im = il/8; // 0...1
    const int64_t in = il%8; // 0...7

    dst_t * y = yy + i*QK_K + 16*is + il;

    const uint8_t q = x[i].qs[il] >> (2*is);
    const uint8_t h = x[i].hmask[in] >> (2*is + im);
    const float d = (float)x[i].d;

    // Scales are 4-bit with a -8 offset; low/high nibble selected by `is`.
    if (is == 0) {
        y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
        y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
    } else {
        y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
        y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
    }
#endif

}
347
#if QK_K == 256
// Unpack the 6-bit scale (d) and 6-bit min (m) of sub-block j from the
// packed 12-byte k-quant scales array q.
static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
    if (j < 4) {
        // first four sub-blocks: low 6 bits of bytes j and j+4
        d = q[j]     & 63;
        m = q[j + 4] & 63;
    } else {
        // last four: low nibbles of byte j+4 plus top 2 bits of bytes j-4 / j
        d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4);
        m = (q[j + 4] >>  4) | ((q[j - 0] >> 6) << 4);
    }
}
#endif
359
// Shared q4_K inner loop: one thread dequantizes 4 low-nibble and 4
// high-nibble values using the two (scale, min) pairs of sub-blocks
// 2*il and 2*il+1 taken from the local-memory scales copy.
template <typename dst_t>
inline void dequantize_q4_K_common(dst_t * __restrict__ y, const uint8_t * __restrict__ qs_ptr, const float dall,
                                   const float dmin, uint8_t * __restrict__ scales_local, int il, int ir) {
    const int is = 2 * il;       // first sub-block index for this thread
    constexpr int n = 4;         // values handled per nibble stripe

    uint8_t sc, m;
    get_scale_min_k4(is + 0, scales_local, sc, m);
    const float d1 = dall * sc;  // effective scale, low-nibble stripe
    const float m1 = dmin * m;   // effective min, low-nibble stripe

    get_scale_min_k4(is + 1, scales_local, sc, m);
    const float d2 = dall * sc;  // effective scale, high-nibble stripe
    const float m2 = dmin * m;   // effective min, high-nibble stripe

    // Load 4 packed bytes at once; each byte holds two quants.
    sycl::vec<uint8_t, n> q_vec = vec_aligned_load<uint8_t, n>(qs_ptr + 32 * il + n * ir);
    for (int l = 0; l < n; ++l) {
        y[l + 0] = d1 * (q_vec[l] & 0xF) - m1;
        y[l + 32] = d2 * (q_vec[l] >> 4) - m2;
    }
}
381
// Kernel: dequantize one q4_K super-block per work-group.
// scales_local is a work-group local-memory scratch buffer (>= 12 bytes)
// into which the packed scales are staged before the shared inner loop.
template<typename dst_t>
static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
                                  uint8_t* scales_local, const sycl::nd_item<3> &item_ct1) {
    const block_q4_K * x = (const block_q4_K *) vx;

    const int64_t i = item_ct1.get_group(2);            // super-block index

#if QK_K == 256
    const int64_t tid = item_ct1.get_local_id(2);
    const int64_t il = tid / 8;          // sub-block pair index (0..7)
    const int64_t ir = tid % 8;          // lane within the pair (0..7)

    dst_t * y = yy + i * QK_K + 64 * il + 4 * ir;

    const sycl::half2 dm = x[i].dm;
    const float dall = dm[0];            // multiplier for the scales
    const float dmin = dm[1];            // multiplier for the mins

    // First 12 threads copy the packed scales into local memory.
    if (tid < 12) {
        scales_local[tid] = x[i].scales[tid];
    }

    // All threads must see the staged scales before reading them.
    item_ct1.barrier(sycl::access::fence_space::local_space);
    dequantize_q4_K_common(y, x[i].qs, dall, dmin, scales_local, il, ir);
#else
    // QK_K == 64 variant: 32 threads, two outputs per thread.
    const int64_t tid = item_ct1.get_local_id(2);
    const uint8_t * q = x[i].qs;
    dst_t * y = yy + i*QK_K;
    const float d = (float)x[i].dm[0];
    const float m = (float)x[i].dm[1];
    y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
    y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
#endif
}
416
// Kernel: dequantize q4_K data stored in the "reorder" layout.
// The buffer holds all quant nibbles (nb * QK_K/2 bytes), then all packed
// scales (nb * K_SCALE_SIZE bytes), then all (d, dmin) half2 pairs; nb is
// the total number of super-blocks.
template <typename dst_t>
static void dequantize_block_q4_K_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy, uint8_t * scales_local,
                                          const sycl::nd_item<1> & item_ct1, int64_t nb) {
    const int64_t i = item_ct1.get_group(0);       // block index
    const int64_t tid = item_ct1.get_local_id(0);  // thread index within block
    const int64_t il = tid / 8;                    // sub-block pair index (0..7)
    const int64_t ir = tid % 8;                    // lane within the pair (0..7)

    dst_t * y = yy + i * QK_K + 64 * il + 4 * ir;

    // Offsets of the three sections of the reordered buffer.
    const uint8_t * base = static_cast<const uint8_t *>(vx);
    const size_t qs_offset = i * (QK_K / 2);
    const size_t scales_offset = nb * (QK_K / 2) + i * K_SCALE_SIZE;
    const size_t dm_offset = nb * (QK_K / 2) + nb * K_SCALE_SIZE + i * sizeof(ggml_half2);

    const uint8_t * qs_ptr = base + qs_offset;
    const uint8_t * scales_ptr = base + scales_offset;
    ggml_half2 dm_values = *reinterpret_cast<const ggml_half2 *>(base + dm_offset);

    const float dall = dm_values.x();              // multiplier for the scales
    const float dmin = dm_values.y();              // multiplier for the mins

    // First 12 threads stage the packed scales into local memory.
    if (tid < 12) {
        scales_local[tid] = scales_ptr[tid];
    }

    // All threads must see the staged scales before reading them.
    item_ct1.barrier(sycl::access::fence_space::local_space);
    dequantize_q4_K_common(y, qs_ptr, dall, dmin, scales_local, il, ir);
}
446
// Kernel: dequantize one q5_K super-block per work-group.
// q5_K: 4 low bits per quant in qs, the 5th bit in qh, 6-bit scales/mins
// unpacked via get_scale_min_k4, and a (dall, dmin) pair in dm.
template<typename dst_t>
static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
                                  const sycl::nd_item<3> &item_ct1) {
    const block_q5_K * x = (const block_q5_K *) vx;

    const int64_t i = item_ct1.get_group(2);            // super-block index

#if QK_K == 256
    // assume 64 threads - this is very slightly better than the one below
    const int64_t tid = item_ct1.get_local_id(2);
    const int64_t il = tid/16; // il is in 0...3
    const int64_t ir = tid%16; // ir is in 0...15
    const int64_t is = 2*il; // is is in 0...6

    dst_t * y = yy + i*QK_K + 64*il + 2*ir;

    const float dall = x[i].dm[0];      // multiplier for the scales
    const float dmin = x[i].dm[1];      // multiplier for the mins

    const uint8_t * ql = x[i].qs + 32*il + 2*ir;
    const uint8_t * qh = x[i].qh + 2*ir;

    uint8_t sc, m;
    get_scale_min_k4(is + 0, x[i].scales, sc, m);
    const float d1 = dall * sc; const float m1 = dmin * m;
    get_scale_min_k4(is + 1, x[i].scales, sc, m);
    const float d2 = dall * sc; const float m2 = dmin * m;

    // hm selects this thread's 5th-bit lane in qh; a set bit adds 16.
    uint8_t hm = 1 << (2*il);
    y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
    y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
    hm <<= 1;   // next qh bit belongs to the high-nibble stripe
    y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
    y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
#else
    // QK_K == 64 variant: 32 threads, two outputs per thread.
    const int64_t tid = item_ct1.get_local_id(2);
    const uint8_t q = x[i].qs[tid];
    const int64_t im = tid/8; // 0...3
    const int64_t in = tid%8; // 0...7
    const int64_t is = tid/16; // 0 or 1
    const uint8_t h = x[i].qh[in] >> im;
    const float d = x[i].d;
    dst_t * y = yy + i*QK_K + tid;
    y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
    y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
#endif
}
494
// Kernel: dequantize one q6_K super-block per work-group.
// q6_K: 4 low bits per quant in ql, 2 high bits in qh, signed 8-bit scales,
// and a single half-precision super-block scale d; quants carry a -32 offset.
template<typename dst_t>
static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
                                  const sycl::nd_item<3> &item_ct1) {
    const block_q6_K * x = (const block_q6_K *) vx;

    const int64_t i = item_ct1.get_group(2);            // super-block index
#if QK_K == 256

    // assume 64 threads - this is very slightly better than the one below
    const int64_t tid = item_ct1.get_local_id(2);
    const int64_t ip = tid/32;   // ip is 0 or 1
    const int64_t il = tid - 32*ip; // 0...32
    const int64_t is = 8*ip + il/16;   // scale base index

    dst_t * y = yy + i*QK_K + 128*ip + il;

    const float d = x[i].d;

    const uint8_t * ql = x[i].ql + 64*ip + il;
    const uint8_t qh = x[i].qh[32*ip + il];   // 2 high bits for 4 outputs
    const int8_t * sc = x[i].scales + is;

    // Each output combines a low nibble of ql with 2 bits of qh, minus 32.
    y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
    y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
    y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
    y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
#else

    // assume 32 threads
    const int64_t tid = item_ct1.get_local_id(2);
    const int64_t ip = tid/16;        // 0 or 1
    const int64_t il = tid - 16*ip;   // 0...15

    dst_t * y = yy + i*QK_K + 16*ip + il;

    const float d = x[i].d;

    const uint8_t ql = x[i].ql[16*ip + il];
    const uint8_t qh = x[i].qh[il] >> (2*ip);
    const int8_t * sc = x[i].scales;

    y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
    y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32);
#endif
}
540
// Kernel: dequantize q6_K data stored in the "reorder" layout.
// The buffer holds all ql bytes (n_blocks * QK_K/2), then all qh bytes
// (n_blocks * QK_K/4), then all scales (n_blocks * QK_K/16), then all
// half-precision block scales d. Same math as dequantize_block_q6_K.
template <typename dst_t>
static void dequantize_block_q6_K_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy,
                                          const sycl::nd_item<3> & item_ct1, int64_t n_blocks) {
    const int64_t ib = item_ct1.get_group(2);           // super-block index

    const int64_t tid = item_ct1.get_local_id(2);
    const int64_t ip = tid / 32;       // ip is 0 or 1
    const int64_t il = tid - 32 * ip;  // 0...32
    const int64_t is = 8 * ip + il / 16;   // scale base index

    // Section offsets within the reordered buffer.
    const uint8_t * base_ptr = static_cast<const uint8_t *>(vx);
    const auto ql_offset = ib * (QK_K / 2);
    const auto qh_offset = (QK_K / 2) * n_blocks + (QK_K / 4) * ib;
    const auto base_scales_offset = (QK_K / 2) * n_blocks + (QK_K / 4) * n_blocks + (QK_K / 16) * ib;
    const auto base_d_offset = ((QK_K / 2) + (QK_K / 4) + (QK_K / 16)) * n_blocks;
    const uint8_t * ql_ptr = base_ptr + ql_offset;
    const uint8_t * qh_ptr = base_ptr + qh_offset;
    const uint8_t * scales_ptr = base_ptr + base_scales_offset;
    const ggml_half * d = (const ggml_half *) (base_ptr + base_d_offset) + ib;

    dst_t * y = yy + ib * QK_K + 128 * ip + il;

    const uint8_t * ql = ql_ptr + 64 * ip + il;
    const uint8_t qh = *(qh_ptr + 32 * ip + il);   // 2 high bits for 4 outputs
    const int8_t * sc = reinterpret_cast<const int8_t *>(scales_ptr + is);

    // Each output combines a low nibble of ql with 2 bits of qh, minus 32.
    y[0] = *d * sc[0] * ((int8_t) ((ql[0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
    y[32] = *d * sc[2] * ((int8_t) ((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
    y[64] = *d * sc[4] * ((int8_t) ((ql[0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
    y[96] = *d * sc[6] * ((int8_t) ((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
}
572
// Kernel: dequantize one iq2_xxs super-block per work-group.
// Values are reconstructed from the iq2xxs_grid codebook; per-value signs
// come from the ksigns/kmask tables; the scale is derived from the top 4
// bits of a 32-bit aux word.
template<typename dst_t>
static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy,
                                     const sycl::nd_item<3> &item_ct1,
                                     const uint64_t *iq2xxs_grid_ptr,
                                     const uint8_t *ksigns_iq2xs_ptr,
                                     const uint8_t *kmask_iq2xs_ptr) {

    const int64_t i = item_ct1.get_group(2);            // super-block index
    const block_iq2_xxs * x = (const block_iq2_xxs *) vx;

    const int64_t tid = item_ct1.get_local_id(2);
#if QK_K == 256
    const int64_t il = tid/8; // 0...3
    const int64_t ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
    const uint16_t * q2 = x[i].qs + 4*ib;
    const uint8_t * aux8 = (const uint8_t *)q2;
    // Codebook entry: 8 bytes selected by the quant byte aux8[il].
    const uint8_t * grid = (const uint8_t *)(iq2xxs_grid_ptr + aux8[il]);
    // aux32 packs the sign indices (7 bits per group) and the 4-bit scale.
    const uint32_t aux32 = q2[2] | (q2[3] << 16);
    const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
    const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127];
    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f);
#else
    // Only the QK_K == 256 layout is implemented.
    assert(false);
#endif

}
600
// Kernel: dequantize one iq2_xs super-block per work-group.
// Each 16-bit quant word holds a 9-bit iq2xs_grid codebook index and a
// 7-bit sign-pattern index; scales are 4-bit per 16-value group.
template<typename dst_t>
static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy,
                                    const sycl::nd_item<3> &item_ct1,
                                    const uint64_t *iq2xs_grid,
                                    const uint8_t *ksigns_iq2xs,
                                    const uint8_t *kmask_iq2xs) {

    const int64_t i = item_ct1.get_group(2);            // super-block index
    const block_iq2_xs * x = (const block_iq2_xs *) vx;

    const int64_t tid = item_ct1.get_local_id(2);
#if QK_K == 256
    const int64_t il = tid/8; // 0...3
    const int64_t ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
    const uint16_t * q2 = x[i].qs + 4*ib;
    // Low 9 bits: codebook index; high 7 bits: sign-pattern index.
    const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
    const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
    const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
    for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
#else
    // Only the QK_K == 256 layout is implemented.
    assert(false);
#endif

}
626
// Kernel: dequantize one iq2_s super-block per work-group.
// The iq2s_grid codebook index is 10 bits: 8 from qs plus 2 from qh; the
// explicit sign bytes are stored in the second half of qs.
template <typename dst_t>
__dpct_inline__ static void
dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
                       const sycl::nd_item<3> &item_ct1) {

    const int64_t i = item_ct1.get_group(2);            // super-block index
    const block_iq2_s * x = (const block_iq2_s *) vx;

    const int64_t tid = item_ct1.get_local_id(2);
#if QK_K == 256
    const int64_t il = tid/8; // 0...3
    const int64_t ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
    // 10-bit codebook index: qs byte plus 2 bits taken from qh[ib].
    const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
    const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
    const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];   // explicit per-value signs
#pragma unroll
    for (int j = 0; j < 8; ++j)
        y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
#else
    // Only the QK_K == 256 layout is implemented.
    assert(false);

#endif

}
652
// Kernel: dequantize one iq3_xxs super-block per work-group.
// Two 4-byte iq3xxs_grid codebook entries per thread; sign patterns and the
// 4-bit scale are packed into a 32-bit aux word stored after the quants.
template<typename dst_t>
static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy,
                                     const sycl::nd_item<3> &item_ct1,
                                     const uint32_t *iq3xxs_grid,
                                     const uint8_t *ksigns_iq2xs,
                                     const uint8_t *kmask_iq2xs) {

    const int64_t i = item_ct1.get_group(2);            // super-block index
    const block_iq3_xxs * x = (const block_iq3_xxs *) vx;

    const int64_t tid = item_ct1.get_local_id(2);
#if QK_K == 256
    const int64_t il = tid/8; // 0...3
    const int64_t ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
    const uint8_t * q3 = x[i].qs + 8*ib;
    // aux words live after the first QK_K/4 quant bytes.
    const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
    const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]);
    const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]);
    const uint32_t aux32 = gas[0] | (gas[1] << 16);
    const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.5f;
    const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
    for (int j = 0; j < 4; ++j) {
        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
    }
#else
    // Only the QK_K == 256 layout is implemented.
    assert(false);
#endif

}
684
// Kernel: dequantize one iq3_s super-block per work-group.
// 9-bit iq3s_grid codebook indices (8 bits from qs plus 1 from qh); explicit
// sign bytes; 4-bit scales mapped through 1 + 2*s.
template <typename dst_t>
__dpct_inline__ static void
dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
                       const sycl::nd_item<3> &item_ct1,
                       const uint8_t *kmask_iq2xs, const uint32_t *iq3s_grid) {

    const int64_t i = item_ct1.get_group(2);            // super-block index
    const block_iq3_s * x = (const block_iq3_s *) vx;

    const int64_t tid = item_ct1.get_local_id(2);
#if QK_K == 256
    const int64_t il = tid/8; // 0...3
    const int64_t ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
    const uint8_t * qs = x[i].qs + 8*ib;
    // 9th codebook-index bit comes from qh[ib].
    const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
    const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)));
    const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
    const uint8_t signs = x[i].signs[4*ib + il];
#pragma unroll
    for (int j = 0; j < 4; ++j) {
        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
    }
#else
    // Only the QK_K == 256 layout is implemented.
    assert(false);
#endif

}
714
// Kernel: dequantize one iq1_s super-block per work-group.
// 1.5-bit codebook quantization: 11-bit iq1s_grid indices (8 bits from qs
// plus 3 from qh); qh also carries a 3-bit scale and a sign bit selecting
// the +/- IQ1S_DELTA shift.
template <typename dst_t>
__dpct_inline__ static void
dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
                       const sycl::nd_item<3> &item_ct1,
                       const uint32_t *iq1s_grid_gpu) {

    const int64_t i = item_ct1.get_group(2);            // super-block index
    const block_iq1_s * x = (const block_iq1_s *) vx;

    const int64_t tid = item_ct1.get_local_id(2);
#if QK_K == 256
    const int64_t il = tid/8; // 0...3
    const int64_t ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
    // Top bit of qh[ib] selects the sign of the delta shift.
    const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
    // Bits 12..14 of qh[ib] are the 3-bit scale (mapped to odd values 1..15).
    const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
    // Expand the packed grid entry into 8 nibbles viewed as int8.
    uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
    grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)];
    grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
    grid32[0] &= 0x0f0f0f0f;
#pragma unroll
    for (int j = 0; j < 8; ++j) {
        y[j] = d * (q[j] + delta);
    }
#else
    // Only the QK_K == 256 layout is implemented.
    assert(false);
#endif

}
744
// Kernel: dequantize one iq1_m super-block per work-group.
// Like iq1_s, but the super-block scale is itself reassembled from 4-bit
// pieces hidden in the top nibbles of the scales words, and the per-group
// delta sign comes from qh nibbles.
template <typename dst_t>
__dpct_inline__ static void
dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
                       const sycl::nd_item<3> &item_ct1,
                       const uint32_t *iq1s_grid_gpu) {

    const int64_t i = item_ct1.get_group(2);            // super-block index
    const block_iq1_m * x = (const block_iq1_m *) vx;

    const int64_t tid = item_ct1.get_local_id(2);
#if QK_K == 256
    const int64_t il = tid/8; // 0...3
    const int64_t ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
    const uint16_t * sc = (const uint16_t *)x[i].scales;
    // Reassemble the fp16 super-block scale from the top nibbles of sc[0..3].
    iq1m_scale_t scale;
    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
    const int ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
    const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1);
    // qh nibble bit 3 selects the sign of the delta shift.
    const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA;
    // Expand the packed grid entry into 8 nibbles viewed as int8.
    uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
    grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)];
    grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f;
    grid32[0] &= 0x0f0f0f0f;
#pragma unroll
    for (int j = 0; j < 8; ++j) {
        y[j] = d * (q[j] + delta);
    }
#else
    // Only the QK_K == 256 layout is implemented.
    assert(false);
#endif

}
778
// Kernel: dequantize iq4_nl blocks.
// Each 4-bit quant indexes the non-linear kvalues_iq4nl lookup table and is
// scaled by the per-block half-precision d. One work-group covers QK_K
// values spread over QK_K/QK4_NL blocks.
template <typename dst_t>
__dpct_inline__ static void
dequantize_block_iq4_nl(const void *__restrict__ vx, dst_t *__restrict__ yy,
                        const sycl::nd_item<3> &item_ct1) {

    const int64_t i = item_ct1.get_group(2);            // group of QK_K values
    const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);

    const int64_t tid = item_ct1.get_local_id(2);
    const int64_t il = tid/8; // 0...3
    const int64_t ib = tid%8; // 0...7   (block within this group)
    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
    const uint8_t * q4 = x[ib].qs + 4*il;
    const float d = (float)x[ib].d;
#pragma unroll
    for (int j = 0; j < 4; ++j) {
        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
        y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
    }

}
800
801
// Kernel: dequantize one iq4_xs super-block per work-group.
// Quants index the kvalues_iq4nl table; each 32-value group has a 6-bit
// scale split between scales_l (low 4 bits) and scales_h (high 2 bits),
// stored with a +32 offset.
template <typename dst_t>
__dpct_inline__ static void
dequantize_block_iq4_xs(const void *__restrict__ vx, dst_t *__restrict__ yy,
                        const sycl::nd_item<3> &item_ct1) {
    const int64_t i = item_ct1.get_group(2);            // super-block index
    const int64_t tid = item_ct1.get_local_id(2);
    const block_iq4_xs * x = (const block_iq4_xs *)vx;

    const int64_t il = tid/8; // 0...3
    const int64_t ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
    const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
    const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
#pragma unroll
    for (int j = 0; j < 4; ++j) {
        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
        y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
    }
}
821
// Kernel: dequantize mxfp4 blocks.
// Each 4-bit quant indexes the kvalues_mxfp4 table; the block scale is an
// E8M0 exponent decoded by ggml_sycl_e8m0_to_fp32, with a 0.5f factor
// folded into the table lookup. One work-group covers QK_K values.
template<typename dst_t>
static void dequantize_block_mxfp4(const void * __restrict__ vx, dst_t * __restrict__ yy,
                                   const sycl::nd_item<3> &item_ct1) {
    // auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>();
    const int64_t i = item_ct1.get_group(2);            // group of QK_K values
    const block_mxfp4 * x = (const block_mxfp4 *) vx + i*(QK_K/QK_MXFP4);

    const int64_t tid = item_ct1.get_local_id(2);
    const int64_t il = tid/8; // 0...3
    const int64_t ib = tid%8; // 0...7   (block within this group)
    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
    const uint8_t * q4 = x[ib].qs + 4*il;
    const float d = ggml_sycl_e8m0_to_fp32(x[ib].e);
    for (int j = 0; j < 4; ++j) {
        y[j+ 0] = d * kvalues_mxfp4[q4[j] & 0xf]*0.5f;
        y[j+16] = d * kvalues_mxfp4[q4[j] >> 4]*0.5f;
    }
}
840
841#endif // GGML_SYCL_DEQUANTIZE_HPP