1#if !defined(DATA_A_F32) && !defined(DATA_A_F16)
  2#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
  3#endif
  4
  5#include "types.glsl"
  6
  7#if defined(DATA_A_F32)
  8vec2 dequantize(uint ib, uint iqs, uint a_offset) {
  9    return vec2(data_a[a_offset + ib], data_a[a_offset + ib + 1]);
 10}
 11#endif
 12
 13#if defined(DATA_A_F16)
 14vec2 dequantize(uint ib, uint iqs, uint a_offset) {
 15    return vec2(data_a[a_offset + ib], data_a[a_offset + ib + 1]);
 16}
 17#endif
 18
 19#if defined(DATA_A_BF16)
 20vec2 dequantize(uint ib, uint iqs, uint a_offset) {
 21    return vec2(bf16_to_fp32(data_a[a_offset + ib]), bf16_to_fp32(data_a[a_offset + ib + 1]));
 22}
 23#endif
 24
 25#if defined(DATA_A_Q4_0)
 26vec2 dequantize(uint ib, uint iqs, uint a_offset) {
 27    const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
 28    return (vec2(vui & 0xF, vui >> 4) - 8.0f);
 29}
 30vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
 31    const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
 32    return (vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, vui >> 12) - 8.0f);
 33}
 34#endif
 35
 36#if defined(DATA_A_Q4_1)
 37vec2 dequantize(uint ib, uint iqs, uint a_offset) {
 38    const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
 39    return vec2(vui & 0xF, vui >> 4);
 40}
 41vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
 42    const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
 43    return vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, vui >> 12);
 44}
 45#endif
 46
 47#if defined(DATA_A_Q5_0)
 48vec2 dequantize(uint ib, uint iqs, uint a_offset) {
 49    const uint uint_qh = uint(data_a[a_offset + ib].qh[1]) << 16 | data_a[a_offset + ib].qh[0];
 50    const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
 51    const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
 52    return (vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) - 16.0f);
 53}
 54vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
 55    const uint uint_qh = uint(data_a_packed16[a_offset + ib].qh[1]) << 16 | data_a_packed16[a_offset + ib].qh[0];
 56    const ivec2 qh0 = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
 57    const ivec2 qh1 = ivec2(((uint_qh >> (iqs + 1)) << 4) & 0x10, (uint_qh >> (iqs + 13)) & 0x10);
 58    const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
 59    return (vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y) - 16.0f);
 60}
 61#endif
 62
 63#if defined(DATA_A_Q5_1)
 64vec2 dequantize(uint ib, uint iqs, uint a_offset) {
 65    const uint uint_qh = data_a[a_offset + ib].qh;
 66    const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
 67    const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
 68    return vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y);
 69}
 70vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
 71    const uint uint_qh = data_a_packed16[a_offset + ib].qh;
 72    const ivec2 qh0 = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
 73    const ivec2 qh1 = ivec2(((uint_qh >> (iqs + 1)) << 4) & 0x10, (uint_qh >> (iqs + 13)) & 0x10);
 74    const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
 75    return vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y);
 76}
 77#endif
 78
 79#if defined(DATA_A_Q8_0)
 80vec2 dequantize(uint ib, uint iqs, uint a_offset) {
 81    return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1]));
 82}
 83vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
 84    const i8vec2 v0 = unpack8(int32_t(data_a_packed16[a_offset + ib].qs[iqs/2])).xy; // vec4 used due to #12147
 85    const i8vec2 v1 = unpack8(int32_t(data_a_packed16[a_offset + ib].qs[iqs/2 + 1])).xy;
 86    return vec4(v0.x, v0.y, v1.x, v1.y);
 87}
 88#endif
 89
 90#if defined(DATA_A_IQ1_S)
 91vec2 dequantize(uint ib, uint iqs, uint a_offset) {
 92    const uint ib32 = iqs / 32;
 93    const uint ib8 = iqs / 8;
 94    const int i8 = int(iqs % 8);
 95    const uint qh = data_a[a_offset + ib].qh[ib32];
 96    const uint qs = data_a[a_offset + ib].qs[ib8];
 97    const float dl = float(2 * bitfieldExtract(qh, 12, 3) + 1);
 98    const float delta = ((qh & 0x8000) != 0) ? -IQ1S_DELTA : IQ1S_DELTA;
 99    const uint idxhi = bitfieldExtract(qh, 3 * int(ib8 & 3), 3);
100    const int16_t grid = int16_t(iq1s_grid[qs | (idxhi << 8)]);
101    // Signed bitfield extract.
102    const ivec2 gvec = ivec2(
103      bitfieldExtract(grid, 2 * (i8), 2),
104      bitfieldExtract(grid, 2 * (i8 + 1), 2)
105    );
106    return dl * (vec2(gvec) + delta);
107}
// IQ1_S, four values at once: same decoding as the two-value variant, but
// extracts four consecutive signed 2-bit fields (iqs .. iqs + 3) from the
// selected iq1s_grid entry.
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
    const uint blk = a_offset + ib;
    const uint grp8 = iqs / 8;
    const int lane = int(iqs % 8);

    const uint qh = data_a[blk].qh[iqs / 32];
    const uint qs = data_a[blk].qs[grp8];

    // odd sub-block scale from qh bits 12..14; sign of delta from qh bit 15
    const float dl = 2 * bitfieldExtract(qh, 12, 3) + 1;
    const float delta = ((qh & 0x8000) == 0) ? IQ1S_DELTA : -IQ1S_DELTA;

    const uint idx_hi = bitfieldExtract(qh, 3 * int(grp8 & 3), 3);
    // grid must stay a signed 16-bit value so bitfieldExtract sign-extends
    const int16_t grid = int16_t(iq1s_grid[qs | (idx_hi << 8)]);
    const int g0 = bitfieldExtract(grid, 2 * lane, 2);
    const int g1 = bitfieldExtract(grid, 2 * (lane + 1), 2);
    const int g2 = bitfieldExtract(grid, 2 * (lane + 2), 2);
    const int g3 = bitfieldExtract(grid, 2 * (lane + 3), 2);

    return dl * (vec4(ivec4(g0, g1, g2, g3)) + delta);
}
126#endif
127
128#if defined(DATA_A_IQ1_M)
// IQ1_M: decodes values iqs and iqs + 1 of block (a_offset + ib).
// The iq1s_grid index is qs (8 bits) plus the low 3 bits of a 4-bit qh
// nibble; bit 3 of that nibble picks the sign of IQ1M_DELTA. The sub-block
// scale is 2 * s + 1 with s a 3-bit field of scales[iqs / 64].
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint blk = a_offset + ib;
    const uint grp8 = iqs / 8;
    const uint grp16 = iqs / 16;
    const int lane = int(iqs % 8);

    const uint sc = data_a[blk].scales[iqs / 64];
    const uint qs = data_a[blk].qs[grp8];
    // one qh entry serves two groups of 8; pick the nibble for this group
    const uint qh = data_a[blk].qh[grp16] >> (4 * (grp8 & 1));

    const float dl = 2 * bitfieldExtract(sc, 3 * int(grp16 & 3), 3) + 1;
    const float delta = ((qh & 8) == 0) ? IQ1M_DELTA : -IQ1M_DELTA;

    // grid must stay a signed 16-bit value so bitfieldExtract sign-extends
    const int16_t grid = int16_t(iq1s_grid[qs | ((qh & 7) << 8)]);
    const int g0 = bitfieldExtract(grid, 2 * lane, 2);
    const int g1 = bitfieldExtract(grid, 2 * (lane + 1), 2);

    return dl * (vec2(ivec2(g0, g1)) + delta);
}
// IQ1_M, four values at once: same decoding as the two-value variant, but
// extracts four consecutive signed 2-bit fields (iqs .. iqs + 3) from the
// selected iq1s_grid entry.
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
    const uint blk = a_offset + ib;
    const uint grp8 = iqs / 8;
    const uint grp16 = iqs / 16;
    const int lane = int(iqs % 8);

    const uint sc = data_a[blk].scales[iqs / 64];
    const uint qs = data_a[blk].qs[grp8];
    // one qh entry serves two groups of 8; pick the nibble for this group
    const uint qh = data_a[blk].qh[grp16] >> (4 * (grp8 & 1));

    const float dl = 2 * bitfieldExtract(sc, 3 * int(grp16 & 3), 3) + 1;
    const float delta = ((qh & 8) == 0) ? IQ1M_DELTA : -IQ1M_DELTA;

    // grid must stay a signed 16-bit value so bitfieldExtract sign-extends
    const int16_t grid = int16_t(iq1s_grid[qs | ((qh & 7) << 8)]);
    const int g0 = bitfieldExtract(grid, 2 * lane, 2);
    const int g1 = bitfieldExtract(grid, 2 * (lane + 1), 2);
    const int g2 = bitfieldExtract(grid, 2 * (lane + 2), 2);
    const int g3 = bitfieldExtract(grid, 2 * (lane + 3), 2);

    return dl * (vec4(ivec4(g0, g1, g2, g3)) + delta);
}
165#endif
166
167#if defined(DATA_A_IQ2_XXS)
// IQ2_XXS: decodes values iqs and iqs + 1 of block (a_offset + ib).
// Magnitudes are bytes of an iq2xxs_grid entry; signs come from a 7-bit
// field that is extended with a parity bit. The result is multiplied by
// 0.25 * (0.5 + s), s being the 4-bit scale in the top bits of the sign word.
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint blk = a_offset + ib;
    const uint grp32 = iqs / 32;
    const uint grp8 = (iqs / 8) % 4;
    const uint lane = iqs % 8;

    const uint grid_idx = data_a[blk].qs[8 * grp32 + grp8];
    // 32-bit word packed as 7+7+7+7+4 bits: four sign tuples and one int4 scale
    const uint aux = pack32(u16vec2(data_a_packed16[blk].qs[4 * grp32 + 2],
                                    data_a_packed16[blk].qs[4 * grp32 + 3]));
    const float db = 0.25 * (0.5 + (aux >> 28));

    const uint sign7 = bitfieldExtract(aux, 7 * int(grp8), 7);
    // append the parity bit, then align the bit for value iqs to bit 0
    const uint sign_bits = (sign7 | (bitCount(sign7) << 7)) >> lane;

    const u8vec4 mag = unpack8(iq2xxs_grid[grid_idx][lane / 4] >> (8 * (iqs % 4)));
    const float flip0 = ((sign_bits & 1) != 0) ? -1.0 : 1.0;
    const float flip1 = ((sign_bits & 2) != 0) ? -1.0 : 1.0;
    return db * vec2(mag.x * flip0, mag.y * flip1);
}
// IQ2_XXS, four values at once: same decoding as the two-value variant,
// returning all four bytes of the (shifted) grid word with their signs.
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
    const uint blk = a_offset + ib;
    const uint grp32 = iqs / 32;
    const uint grp8 = (iqs / 8) % 4;
    const uint lane = iqs % 8;

    const uint grid_idx = data_a[blk].qs[8 * grp32 + grp8];
    // 32-bit word packed as 7+7+7+7+4 bits: four sign tuples and one int4 scale
    const uint aux = pack32(u16vec2(data_a_packed16[blk].qs[4 * grp32 + 2],
                                    data_a_packed16[blk].qs[4 * grp32 + 3]));
    const float db = 0.25 * (0.5 + (aux >> 28));

    const uint sign7 = bitfieldExtract(aux, 7 * int(grp8), 7);
    // append the parity bit, then align the bit for value iqs to bit 0
    const uint sign_bits = (sign7 | (bitCount(sign7) << 7)) >> lane;

    const u8vec4 mag = unpack8(iq2xxs_grid[grid_idx][lane / 4] >> (8 * (iqs % 4)));
    const float flip0 = ((sign_bits & 1) != 0) ? -1.0 : 1.0;
    const float flip1 = ((sign_bits & 2) != 0) ? -1.0 : 1.0;
    const float flip2 = ((sign_bits & 4) != 0) ? -1.0 : 1.0;
    const float flip3 = ((sign_bits & 8) != 0) ? -1.0 : 1.0;
    return db * vec4(mag.x * flip0, mag.y * flip1, mag.z * flip2, mag.w * flip3);
}
212#endif
213
214#if defined(DATA_A_IQ2_XS)
// IQ2_XS: decodes values iqs and iqs + 1 of block (a_offset + ib).
// Each qs entry holds a 9-bit iq2xs_grid index (low bits) and 7 sign bits
// (high bits, extended with a parity bit). The result is multiplied by
// 0.25 * (0.5 + s), s being a 4-bit nibble of scales[iqs / 32].
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint blk = a_offset + ib;
    const uint lane = iqs % 8;

    const uint nibble_shift = 4 * ((iqs / 16) & 1);
    const uint scale = (data_a[blk].scales[iqs / 32] >> nibble_shift) & 0xf;
    const float db = 0.25 * (0.5 + scale);

    const uint qs = data_a[blk].qs[iqs / 8];
    const uint sign7 = qs >> 9;
    // append the parity bit, then align the bit for value iqs to bit 0
    const uint sign_bits = (sign7 | (bitCount(sign7) << 7)) >> lane;

    const u8vec4 mag = unpack8(iq2xs_grid[qs & 511][lane / 4] >> (8 * (iqs % 4)));
    const float flip0 = ((sign_bits & 1) != 0) ? -1.0 : 1.0;
    const float flip1 = ((sign_bits & 2) != 0) ? -1.0 : 1.0;
    return db * vec2(mag.x * flip0, mag.y * flip1);
}
// IQ2_XS, four values at once: same decoding as the two-value variant,
// returning all four bytes of the (shifted) grid word with their signs.
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
    const uint blk = a_offset + ib;
    const uint lane = iqs % 8;

    const uint nibble_shift = 4 * ((iqs / 16) & 1);
    const uint scale = (data_a[blk].scales[iqs / 32] >> nibble_shift) & 0xf;
    const float db = 0.25 * (0.5 + scale);

    const uint qs = data_a[blk].qs[iqs / 8];
    const uint sign7 = qs >> 9;
    // append the parity bit, then align the bit for value iqs to bit 0
    const uint sign_bits = (sign7 | (bitCount(sign7) << 7)) >> lane;

    const u8vec4 mag = unpack8(iq2xs_grid[qs & 511][lane / 4] >> (8 * (iqs % 4)));
    const float flip0 = ((sign_bits & 1) != 0) ? -1.0 : 1.0;
    const float flip1 = ((sign_bits & 2) != 0) ? -1.0 : 1.0;
    const float flip2 = ((sign_bits & 4) != 0) ? -1.0 : 1.0;
    const float flip3 = ((sign_bits & 8) != 0) ? -1.0 : 1.0;
    return db * vec4(mag.x * flip0, mag.y * flip1, mag.z * flip2, mag.w * flip3);
}
251#endif
252
253#if defined(DATA_A_IQ2_S)
// IQ2_S: decodes values iqs and iqs + 1 of block (a_offset + ib).
// The iq2s_grid index is 8 bits from qs plus 2 bits from qh; sign bytes are
// read from the qs array at offset QUANT_K / 8. The result is multiplied by
// 0.25 * (0.5 + s), s being a 4-bit nibble of scales[iqs / 32].
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint blk = a_offset + ib;
    const uint grp32 = iqs / 32;
    const uint grp8 = iqs / 8;
    const uint lane = iqs % 8;
    const uint sub = iqs % 4;

    const uint nibble_shift = 4 * ((iqs / 16) & 1);
    const uint scale = (data_a[blk].scales[grp32] >> nibble_shift) & 0xf;
    const float db = 0.25 * (0.5 + scale);

    const uint qs = data_a[blk].qs[grp8];
    const uint qh = data_a[blk].qh[grp32];
    const uint qh_shift = 2 * (grp8 % 4);
    const uint sign_bits = data_a[blk].qs[QUANT_K / 8 + grp8] >> lane;

    // move this group's two qh bits to positions 8..9 of the grid index
    const uint grid_idx = qs | ((qh << (8 - qh_shift)) & 0x300);
    const u8vec4 mag = unpack8(iq2s_grid[grid_idx][lane / 4]);

    const float flip0 = ((sign_bits & 1) != 0) ? -1.0 : 1.0;
    const float flip1 = ((sign_bits & 2) != 0) ? -1.0 : 1.0;
    return db * vec2(mag[sub] * flip0, mag[sub + 1] * flip1);
}
// IQ2_S, four values at once: same index, sign and scale lookup as the
// two-value variant, returning all four bytes of the selected grid word.
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
    const uint blk = a_offset + ib;
    const uint grp32 = iqs / 32;
    const uint grp8 = iqs / 8;
    const uint lane = iqs % 8;

    const uint nibble_shift = 4 * ((iqs / 16) & 1);
    const uint scale = (data_a[blk].scales[grp32] >> nibble_shift) & 0xf;
    const float db = 0.25 * (0.5 + scale);

    const uint qs = data_a[blk].qs[grp8];
    const uint qh = data_a[blk].qh[grp32];
    const uint qh_shift = 2 * (grp8 % 4);
    const uint sign_bits = data_a[blk].qs[QUANT_K / 8 + grp8] >> lane;

    // move this group's two qh bits to positions 8..9 of the grid index
    const uint grid_idx = qs | ((qh << (8 - qh_shift)) & 0x300);
    const u8vec4 mag = unpack8(iq2s_grid[grid_idx][lane / 4]);

    const float flip0 = ((sign_bits & 1) != 0) ? -1.0 : 1.0;
    const float flip1 = ((sign_bits & 2) != 0) ? -1.0 : 1.0;
    const float flip2 = ((sign_bits & 4) != 0) ? -1.0 : 1.0;
    const float flip3 = ((sign_bits & 8) != 0) ? -1.0 : 1.0;
    return db * vec4(mag.x * flip0, mag.y * flip1, mag.z * flip2, mag.w * flip3);
}
296#endif
297
298#if defined(DATA_A_IQ3_XXS)
// IQ3_XXS: decodes values iqs and iqs + 1 of block (a_offset + ib).
// qs[iqs / 4] indexes iq3xxs_grid; the sign/scale words live in the same qs
// array starting at byte QUANT_K / 4. The result is multiplied by
// 0.5 * (0.5 + s), s being the 4-bit scale in the top bits of the sign word.
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint blk = a_offset + ib;
    const uint grp4 = iqs / 4;
    const uint lane = iqs % 8;
    const uint aux_byte = QUANT_K / 4 + 4 * (iqs / 32);

    const uint grid_idx = data_a[blk].qs[grp4];
    // 32-bit word packed as 7+7+7+7+4 bits: four sign tuples and one int4 scale
    const uint aux = pack32(u16vec2(data_a_packed16[blk].qs[aux_byte / 2],
                                    data_a_packed16[blk].qs[aux_byte / 2 + 1]));
    const float db = 0.5 * (0.5 + (aux >> 28));

    const uint sign7 = bitfieldExtract(aux, 7 * (int(grp4 / 2) % 4), 7);
    // append the parity bit, then align the bit for value iqs to bit 0
    const uint sign_bits = (sign7 | (bitCount(sign7) << 7)) >> lane;

    const u8vec4 mag = unpack8(iq3xxs_grid[grid_idx] >> (8 * (iqs % 4)));
    const float flip0 = ((sign_bits & 1) != 0) ? -1.0 : 1.0;
    const float flip1 = ((sign_bits & 2) != 0) ? -1.0 : 1.0;
    return db * vec2(mag.x * flip0, mag.y * flip1);
}
// IQ3_XXS, four values at once: same sign and scale lookup as the two-value
// variant; the four bytes of the unshifted iq3xxs_grid entry are returned.
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
    const uint blk = a_offset + ib;
    const uint grp4 = iqs / 4;
    const uint lane = iqs % 8;
    const uint aux_byte = QUANT_K / 4 + 4 * (iqs / 32);

    const uint grid_idx = data_a[blk].qs[grp4];
    // 32-bit word packed as 7+7+7+7+4 bits: four sign tuples and one int4 scale
    const uint aux = pack32(u16vec2(data_a_packed16[blk].qs[aux_byte / 2],
                                    data_a_packed16[blk].qs[aux_byte / 2 + 1]));
    const float db = 0.5 * (0.5 + (aux >> 28));

    const uint sign7 = bitfieldExtract(aux, 7 * (int(grp4 / 2) % 4), 7);
    // append the parity bit, then align the bit for value iqs to bit 0
    const uint sign_bits = (sign7 | (bitCount(sign7) << 7)) >> lane;

    const u8vec4 mag = unpack8(iq3xxs_grid[grid_idx]);
    const float flip0 = ((sign_bits & 1) != 0) ? -1.0 : 1.0;
    const float flip1 = ((sign_bits & 2) != 0) ? -1.0 : 1.0;
    const float flip2 = ((sign_bits & 4) != 0) ? -1.0 : 1.0;
    const float flip3 = ((sign_bits & 8) != 0) ? -1.0 : 1.0;
    return db * vec4(mag.x * flip0, mag.y * flip1, mag.z * flip2, mag.w * flip3);
}
344#endif
345
346#if defined(DATA_A_IQ3_S)
// IQ3_S: decodes values iqs and iqs + 1 of block (a_offset + ib).
// The iq3s_grid index is 8 bits from qs plus one bit from qh; signs come
// from the signs array; the sub-block scale is 1 + 2 * s with s a 4-bit
// nibble of scales[iqs / 64].
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint blk = a_offset + ib;
    const uint grp4 = iqs / 4;
    const uint grp32 = iqs / 32;

    const uint qs = data_a[blk].qs[grp4];
    const uint qh = data_a[blk].qh[grp32];
    const uint sign_bits = data_a[blk].signs[iqs / 8] >> (iqs % 8);
    const uint scale = data_a[blk].scales[iqs / 64];

    const float db = 1 + 2 * ((scale >> (4 * (grp32 & 1))) & 0xf);

    // move this group's qh bit to position 8 of the grid index
    const uint grid_idx = qs | ((qh << (8 - (grp4 % 8))) & 256);
    const uint32_t grid = iq3s_grid[grid_idx] >> (8 * (iqs % 4));

    const float flip0 = ((sign_bits & 1) != 0) ? -1.0 : 1.0;
    const float flip1 = ((sign_bits & 2) != 0) ? -1.0 : 1.0;
    return db * vec2(int(grid & 0xFF) * flip0,
                     int((grid >> 8) & 0xFF) * flip1);
}
// IQ3_S, four values at once: same index, sign and scale lookup as the
// two-value variant, returning all four bytes of the (shifted) grid word.
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
    const uint blk = a_offset + ib;
    const uint grp4 = iqs / 4;
    const uint grp32 = iqs / 32;

    const uint qs = data_a[blk].qs[grp4];
    const uint qh = data_a[blk].qh[grp32];
    const uint sign_bits = data_a[blk].signs[iqs / 8] >> (iqs % 8);
    const uint scale = data_a[blk].scales[grp32 / 2];

    const float db = 1 + 2 * ((scale >> (4 * (grp32 & 1))) & 0xf);

    // move this group's qh bit to position 8 of the grid index
    const uint grid_idx = qs | ((qh << (8 - (grp4 % 8))) & 256);
    const uint32_t grid = iq3s_grid[grid_idx] >> (8 * (iqs % 4));

    const float flip0 = ((sign_bits & 1) != 0) ? -1.0 : 1.0;
    const float flip1 = ((sign_bits & 2) != 0) ? -1.0 : 1.0;
    const float flip2 = ((sign_bits & 4) != 0) ? -1.0 : 1.0;
    const float flip3 = ((sign_bits & 8) != 0) ? -1.0 : 1.0;
    return db * vec4(int(grid & 0xFF) * flip0,
                     int((grid >> 8) & 0xFF) * flip1,
                     int((grid >> 16) & 0xFF) * flip2,
                     int((grid >> 24) & 0xFF) * flip3);
}
381#endif
382
383#if defined(DATA_A_IQ4_XS)
// IQ4_XS: decodes values iqs and iqs + 1 of block (a_offset + ib).
// Two 4-bit codes are looked up in kvalues_iq4nl and multiplied by the
// 6-bit sub-block scale (4 low bits from scales_l, 2 high bits from
// scales_h) minus 32.
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint blk = a_offset + ib;
    const uint grp32 = iqs / 32;
    const uint byte_idx = 16 * grp32 + (iqs % 16);

    const uint scale_lo = (data_a[blk].scales_l[grp32/2] >> (4 * (grp32 & 1))) & 0xF;
    const uint scale_hi = (data_a[blk].scales_h >> (2 * grp32)) & 3;
    const float dl = float(int(scale_lo | (scale_hi << 4)) - 32);

    // bit 4 of iqs selects the low (shift 0) or high (shift 4) nibble
    const uint nibble_shift = (iqs & 16) >> 2;
    u8vec2 codes = u8vec2(data_a[blk].qs[byte_idx], data_a[blk].qs[byte_idx + 1]);
    codes = (codes >> nibble_shift) & uint8_t(0xF);

    return dl * vec2(kvalues_iq4nl[codes.x], kvalues_iq4nl[codes.y]);
}
// IQ4_XS, four values at once: reads one 32-bit word of qs through the
// packed32 view, isolates one nibble per byte, looks each up in
// kvalues_iq4nl and applies the same sub-block scale as the two-value variant.
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
    const uint blk = a_offset + ib;
    const uint grp32 = iqs / 32;
    const uint byte_idx = 16 * grp32 + (iqs % 16);

    const uint scale_lo = (data_a[blk].scales_l[grp32/2] >> (4 * (grp32 & 1))) & 0xF;
    const uint scale_hi = (data_a[blk].scales_h >> (2 * grp32)) & 3;
    const float dl = float(int(scale_lo | (scale_hi << 4)) - 32);

    // bit 4 of iqs selects the low (shift 0) or high (shift 4) nibble of every byte
    const uint nibble_shift = (iqs & 16) >> 2;
    const uint word = data_a_packed32[blk].qs[byte_idx/4] >> nibble_shift;
    const u8vec4 codes = unpack8(word & 0x0F0F0F0F);

    return dl * vec4(
        kvalues_iq4nl[codes.x], kvalues_iq4nl[codes.y],
        kvalues_iq4nl[codes.z], kvalues_iq4nl[codes.w]);
}
411#endif
412
413#if defined(DATA_A_IQ4_NL)
// IQ4_NL: maps the low and high nibble of byte qs[iqs] of block
// (a_offset + ib) through the kvalues_iq4nl table. No scale is applied here.
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint bits = uint(data_a[a_offset + ib].qs[iqs]);
    const uint code_lo = bits & 0xF;
    const uint code_hi = bits >> 4;
    return vec2(kvalues_iq4nl[code_lo], kvalues_iq4nl[code_hi]);
}
// IQ4_NL, four values at once: maps the four nibbles (lowest first) of the
// 16-bit word qs[iqs/2] through the kvalues_iq4nl table.
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
    const uint word = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
    const uint c0 = word & 0xF;
    const uint c1 = (word >> 4) & 0xF;
    const uint c2 = (word >> 8) & 0xF;
    const uint c3 = word >> 12;
    return vec4(kvalues_iq4nl[c0], kvalues_iq4nl[c1], kvalues_iq4nl[c2], kvalues_iq4nl[c3]);
}
422#endif
423
424#if defined(DATA_A_MXFP4)
// MXFP4: maps the low and high nibble of byte qs[iqs] of block
// (a_offset + ib) through kvalues_mxfp4 and halves the result.
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint bits = uint(data_a[a_offset + ib].qs[iqs]);
    const uint code_lo = bits & 0xF;
    const uint code_hi = bits >> 4;
    return vec2(kvalues_mxfp4[code_lo], kvalues_mxfp4[code_hi]) * 0.5;
}
// MXFP4, four values at once: concatenates the two-value results for bytes
// iqs and iqs + 1.
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
    const vec2 first_pair = dequantize(ib, iqs, a_offset);
    const vec2 second_pair = dequantize(ib, iqs + 1, a_offset);
    return vec4(first_pair, second_pair);
}
434#endif
435
436#if defined(DATA_A_F32) || defined(DATA_A_F16) || defined(DATA_A_BF16)
// F32 / F16 / BF16: always returns (0, 0); neither argument is read.
vec2 get_dm(uint ib, uint a_offset) {
    return vec2(0);
}
440#endif
441
442#if defined(DATA_A_IQ1_M)
// IQ1_M: the block scale is a half-float whose 16 bits are spread over the
// top 4 bits of each of the four 16-bit scales words. Reassembles it and
// returns (d, 0).
vec2 get_dm(uint ib, uint a_offset) {
    const uint16_t[4] sc = data_a[a_offset + ib].scales;
    // keep only the top nibble of every word
    const u16vec4 top = u16vec4(sc[0], sc[1], sc[2], sc[3]) >> 12;
    const float d = float(unpackHalf2x16(top.x | (top.y << 4) | (top.z << 8) | (top.w << 12)).x);
    return vec2(d, 0);
}
449#endif
450
451#if defined(DATA_A_Q4_0) || defined(DATA_A_Q5_0) || defined(DATA_A_Q8_0) || defined(DATA_A_IQ1_S) || defined(DATA_A_IQ2_XXS) || defined(DATA_A_IQ2_XS) || defined(DATA_A_IQ2_S) || defined(DATA_A_IQ3_XXS) || defined(DATA_A_IQ3_S) || defined(DATA_A_IQ4_XS) || defined(DATA_A_IQ4_NL)
// Formats with a single per-block scale: returns (d, 0) where d is the
// block's d field converted to float.
vec2 get_dm(uint ib, uint a_offset) {
    const float d = float(data_a[a_offset + ib].d);
    return vec2(d, 0);
}
455#endif
456
457#if defined(DATA_A_MXFP4)
// MXFP4: returns (d, 0) where d is the block's e field converted with
// e8m0_to_fp32.
vec2 get_dm(uint ib, uint a_offset) {
    const float d = e8m0_to_fp32(data_a[a_offset + ib].e);
    return vec2(d, 0);
}
461#endif
462
463#if defined(DATA_A_Q4_1) || defined(DATA_A_Q5_1)
// Q4_1 / Q5_1: returns the block's dm pair, read through the packed32 view
// and converted to vec2.
vec2 get_dm(uint ib, uint a_offset) {
    return vec2(data_a_packed32[a_offset + ib].dm);
}
468#endif
469
470#if defined(DATA_A_Q2_K)
// Q2_K: fully dequantizes two values of super-block (a_offset + ib).
// iqs is halved first; the halved index selects the qs byte pair, the 2-bit
// field inside each byte and the scales byte. Result per value is
// dm.x * (scale low nibble) * q - dm.y * (scale high nibble).
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint blk = a_offset + ib;
    const uint e = iqs / 2;

    const uint byte_idx = (e / 64) * 32 + (e % 16) * 2;  // first of two adjacent qs bytes
    const uint field_shift = ((e % 64) / 16) * 2;        // which 2-bit field: 0,2,4,6
    const uint sc = data_a[blk].scales[e / 8];

    const uvec2 raw = uvec2(data_a[blk].qs[byte_idx], data_a[blk].qs[byte_idx + 1]);
    const vec2 dm = vec2(data_a[blk].dm);

    return dm.x * float(sc & 0xF) * vec2((raw >> field_shift) & 3) - dm.y * float(sc >> 4);
}
// Q2_K: always returns (1, 0); dequantize in this section already multiplies
// by the block's dm values.
vec2 get_dm(uint ib, uint a_offset) {
    return vec2(1.0, 0.0);
}
486#endif
487
488#if defined(DATA_A_Q3_K)
// Q3_K: fully dequantizes two values of super-block (a_offset + ib).
// iqs is halved first. Each value is a 2-bit field of a qs byte; 4 is
// subtracted when the matching hmask bit is clear. The result is multiplied
// by d * (6-bit scale - 32), the scale being assembled from the packed
// scales array.
// (Trailing range notes are carried over from the original annotations and
// depend on the iqs range the caller passes.)
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    iqs /= 2;
    const uint n = iqs / 64;                     // 0,1
    const uint qsi = n * 32 + (iqs % 16) * 2;    // 0,2,4..62
    const uint hmi =          (iqs % 16) * 2;    // 0,2,4..30
    const uint is = iqs / 8;                     // 0..15
    const uint halfsplit = ((iqs % 64) / 16);    // 0,1,2,3
    const uint qsshift = halfsplit * 2;          // 0,2,4,6
    const uint m = 1 << (4 * n + halfsplit);     // 1,2,4,8,16,32,64,128

    // low 4 bits come from scales[0..7] (nibble chosen by is / 8),
    // high 2 bits from scales[8..11] (2-bit field chosen by is / 4)
    const int8_t us = int8_t(((data_a[a_offset + ib].scales[is % 8] >> (4 * int(is / 8))) & 0xF)
                          | (((data_a[a_offset + ib].scales[8 + (is % 4)] >> (2 * int(is / 4))) & 3) << 4));
    const float dl = float(data_a[a_offset + ib].d) * float(us - 32);

    return vec2(dl * float(int8_t((data_a[a_offset + ib].qs[qsi    ] >> qsshift) & 3) - (((data_a[a_offset + ib].hmask[hmi    ] & m) != 0) ? 0 : 4)),
                dl * float(int8_t((data_a[a_offset + ib].qs[qsi + 1] >> qsshift) & 3) - (((data_a[a_offset + ib].hmask[hmi + 1] & m) != 0) ? 0 : 4)));
}
// Q3_K: always returns (1, 0); dequantize in this section already multiplies
// by the block's d value.
vec2 get_dm(uint ib, uint a_offset) {
    return vec2(1.0, 0.0);
}
510#endif
511
512#if defined(DATA_A_Q4_K)
// Q4_K: fully dequantizes two values of super-block (a_offset + ib).
// iqs is halved first. A 6-bit scale and a 6-bit min are assembled from the
// packed scales array (layout differs for sub-blocks 0..3 and 4..7), then
// each 4-bit quant q yields fma(dm.x * scale, q, -dm.y * min).
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint blk = a_offset + ib;
    const uint e = iqs / 2;
    const uint n = e / 32;
    const uint b = (e % 32) / 16;                 // 0: low nibble, 1: high nibble
    const uint sub = 2 * n + b;                   // sub-block index
    const uint byte_idx = n * 32 + (e % 16) * 2;  // first of two adjacent qs bytes

    const vec2 dm = vec2(data_a[blk].dm);

    const bool first_half = sub < 4;
    // where the low 4 bits and the high 2 bits of the scale live
    const uint sc_lo_idx = first_half ? sub : (sub + 4);
    const uint sc_hi_idx = first_half ? sub : (sub - 4);
    // where the low 4 bits and the high 2 bits of the min live
    const uint mn_lo_idx = sub + 4;
    const uint mn_hi_idx = first_half ? sub + 4 : sub;
    const uint mn_lo_mask = first_half ? 0xF : 0xF0;
    const uint mn_lo_shift = first_half ? 0 : 4;
    // the high 2 bits of scale and min share mask and shift
    const uint hi_mask = first_half ? 0x30 : 0xC0;
    const uint hi_shift = first_half ? 0 : 2;

    const uint8_t sc = uint8_t((data_a[blk].scales[sc_lo_idx] & 0xF) | ((data_a[blk].scales[sc_hi_idx] & hi_mask) >> hi_shift));
    const uint8_t mbyte = uint8_t((data_a[blk].scales[mn_lo_idx] & mn_lo_mask) >> mn_lo_shift | ((data_a[blk].scales[mn_hi_idx] & hi_mask) >> hi_shift));

    const float d = dm.x * sc;
    const float m = -dm.y * mbyte;

    const float q0 = float((data_a[blk].qs[byte_idx    ] >> (b * 4)) & 0xF);
    const float q1 = float((data_a[blk].qs[byte_idx + 1] >> (b * 4)) & 0xF);
    return vec2(fma(d, q0, m), fma(d, q1, m));
}
// Q4_K: always returns (1, 0); dequantize in this section already applies
// the block's dm values.
vec2 get_dm(uint ib, uint a_offset) {
    return vec2(1.0, 0.0);
}
545#endif
546
547#if defined(DATA_A_Q5_K)
// Q5_K: fully dequantizes two values of super-block (a_offset + ib).
// iqs is halved first. Scale and min are assembled as 6-bit values from the
// packed scales array; each quant is a 4-bit nibble from qs plus 16 when the
// matching qh bit is set, and yields fma(dm.x * scale, q, -dm.y * min).
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint blk = a_offset + ib;
    const uint e = iqs / 2;
    const uint n = e / 32;
    const uint b = (e % 32) / 16;                 // 0: low nibble, 1: high nibble
    const uint sub = 2 * n + b;                   // sub-block index
    const uint byte_idx = n * 32 + (e % 16) * 2;  // first of two adjacent qs bytes
    const uint qh_idx = (e % 16) * 2;             // first of two adjacent qh bytes

    // bit of the qh byte that carries the fifth bit for this group of 16
    const uint8_t hm = uint8_t(1 << (e / 16));

    const vec2 dm = vec2(data_a[blk].dm);

    const bool first_half = sub < 4;
    // where the low 4 bits and the high 2 bits of the scale live
    const uint sc_lo_idx = first_half ? sub : (sub + 4);
    const uint sc_hi_idx = first_half ? sub : (sub - 4);
    // where the low 4 bits and the high 2 bits of the min live
    const uint mn_lo_idx = sub + 4;
    const uint mn_hi_idx = first_half ? sub + 4 : sub;
    const uint mn_lo_mask = first_half ? 0xF : 0xF0;
    const uint mn_lo_shift = first_half ? 0 : 4;
    // the high 2 bits of scale and min share mask and shift
    const uint hi_mask = first_half ? 0x30 : 0xC0;
    const uint hi_shift = first_half ? 0 : 2;

    const uint8_t sc    = uint8_t((data_a[blk].scales[sc_lo_idx] & 0xF)                         | ((data_a[blk].scales[sc_hi_idx] & hi_mask) >> hi_shift));
    const uint8_t mbyte = uint8_t(((data_a[blk].scales[mn_lo_idx] & mn_lo_mask) >> mn_lo_shift) | ((data_a[blk].scales[mn_hi_idx] & hi_mask) >> hi_shift));

    const float d = dm.x * sc;
    const float m = -dm.y * mbyte;

    return vec2(fma(d, float((data_a[blk].qs[byte_idx    ] >> (b * 4)) & 0xF) + float((data_a[blk].qh[qh_idx    ] & hm) != 0 ? 16 : 0), m),
                fma(d, float((data_a[blk].qs[byte_idx + 1] >> (b * 4)) & 0xF) + float((data_a[blk].qh[qh_idx + 1] & hm) != 0 ? 16 : 0), m));
}
// Q5_K: always returns (1, 0); dequantize in this section already applies
// the block's dm values.
vec2 get_dm(uint ib, uint a_offset) {
    return vec2(1.0, 0.0);
}
583#endif
584
585#if defined(DATA_A_Q6_K)
// Q6_K: fully dequantizes two values of super-block (a_offset + ib).
// iqs is halved first. Each 6-bit quant is a nibble from ql combined with a
// 2-bit field from qh; 32 is subtracted and the result is multiplied by
// d * scales[...].
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint blk = a_offset + ib;
    const uint e = iqs / 2;
    const uint n = e / 64;
    const uint b = (e % 64) / 32;                  // 0: low nibble of ql, 1: high nibble
    const uint qh_shift = ((e % 64) / 16) * 2;     // which 2-bit field of qh: 0,2,4,6
    const uint sc_idx = 8 * n + qh_shift + (e % 16) / 8;
    const uint ql_idx = n * 64 + (e % 32) * 2;     // first of two adjacent ql bytes
    const uint qh_idx = n * 32 + (e % 16) * 2;     // first of two adjacent qh bytes

    const float dscale = float(data_a[blk].d) * float(data_a[blk].scales[sc_idx]);

    return vec2(dscale * float(int8_t(((data_a[blk].ql[ql_idx    ] >> (b * 4)) & 0xF) | (((data_a[blk].qh[qh_idx    ] >> qh_shift) & 3) << 4)) - 32),
                dscale * float(int8_t(((data_a[blk].ql[ql_idx + 1] >> (b * 4)) & 0xF) | (((data_a[blk].qh[qh_idx + 1] >> qh_shift) & 3) << 4)) - 32));
}
// Q6_K: always returns (1, 0); dequantize in this section already multiplies
// by the block's d value.
vec2 get_dm(uint ib, uint a_offset) {
    return vec2(1.0, 0.0);
}
604#endif