1#version 450
2
3#include "types.glsl"
4#include "generic_unary_head.glsl"
5#include "dequant_funcs.glsl"
6
// Workgroup size selection. main() only performs the copy on a single
// invocation; the wider workgroup exists solely so that init_iq_shmem has
// enough invocations to run.
#if defined(DATA_A_IQ4_NL) || defined(DATA_A_MXFP4)
// 16 invocations needed for init_iq_shmem
layout(local_size_x = 16, local_size_y = 1, local_size_z = 1) in;
#else
// Every other source format runs one invocation per workgroup.
layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
#endif
13
// Dequantizes one source block of QUANT_K values per workgroup and stores the
// results in data_d.
//
// The starting element index is built from the workgroup ID: x advances by one
// block (QUANT_K elements), y by 512 elements and z by 262144 (= 512 * 512)
// elements. Workgroups whose starting index is at or beyond p.ne do nothing.
void main() {
#ifdef NEEDS_INIT_IQ_SHMEM
    // Every invocation calls init_iq_shmem; afterwards only invocation 0
    // continues and performs the copy.
    init_iq_shmem(gl_WorkGroupSize);
    if (gl_LocalInvocationIndex != 0) {
        return;
    }
#endif

    const uint first_elem = gl_WorkGroupID.x * QUANT_K
                          + gl_WorkGroupID.y * 512
                          + gl_WorkGroupID.z * 262144;

    // Nothing to do for workgroups that start past the end of the data.
    if (first_elem >= p.ne) {
        return;
    }

    // Output position of the block's first value, and the source block index.
    const uint out_base = get_doffset() + dst_idx(first_elem);
    const uint ib       = src0_idx_quant(first_elem, QUANT_K);

    const uint a_offset = 0;

    // dm.x multiplies and dm.y is added to every dequantized value of this block.
    const vec2 dm = get_dm(ib, a_offset);

    [[unroll]] for (int j = 0; j < QUANT_K; j += 4) {
        const vec4 v = dequantize4(ib, j / QUANT_R, a_offset) * dm.x + vec4(dm.y);

#if QUANT_R == 2
        // Components 0 and 2 land in the first half of the output block,
        // components 1 and 3 in the second half (QUANT_K/2 further on).
        const uint lo = out_base + j / 2;
        const uint hi = lo + QUANT_K / 2;

        data_d[lo + 0] = v[0];
        data_d[hi + 0] = v[1];
        data_d[lo + 1] = v[2];
        data_d[hi + 1] = v[3];
#else
        // Components are stored consecutively.
        const uint at = out_base + j;

        data_d[at + 0] = v[0];
        data_d[at + 1] = v[1];
        data_d[at + 2] = v[2];
        data_d[at + 3] = v[3];
#endif
    }
}