diff options
Diffstat (limited to 'llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders')
25 files changed, 6644 insertions, 0 deletions
diff --git a/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl new file mode 100644 index 0000000..ca5bfcc --- /dev/null +++ b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl @@ -0,0 +1,72 @@ +@group(0) @binding(0) +#ifdef VEC4 +var<storage, read_write> src: array<vec4<f32>>; +#define VEC_SIZE 4 +#else +var<storage, read_write> src: array<f32>; +#define VEC_SIZE 1 +#endif + +@group(0) @binding(1) +var<storage, read_write> dst: array<i32>; + +struct Params { + offset_src: u32, // in elements + offset_dst: u32, // in elements + ne0: u32, +}; + +@group(0) @binding(2) +var<uniform> params: Params; + +const FLOAT_MIN: f32 = -1.0e9; + +struct Pair { + value: f32, + index: i32 +}; + +var<workgroup> shared_max: array<Pair, WG_SIZE>; + +@compute @workgroup_size(WG_SIZE) +fn main(@builtin(workgroup_id) wid: vec3<u32>, + @builtin(local_invocation_id) lid: vec3<u32>) { + let row_idx = params.offset_src + wid.x * params.ne0; + var local_pair = Pair(FLOAT_MIN, -1); +#ifdef VEC4 + for (var col = lid.x; col < params.ne0/VEC_SIZE; col += WG_SIZE) { + let vec_val = src[row_idx / VEC_SIZE + col]; + for (var v = 0u; v < VEC_SIZE; v++) { + let val = vec_val[v]; + if (val >= local_pair.value) { + local_pair = Pair(val, i32(col * VEC_SIZE + v)); + } + } + } +#else + for (var col = lid.x; col < params.ne0; col += WG_SIZE) { + if (src[row_idx + col] >= local_pair.value) { + local_pair = Pair(src[row_idx + col], i32(col)); + } + } +#endif + shared_max[lid.x] = local_pair; + workgroupBarrier(); + var offset: u32 = WG_SIZE >> 1; + while (offset > 0) { + if (lid.x < offset) { + let a = shared_max[lid.x]; + let b = shared_max[lid.x + offset]; + if (b.value > a.value) { + shared_max[lid.x] = b; + } else if (b.value == a.value && b.index > a.index) { + shared_max[lid.x] = b; + } + } + workgroupBarrier(); + offset >>= 1; + } + if (lid.x == 0u) { + dst[params.offset_dst + wid.x] = shared_max[0].index; + } +} diff --git a/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl new file mode 100644 index 0000000..46ed19f --- /dev/null +++ b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl @@ -0,0 +1,106 @@ +@group(0) @binding(0) +var<storage, read_write> src: array<f32>; + +@group(0) @binding(1) +var<storage, read_write> dst: array<i32>; + +struct Params { + offset_src: u32, // in elements + offset_dst: u32, // in elements + + stride_src1: u32, + stride_src2: u32, + stride_src3: u32, + + stride_dst1: u32, + stride_dst2: u32, + stride_dst3: u32, + + // src/dst dimensions + src_ne0: u32, + ne1: u32, + ne2: u32, + + ne0: u32, + top_k: u32, + + npr: u32, // tiles per row + nrows: u32 +}; + +@group(0) @binding(2) +var<uniform> params: Params; + +var<workgroup> shmem_idx: array<u32, WG_SIZE>; + +#if ORDER == 0 +#define EXTREME_VALUE 1e30 +#define SWAP_COMPARE_UP > +#define SWAP_COMPARE_DOWN < +#else +#define EXTREME_VALUE -1e30 +#define SWAP_COMPARE_UP < +#define SWAP_COMPARE_DOWN > +#endif + +@compute @workgroup_size(WG_SIZE) +fn main(@builtin(workgroup_id) wid: vec3<u32>, + @builtin(num_workgroups) num_wg: vec3<u32>, + @builtin(local_invocation_id) lid: vec3<u32>) { + let linear = wid.x + wid.y * num_wg.x; + // guard against overprovisioned workgroups + if (linear >= params.npr * params.nrows) { + return; + } + let tile = linear % params.npr; + var row = linear / params.npr; + let i3 = row / (params.ne2 * params.ne1); + row = row % (params.ne2 * params.ne1); + let i2 = row / params.ne1; + let i1 = row % params.ne1; + + let row_base = params.offset_src + + i1 * params.stride_src1 + + i2 * params.stride_src2 + + i3 * params.stride_src3; + + let tile_base = tile * WG_SIZE; + let idx = tile_base + lid.x; + shmem_idx[lid.x] = select(params.src_ne0, idx, idx < params.src_ne0); + workgroupBarrier(); + + var k = 2u; + while (k <= WG_SIZE) { + var j = k >> 1; + while (j > 0) { + let ixj = lid.x ^ j; + if (ixj > lid.x) { + let dir_up = (lid.x & k) == 0; + let a_idx = shmem_idx[lid.x]; + let b_idx = shmem_idx[ixj]; + let a_val = select(EXTREME_VALUE, src[row_base + a_idx], a_idx < params.src_ne0); + let b_val = select(EXTREME_VALUE, src[row_base + b_idx], b_idx < params.src_ne0); + let should_swap = select( + (a_val SWAP_COMPARE_DOWN b_val), + (a_val SWAP_COMPARE_UP b_val), + dir_up); + if (should_swap) { + shmem_idx[lid.x] = b_idx; + shmem_idx[ixj] = a_idx; + } + } + workgroupBarrier(); + j >>= 1; + } + k <<= 1; + } + + let out_idx = tile * params.top_k + lid.x; + if (out_idx < params.ne0 && lid.x < params.top_k) { + let row_dst = params.offset_dst + + i1 * params.stride_dst1 + + i2 * params.stride_dst2 + + i3 * params.stride_dst3; + dst[row_dst + out_idx] = i32(shmem_idx[lid.x]); + } +} diff --git a/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl new file mode 100644 index 0000000..9a77f6e --- /dev/null +++ b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl @@ -0,0 +1,134 @@ +@group(0) @binding(0) +var<storage, read_write> src: array<f32>; + +@group(0) @binding(1) +var<storage, read_write> idx_in: array<i32>; + +@group(0) @binding(2) +var<storage, read_write> idx_out: array<i32>; + +struct Params { + offset_src: u32, // in elements + offset_in: u32, // in elements + offset_out: u32, // in elements + + stride_src1: u32, + stride_src2: u32, + stride_src3: u32, + + stride_idx1: u32, + stride_idx2: u32, + stride_idx3: u32, + + stride_out1: u32, + stride_out2: u32, + stride_out3: u32, + + ne0: u32, + ne1: u32, + ne2: u32, + + top_k: u32, + + len: u32, + nm: u32, + nrows: u32 +}; + +@group(0) @binding(3) +var<uniform> params: Params; + +fn take_left(a_idx: i32, b_idx: i32, row_base: u32) -> bool { + let a_val = src[row_base + u32(a_idx)]; + let b_val = src[row_base + u32(b_idx)]; +#if ORDER == 0 + return a_val <= b_val; +#else + return a_val >= b_val; +#endif +} + +@compute @workgroup_size(WG_SIZE) +fn main(@builtin(workgroup_id) wid: vec3<u32>, + @builtin(num_workgroups) num_wg: vec3<u32>, + @builtin(local_invocation_id) lid: vec3<u32>) { + let linear = wid.x + wid.y * num_wg.x; + // guard against overprovisioned workgroups + if (linear >= params.nm * params.nrows) { + return; + } + + let start = (linear % params.nm) * params.len * 2; + let len0 = min(params.len, params.ne0 - start); + let rem1 = select(0, params.ne0 - (start + params.len), params.ne0 > (start + params.len)); + let len1 = min(params.len, rem1); + let total = len0 + len1; + let chunk = (total + WG_SIZE - 1u) / WG_SIZE; + let k0 = lid.x * chunk; + let k1 = min(min(k0 + chunk, total), params.top_k); + // guard against overprovisioned threads + if (k0 >= params.top_k || k0 >= total) { + return; + } + + var row = linear / params.nm; + let i3 = row / (params.ne2 * params.ne1); + row = row % (params.ne2 * params.ne1); + let i2 = row / params.ne1; + let i1 = row % params.ne1; + + let row_src = params.offset_src + + i1 * params.stride_src1 + + i2 * params.stride_src2 + + i3 * params.stride_src3; + + let row_in = params.offset_in + + i1 * params.stride_idx1 + + i2 * params.stride_idx2 + + i3 * params.stride_idx3; + + let row_out = params.offset_out + + i1 * params.stride_out1 + + i2 * params.stride_out2 + + i3 * params.stride_out3; + + + var low: u32 = select(0, k0 - len1, k0 > len1); + var high: u32 = min(k0, len0); + + while (low < high) { + let mid = (low + high) >> 1; + let idx0 = idx_in[row_in + start + mid]; + let idx1 = idx_in[row_in + start + params.len + (k0 - mid - 1)]; + if (take_left(idx0, idx1, row_src)) { + low = mid + 1; + } else { + high = mid; + } + } + + var i = low; + var j = k0 - i; + var k = k0; + while (k < k1) { + var take_l = false; + if (i >= len0) { + take_l = false; + } else if (j >= len1) { + take_l = true; + } else { + let idx0 = idx_in[row_in + start + i]; + let idx1 = idx_in[row_in + start + params.len + j]; + take_l = take_left(idx0, idx1, row_src); + } + + let out_idx = select( + idx_in[row_in + start + params.len + j], + idx_in[row_in + start + i], + take_l); + idx_out[row_out + start + k] = out_idx; + i = select(i, i + 1, take_l); + j = select(j + 1, j, take_l); + k += 1; + } +} diff --git a/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl new file mode 100644 index 0000000..55dd664 --- /dev/null +++ b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl @@ -0,0 +1,107 @@ +enable f16; + +struct Params { + ne: u32, + + // offsets in elements + offset_src0: u32, + offset_src1: u32, + offset_dst: u32, + + stride_src1_0: u32, + stride_src1_1: u32, + stride_src1_2: u32, + stride_src1_3: u32, + + a_ne0: u32, + a_ne1: u32, + a_ne2: u32, + + b_ne0: u32, + b_ne1: u32, + b_ne2: u32, + b_ne3: u32, +}; + +fn src1_index(_i: u32) -> u32 { + var i = _i; + let a_i3 = i / (params.a_ne2 * params.a_ne1 * params.a_ne0); + i = i % (params.a_ne2 * params.a_ne1 * params.a_ne0); + let a_i2 = i / (params.a_ne1 * params.a_ne0); + i = i % (params.a_ne1 * params.a_ne0); + let a_i1 = i / params.a_ne0; + let a_i0 = i % params.a_ne0; + + // handle repetition of b + // index loops back to the beginning and repeats after elements are exhausted = modulo + let b_i0 = a_i0 % params.b_ne0; + let b_i1 = a_i1 % params.b_ne1; + let b_i2 = a_i2 % params.b_ne2; + let b_i3 = a_i3 % params.b_ne3; + + // compute index for position in b's flat array + return b_i0 * params.stride_src1_0 + + b_i1 * params.stride_src1_1 + + b_i2 * params.stride_src1_2 + + b_i3 * params.stride_src1_3; +} + +#ifdef TYPE_F32 +#define DataType f32 +#endif +#ifdef TYPE_F16 +#define DataType f16 +#endif + +@group(0) @binding(0) +var<storage, read_write> src0: array<DataType>; + +@group(0) @binding(1) +var<storage, read_write> src1 : array<DataType>; + +#ifdef INPLACE +@group(0) @binding(2) +var<uniform> params: Params; + +#elif defined(OVERLAP) +@group(0) @binding(2) +var<uniform> params: Params; + +#else +@group(0) @binding(2) +var<storage, read_write> dst: array<DataType>; + +@group(0) @binding(3) +var<uniform> params: Params; +#endif + +fn op(a: DataType, b: DataType) -> DataType { +#ifdef OP_ADD + return a + b; +#elif defined(OP_SUB) + return a - b; +#elif defined(OP_MUL) + return a * b; +#elif defined(OP_DIV) + return a / b; +#endif +} + +fn update(dst_i: u32, src0_i: u32, src1_i: u32){ + let result = op(src0[src0_i], src1[src1_i]); + +#ifdef INPLACE + src0[dst_i] = result; +#elif defined(OVERLAP) + src1[dst_i] = result; +#else + dst[dst_i] = result; +#endif +} + +@compute @workgroup_size(WG_SIZE) +fn main(@builtin(global_invocation_id) gid: vec3<u32>) { + if (gid.x < params.ne) { + update(params.offset_dst + gid.x, params.offset_src0 + gid.x, params.offset_src1 + src1_index(gid.x)); + } +} diff --git a/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl new file mode 100644 index 0000000..389c97b --- /dev/null +++ b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl @@ -0,0 +1,930 @@ +#decl(BYTE_HELPERS) + +fn get_byte(value: u32, index: u32) -> u32 { + return (value >> (index * 8)) & 0xFF; +} + +fn get_byte_i32(value: u32, index: u32) -> i32 { + return bitcast<i32>(((value >> (index * 8)) & 0xFF) << 24) >> 24; +} + +#enddecl(BYTE_HELPERS) + +#decl(Q4_0_T) +struct q4_0 { + d: f16, + qs: array<f16, 8> +}; +#enddecl(Q4_0_T) + +#decl(Q4_1_T) +struct q4_1 { + d: f16, + m: f16, + qs: array<u32, 4> +}; +#enddecl(Q4_1_T) + +#decl(Q5_0_T) +struct q5_0 { + d: f16, + qh: array<f16, 2>, + qs: array<f16, 8> +}; +#enddecl(Q5_0_T) + +#decl(Q5_1_T) +struct q5_1 { + d: f16, + m: f16, + qh: u32, + qs: array<u32, 4> +}; +#enddecl(Q5_1_T) + +#decl(Q8_0_T) +struct q8_0 { + d: f16, + qs: array<f16, 16> +}; +#enddecl(Q8_0_T) + +#decl(Q8_1_T) +struct q8_1 { + d: f16, + m: f16, + qs: array<u32, 8> +}; +#enddecl(Q8_1_T) + +#decl(Q2_K_T) +struct q2_k { + scales: array<u32, 4>, + qs: array<u32, 16>, + d: f16, + dmin: f16 +}; +#enddecl(Q2_K_T) + +#decl(Q3_K_T) +struct q3_k { + hmask: array<f16, 16>, + qs: array<f16, 32>, + scales: array<f16, 6>, + d: f16 +}; +#enddecl(Q3_K_T) + +#decl(Q45_K_SCALE_MIN) + +fn get_scale_min(is: u32, scales: array<u32, 3>) -> vec2<f32> { + if (is < 4) { + let sc_byte = get_byte(scales[is / 4], is % 4); + let min_byte = get_byte(scales[(is + 4) / 4], is % 4); + return vec2(f32(sc_byte & 63), f32(min_byte & 63)); + } else { + let sc_min_lo = get_byte(scales[(is + 4) / 4], (is + 4) % 4); + let sc_hi = get_byte(scales[(is - 4) / 4], (is - 4) % 4); + let min_hi = get_byte(scales[is / 4], is % 4); + let sc = (sc_min_lo & 0xF) | ((sc_hi >> 6) << 4); + let m = (sc_min_lo >> 4) | ((min_hi >> 6) << 4); + return vec2(f32(sc), f32(m)); + } +} + +#enddecl(Q45_K_SCALE_MIN) + +#decl(Q4_K_T) +struct q4_k { + d: f16, + dmin: f16, + scales: array<u32, 3>, + qs: array<u32, 32> +}; +#enddecl(Q4_K_T) + +#decl(Q5_K_T) +struct q5_k { + d: f16, + dmin: f16, + scales: array<u32, 3>, + qh: array<u32, 8>, + qs: array<u32, 32> +}; +#enddecl(Q5_K_T) + +#decl(Q6_K_T) +struct q6_k { + ql: array<f16, 64>, + qh: array<f16, 32>, + scales: array<f16, 8>, + d: f16 +}; +#enddecl(Q6_K_T) + +#decl(IQ2_XXS_T) +struct iq2_xxs { + d: f16, + qs: array<f16, 32> +}; +#enddecl(IQ2_XXS_T) + +#decl(IQ2_XS_T) +struct iq2_xs { + d: f16, + qs: array<f16, 32>, + scales: array<f16, 4> +}; +#enddecl(IQ2_XS_T) + +#decl(IQ2_S_T) +struct iq2_s { + d: f16, + qs: array<f16, 32>, + qh: array<f16, 4>, + scales: array<f16, 4> +}; +#enddecl(IQ2_S_T) + +#decl(IQ3_XSS_T) +struct iq3_xxs { + d: f16, + qs: array<f16, 48> +}; +#enddecl(IQ3_XSS_T) + +#decl(IQ3_S_T) +struct iq3_s { + d: f16, + qs: array<f16, 32>, + qh: array<f16, 4>, + signs: array<f16, 16>, + scales: array<f16, 2> +}; +#enddecl(IQ3_S_T) + +#decl(IQ1_S_T) +struct iq1_s { + d: f16, + qs: array<f16, 16>, + qh: array<f16, 8> +}; +#enddecl(IQ1_S_T) + +#decl(IQ1_M_T) +struct iq1_m { + qs: array<u32, 8>, + qh: array<u32, 4>, + scales: array<u32, 2> +}; +#enddecl(IQ1_M_T) + +#decl(IQ4_NL_T) +struct iq4_nl { + d: f16, + qs: array<f16, 8>, +}; +#enddecl(IQ4_NL_T) + +#decl(IQ4_XS_T) +struct iq4_xs { + d: f16, + scales_h: f16, + scales_l: u32, + qs: array<u32, 32> +}; +#enddecl(IQ4_XS_T) + +#decl(IQ23_TABLES) +const kmask_iq2xs : array<u32, 2> = array<u32, 2>( + 0x08040201u, // 1, 2, 4, 8 + 0x80402010u // 16, 32, 64, 128 +); + +const ksigns_iq2xs: array<u32, 32> = array<u32, 32>( + 0x03828100,0x87060584,0x8b0a0988,0x0f8e8d0c, + 0x93121190,0x17969514,0x1b9a9918,0x9f1e1d9c, + 0xa32221a0,0x27a6a524,0x2baaa928,0xaf2e2dac, + 0x33b2b130,0xb73635b4,0xbb3a39b8,0x3fbebd3c, + 0xc34241c0,0x47c6c544,0x4bcac948,0xcf4e4dcc, + 0x53d2d150,0xd75655d4,0xdb5a59d8,0x5fdedd5c, + 0x63e2e160,0xe76665e4,0xeb6a69e8,0x6feeed6c, + 0xf37271f0,0x77f6f574,0x7bfaf978,0xff7e7dfc +); +#enddecl(IQ23_TABLES) + +#decl(IQ2_XXS_GRID) +const iq2xxs_grid = array<u32, 512>( + 0x08080808, 0x08080808, 0x0808082b, 0x08080808, 0x08081919, 0x08080808, 0x08082b08, 0x08080808, + 0x08082b2b, 0x08080808, 0x08190819, 0x08080808, 0x08191908, 0x08080808, 0x082b0808, 0x08080808, + 0x082b082b, 0x08080808, 0x082b2b08, 0x08080808, 0x082b2b2b, 0x08080808, 0x19080819, 0x08080808, + 0x19081908, 0x08080808, 0x19190808, 0x08080808, 0x19192b08, 0x08080808, 0x192b0819, 0x08080808, + 0x192b1908, 0x08080808, 0x2b080808, 0x08080808, 0x2b08082b, 0x08080808, 0x2b082b2b, 0x08080808, + 0x2b2b082b, 0x08080808, 0x08080819, 0x08080819, 0x08081908, 0x08080819, 0x08190808, 0x08080819, + 0x08191919, 0x08080819, 0x19080808, 0x08080819, 0x2b081908, 0x08080819, 0x2b192b08, 0x08080819, + 0x08080808, 0x0808082b, 0x0808082b, 0x0808082b, 0x082b082b, 0x0808082b, 0x2b08082b, 0x0808082b, + 0x08080819, 0x08081908, 0x08081908, 0x08081908, 0x08190808, 0x08081908, 0x082b0819, 0x08081908, + 0x082b1908, 0x08081908, 0x19080808, 0x08081908, 0x1908082b, 0x08081908, 0x19082b08, 0x08081908, + 0x192b0808, 0x08081908, 0x2b080819, 0x08081908, 0x2b081908, 0x08081908, 0x2b190808, 0x08081908, + 0x2b2b1908, 0x08081908, 0x08080808, 0x08081919, 0x0808082b, 0x08081919, 0x08082b08, 0x08081919, + 0x082b0808, 0x08081919, 0x1908192b, 0x08081919, 0x192b2b19, 0x08081919, 0x2b080808, 0x08081919, + 0x2b190819, 0x08081919, 0x08082b19, 0x0808192b, 0x08190808, 0x0808192b, 0x19080808, 0x0808192b, + 0x2b081908, 0x0808192b, 0x2b2b1908, 0x0808192b, 0x08080808, 0x08082b08, 0x08081919, 0x08082b08, + 0x08082b08, 0x08082b08, 0x08191908, 0x08082b08, 0x082b2b08, 0x08082b08, 0x19080819, 0x08082b08, + 0x19081908, 0x08082b08, 0x19190808, 0x08082b08, 0x1919082b, 0x08082b08, 0x2b082b08, 0x08082b08, + 0x08081908, 0x08082b19, 0x19080808, 0x08082b19, 0x0808082b, 0x08082b2b, 0x08191908, 0x08082b2b, + 0x08080819, 0x08190808, 0x08081908, 0x08190808, 0x08190808, 0x08190808, 0x082b0819, 0x08190808, + 0x19080808, 0x08190808, 0x192b0808, 0x08190808, 0x2b081908, 0x08190808, 0x2b190808, 0x08190808, + 0x2b191919, 0x08190808, 0x08080808, 0x08190819, 0x08082b08, 0x08190819, 0x082b0808, 0x08190819, + 0x19190808, 0x08190819, 0x19192b2b, 0x08190819, 0x2b080808, 0x08190819, 0x082b1908, 0x0819082b, + 0x19081919, 0x0819082b, 0x08080808, 0x08191908, 0x08082b08, 0x08191908, 0x082b0808, 0x08191908, + 0x082b1919, 0x08191908, 0x19082b19, 0x08191908, 0x2b080808, 0x08191908, 0x08192b08, 0x08191919, + 0x192b082b, 0x08191919, 0x08080808, 0x0819192b, 0x0819192b, 0x0819192b, 0x08080819, 0x08192b08, + 0x08081908, 0x08192b08, 0x08190808, 0x08192b08, 0x19080808, 0x08192b08, 0x2b080819, 0x08192b08, + 0x08080808, 0x08192b19, 0x08081919, 0x08192b19, 0x2b2b0808, 0x08192b19, 0x19190819, 0x08192b2b, + 0x08080808, 0x082b0808, 0x0808082b, 0x082b0808, 0x08082b2b, 0x082b0808, 0x19081908, 0x082b0808, + 0x192b0819, 0x082b0808, 0x2b080808, 0x082b0808, 0x2b08082b, 0x082b0808, 0x082b2b19, 0x082b0819, + 0x19082b08, 0x082b0819, 0x08080808, 0x082b082b, 0x0808082b, 0x082b082b, 0x08080819, 0x082b1908, + 0x08081908, 0x082b1908, 0x08190808, 0x082b1908, 0x19080808, 0x082b1908, 0x1919192b, 0x082b1908, + 0x08080808, 0x082b1919, 0x19080819, 0x082b1919, 0x192b1908, 0x082b1919, 0x2b190808, 0x082b192b, + 0x08082b08, 0x082b2b08, 0x082b0808, 0x082b2b08, 0x2b191908, 0x082b2b08, 0x19081908, 0x082b2b2b, + 0x08080819, 0x19080808, 0x08081908, 0x19080808, 0x08190808, 0x19080808, 0x08192b08, 0x19080808, + 0x082b0819, 0x19080808, 0x082b1908, 0x19080808, 0x19080808, 0x19080808, 0x19082b08, 0x19080808, + 0x1919192b, 0x19080808, 0x192b0808, 0x19080808, 0x2b080819, 0x19080808, 0x2b081908, 0x19080808, + 0x2b190808, 0x19080808, 0x08080808, 0x19080819, 0x082b0808, 0x19080819, 0x192b0819, 0x19080819, + 0x2b080808, 0x19080819, 0x2b081919, 0x19080819, 0x08080819, 0x1908082b, 0x08190808, 0x1908082b, + 0x19082b08, 0x1908082b, 0x1919192b, 0x1908082b, 0x192b2b08, 0x1908082b, 0x08080808, 0x19081908, + 0x08082b08, 0x19081908, 0x082b0808, 0x19081908, 0x2b080808, 0x19081908, 0x2b192b19, 0x19081908, + 0x0819082b, 0x19081919, 0x082b1908, 0x19081919, 0x08080808, 0x1908192b, 0x08080819, 0x19082b08, + 0x08081908, 0x19082b08, 0x08190808, 0x19082b08, 0x19080808, 0x19082b08, 0x19081919, 0x19082b08, + 0x08080808, 0x19082b19, 0x19192b08, 0x19082b19, 0x192b0819, 0x19082b19, 0x2b08082b, 0x19082b19, + 0x19081919, 0x19082b2b, 0x2b190808, 0x19082b2b, 0x08080808, 0x19190808, 0x08082b08, 0x19190808, + 0x08190819, 0x19190808, 0x08192b19, 0x19190808, 0x082b0808, 0x19190808, 0x2b080808, 0x19190808, + 0x2b082b08, 0x19190808, 0x08081908, 0x19190819, 0x1908082b, 0x19190819, 0x2b2b1908, 0x19190819, + 0x2b190819, 0x1919082b, 0x2b190808, 0x19191908, 0x2b19082b, 0x19191908, 0x08082b2b, 0x19191919, + 0x08080819, 0x1919192b, 0x19191908, 0x1919192b, 0x08080808, 0x19192b08, 0x08190819, 0x19192b08, + 0x08192b19, 0x19192b08, 0x192b1908, 0x19192b08, 0x19080808, 0x19192b19, 0x08082b08, 0x19192b2b, + 0x08081908, 0x192b0808, 0x08190808, 0x192b0808, 0x19080808, 0x192b0808, 0x192b2b08, 0x192b0808, + 0x08080808, 0x192b0819, 0x19191919, 0x192b0819, 0x08192b08, 0x192b082b, 0x192b0808, 0x192b082b, + 0x08080808, 0x192b1908, 0x08081919, 0x192b1908, 0x08190808, 0x192b1919, 0x0819082b, 0x192b1919, + 0x2b081908, 0x192b1919, 0x1908082b, 0x192b2b08, 0x08080808, 0x2b080808, 0x0808082b, 0x2b080808, + 0x08082b2b, 0x2b080808, 0x19080819, 0x2b080808, 0x2b08082b, 0x2b080808, 0x08081908, 0x2b080819, + 0x08192b08, 0x2b080819, 0x19080808, 0x2b080819, 0x08190819, 0x2b08082b, 0x08080819, 0x2b081908, + 0x08081908, 0x2b081908, 0x08190808, 0x2b081908, 0x08191919, 0x2b081908, 0x19080808, 0x2b081908, + 0x192b0808, 0x2b081908, 0x08080808, 0x2b081919, 0x1908192b, 0x2b081919, 0x2b191908, 0x2b081919, + 0x08082b19, 0x2b08192b, 0x19080808, 0x2b08192b, 0x192b0808, 0x2b08192b, 0x0808082b, 0x2b082b08, + 0x08081908, 0x2b082b19, 0x08190819, 0x2b082b2b, 0x08081908, 0x2b190808, 0x08190808, 0x2b190808, + 0x082b1908, 0x2b190808, 0x19080808, 0x2b190808, 0x2b2b0819, 0x2b190808, 0x0819192b, 0x2b190819, + 0x2b080808, 0x2b190819, 0x19081919, 0x2b19082b, 0x08080808, 0x2b191908, 0x082b082b, 0x2b191908, + 0x19081908, 0x2b191908, 0x19190819, 0x2b191919, 0x2b080819, 0x2b192b08, 0x082b0808, 0x2b192b19, + 0x0808082b, 0x2b2b0808, 0x19190808, 0x2b2b0808, 0x2b081919, 0x2b2b0808, 0x08082b19, 0x2b2b0819, + 0x08080808, 0x2b2b082b, 0x08192b08, 0x2b2b1908, 0x19190808, 0x2b2b2b08, 0x08081908, 0x2b2b2b19 +); +#enddecl(IQ2_XXS_GRID) + +#decl(IQ2_XS_GRID) +const iq2xs_grid = array<u32, 1024>( + 0x08080808, 0x08080808, 0x0808082b, 0x08080808, 0x08081919, 0x08080808, 0x08082b08, 0x08080808, + 0x08082b2b, 0x08080808, 0x08190819, 0x08080808, 0x08191908, 0x08080808, 0x0819192b, 0x08080808, + 0x08192b19, 0x08080808, 0x082b0808, 0x08080808, 0x082b082b, 0x08080808, 0x082b1919, 0x08080808, + 0x082b2b08, 0x08080808, 0x19080819, 0x08080808, 0x19081908, 0x08080808, 0x1908192b, 0x08080808, + 0x19082b19, 0x08080808, 0x19190808, 0x08080808, 0x1919082b, 0x08080808, 0x19191919, 0x08080808, + 0x19192b08, 0x08080808, 0x192b0819, 0x08080808, 0x192b1908, 0x08080808, 0x2b080808, 0x08080808, + 0x2b08082b, 0x08080808, 0x2b081919, 0x08080808, 0x2b082b08, 0x08080808, 0x2b190819, 0x08080808, + 0x2b191908, 0x08080808, 0x2b192b19, 0x08080808, 0x2b2b0808, 0x08080808, 0x08080819, 0x08080819, + 0x08081908, 0x08080819, 0x0808192b, 0x08080819, 0x08082b19, 0x08080819, 0x08190808, 0x08080819, + 0x0819082b, 0x08080819, 0x08191919, 0x08080819, 0x08192b08, 0x08080819, 0x08192b2b, 0x08080819, + 0x082b0819, 0x08080819, 0x082b1908, 0x08080819, 0x19080808, 0x08080819, 0x1908082b, 0x08080819, + 0x19081919, 0x08080819, 0x19082b08, 0x08080819, 0x19190819, 0x08080819, 0x19191908, 0x08080819, + 0x192b0808, 0x08080819, 0x192b2b08, 0x08080819, 0x2b080819, 0x08080819, 0x2b081908, 0x08080819, + 0x2b190808, 0x08080819, 0x08080808, 0x0808082b, 0x0808082b, 0x0808082b, 0x08081919, 0x0808082b, + 0x08082b08, 0x0808082b, 0x08190819, 0x0808082b, 0x08191908, 0x0808082b, 0x082b0808, 0x0808082b, + 0x19080819, 0x0808082b, 0x19081908, 0x0808082b, 0x19190808, 0x0808082b, 0x19191919, 0x0808082b, + 0x2b080808, 0x0808082b, 0x2b082b2b, 0x0808082b, 0x08080819, 0x08081908, 0x08081908, 0x08081908, + 0x0808192b, 0x08081908, 0x08082b19, 0x08081908, 0x08190808, 0x08081908, 0x0819082b, 0x08081908, + 0x08191919, 0x08081908, 0x08192b08, 0x08081908, 0x082b0819, 0x08081908, 0x082b1908, 0x08081908, + 0x19080808, 0x08081908, 0x1908082b, 0x08081908, 0x19081919, 0x08081908, 0x19082b08, 0x08081908, + 0x19190819, 0x08081908, 0x19191908, 0x08081908, 0x1919192b, 0x08081908, 0x192b0808, 0x08081908, + 0x2b080819, 0x08081908, 0x2b081908, 0x08081908, 0x2b190808, 0x08081908, 0x08080808, 0x08081919, + 0x0808082b, 0x08081919, 0x08081919, 0x08081919, 0x08082b08, 0x08081919, 0x08190819, 0x08081919, + 0x08191908, 0x08081919, 0x082b0808, 0x08081919, 0x19080819, 0x08081919, 0x19081908, 0x08081919, + 0x19190808, 0x08081919, 0x192b0819, 0x08081919, 0x2b080808, 0x08081919, 0x08080819, 0x0808192b, + 0x08081908, 0x0808192b, 0x08190808, 0x0808192b, 0x082b192b, 0x0808192b, 0x19080808, 0x0808192b, + 0x1908082b, 0x0808192b, 0x2b081908, 0x0808192b, 0x08080808, 0x08082b08, 0x0808082b, 0x08082b08, + 0x08081919, 0x08082b08, 0x08082b08, 0x08082b08, 0x08082b2b, 0x08082b08, 0x08190819, 0x08082b08, + 0x08191908, 0x08082b08, 0x082b0808, 0x08082b08, 0x082b1919, 0x08082b08, 0x19080819, 0x08082b08, + 0x19081908, 0x08082b08, 0x19190808, 0x08082b08, 0x19192b08, 0x08082b08, 0x2b080808, 0x08082b08, + 0x2b2b0808, 0x08082b08, 0x2b2b2b2b, 0x08082b08, 0x08080819, 0x08082b19, 0x08081908, 0x08082b19, + 0x08190808, 0x08082b19, 0x19080808, 0x08082b19, 0x2b080819, 0x08082b19, 0x2b082b19, 0x08082b19, + 0x08080808, 0x08082b2b, 0x082b0808, 0x08082b2b, 0x082b2b08, 0x08082b2b, 0x2b19192b, 0x08082b2b, + 0x2b2b0808, 0x08082b2b, 0x08080819, 0x08190808, 0x08081908, 0x08190808, 0x0808192b, 0x08190808, + 0x08082b19, 0x08190808, 0x08190808, 0x08190808, 0x0819082b, 0x08190808, 0x08191919, 0x08190808, + 0x08192b08, 0x08190808, 0x082b0819, 0x08190808, 0x082b1908, 0x08190808, 0x19080808, 0x08190808, + 0x1908082b, 0x08190808, 0x19081919, 0x08190808, 0x19082b08, 0x08190808, 0x19190819, 0x08190808, + 0x19191908, 0x08190808, 0x192b0808, 0x08190808, 0x192b2b2b, 0x08190808, 0x2b080819, 0x08190808, + 0x2b081908, 0x08190808, 0x2b190808, 0x08190808, 0x08080808, 0x08190819, 0x0808082b, 0x08190819, + 0x08081919, 0x08190819, 0x08082b08, 0x08190819, 0x08190819, 0x08190819, 0x08191908, 0x08190819, + 0x082b0808, 0x08190819, 0x19080819, 0x08190819, 0x19081908, 0x08190819, 0x19190808, 0x08190819, + 0x2b080808, 0x08190819, 0x2b191908, 0x08190819, 0x2b19192b, 0x08190819, 0x08080819, 0x0819082b, + 0x08081908, 0x0819082b, 0x0808192b, 0x0819082b, 0x08190808, 0x0819082b, 0x19080808, 0x0819082b, + 0x192b0808, 0x0819082b, 0x08080808, 0x08191908, 0x0808082b, 0x08191908, 0x08081919, 0x08191908, + 0x08082b08, 0x08191908, 0x08190819, 0x08191908, 0x08191908, 0x08191908, 0x082b0808, 0x08191908, + 0x19080819, 0x08191908, 0x19081908, 0x08191908, 0x19082b19, 0x08191908, 0x19190808, 0x08191908, + 0x192b1908, 0x08191908, 0x2b080808, 0x08191908, 0x08080819, 0x08191919, 0x08081908, 0x08191919, + 0x08190808, 0x08191919, 0x19080808, 0x08191919, 0x08080808, 0x0819192b, 0x08191908, 0x0819192b, + 0x19082b19, 0x0819192b, 0x08080819, 0x08192b08, 0x08081908, 0x08192b08, 0x08190808, 0x08192b08, + 0x0819082b, 0x08192b08, 0x19080808, 0x08192b08, 0x19191908, 0x08192b08, 0x2b08192b, 0x08192b08, + 0x08080808, 0x08192b19, 0x08081919, 0x08192b19, 0x192b192b, 0x08192b19, 0x19190819, 0x08192b2b, + 0x2b2b2b19, 0x08192b2b, 0x08080808, 0x082b0808, 0x0808082b, 0x082b0808, 0x08081919, 0x082b0808, + 0x08082b08, 0x082b0808, 0x08082b2b, 0x082b0808, 0x08190819, 0x082b0808, 0x08191908, 0x082b0808, + 0x082b0808, 0x082b0808, 0x19080819, 0x082b0808, 0x19081908, 0x082b0808, 0x19190808, 0x082b0808, + 0x2b080808, 0x082b0808, 0x2b2b0808, 0x082b0808, 0x08080819, 0x082b0819, 0x08081908, 0x082b0819, + 0x08190808, 0x082b0819, 0x19080808, 0x082b0819, 0x19082b08, 0x082b0819, 0x192b1919, 0x082b0819, + 0x08080808, 0x082b082b, 0x082b082b, 0x082b082b, 0x2b080808, 0x082b082b, 0x2b2b2b08, 0x082b082b, + 0x08080819, 0x082b1908, 0x08081908, 0x082b1908, 0x08190808, 0x082b1908, 0x082b2b19, 0x082b1908, + 0x19080808, 0x082b1908, 0x08080808, 0x082b1919, 0x19080819, 0x082b1919, 0x1919082b, 0x082b1919, + 0x2b192b19, 0x082b1919, 0x08080819, 0x082b192b, 0x08192b2b, 0x082b192b, 0x2b2b192b, 0x082b192b, + 0x08080808, 0x082b2b08, 0x08082b08, 0x082b2b08, 0x08082b2b, 0x082b2b08, 0x082b0808, 0x082b2b08, + 0x19191919, 0x082b2b08, 0x2b082b08, 0x082b2b08, 0x2b2b082b, 0x082b2b08, 0x192b2b08, 0x082b2b19, + 0x2b190808, 0x082b2b19, 0x08082b08, 0x082b2b2b, 0x082b0808, 0x082b2b2b, 0x2b08082b, 0x082b2b2b, + 0x2b082b08, 0x082b2b2b, 0x2b082b2b, 0x082b2b2b, 0x08080819, 0x19080808, 0x08081908, 0x19080808, + 0x0808192b, 0x19080808, 0x08082b19, 0x19080808, 0x08190808, 0x19080808, 0x0819082b, 0x19080808, + 0x08191919, 0x19080808, 0x08192b08, 0x19080808, 0x082b0819, 0x19080808, 0x082b1908, 0x19080808, + 0x19080808, 0x19080808, 0x1908082b, 0x19080808, 0x19081919, 0x19080808, 0x19082b08, 0x19080808, + 0x19082b2b, 0x19080808, 0x19190819, 0x19080808, 0x19191908, 0x19080808, 0x192b0808, 0x19080808, + 0x192b1919, 0x19080808, 0x2b080819, 0x19080808, 0x2b081908, 0x19080808, 0x2b190808, 0x19080808, + 0x08080808, 0x19080819, 0x0808082b, 0x19080819, 0x08081919, 0x19080819, 0x08082b08, 0x19080819, + 0x08190819, 0x19080819, 0x08191908, 0x19080819, 0x082b0808, 0x19080819, 0x19080819, 0x19080819, + 0x19081908, 0x19080819, 0x19190808, 0x19080819, 0x2b080808, 0x19080819, 0x2b081919, 0x19080819, + 0x2b2b082b, 0x19080819, 0x08080819, 0x1908082b, 0x08081908, 0x1908082b, 0x08190808, 0x1908082b, + 0x0819082b, 0x1908082b, 0x082b2b19, 0x1908082b, 0x19080808, 0x1908082b, 0x08080808, 0x19081908, + 0x0808082b, 0x19081908, 0x08081919, 0x19081908, 0x08082b08, 0x19081908, 0x08190819, 0x19081908, + 0x08191908, 0x19081908, 0x08192b19, 0x19081908, 0x082b0808, 0x19081908, 0x19080819, 0x19081908, + 0x19081908, 0x19081908, 0x19190808, 0x19081908, 0x2b080808, 0x19081908, 0x2b191908, 0x19081908, + 0x08080819, 0x19081919, 0x08081908, 0x19081919, 0x08190808, 0x19081919, 0x082b1908, 0x19081919, + 0x19080808, 0x19081919, 0x2b192b2b, 0x19081919, 0x08080808, 0x1908192b, 0x08082b2b, 0x1908192b, + 0x19081908, 0x1908192b, 0x19190808, 0x1908192b, 0x08080819, 0x19082b08, 0x08081908, 0x19082b08, + 0x08190808, 0x19082b08, 0x19080808, 0x19082b08, 0x19081919, 0x19082b08, 0x19191908, 0x19082b08, + 0x192b082b, 0x19082b08, 0x08080808, 0x19082b19, 0x08190819, 0x19082b19, 0x19081908, 0x19082b19, + 0x19190808, 0x19082b19, 0x192b2b19, 0x19082b19, 0x08081908, 0x19082b2b, 0x08080808, 0x19190808, + 0x0808082b, 0x19190808, 0x08081919, 0x19190808, 0x08082b08, 0x19190808, 0x08190819, 0x19190808, + 0x08191908, 0x19190808, 0x082b0808, 0x19190808, 0x082b2b08, 0x19190808, 0x19080819, 0x19190808, + 0x19081908, 0x19190808, 0x19190808, 0x19190808, 0x2b080808, 0x19190808, 0x08080819, 0x19190819, + 0x08081908, 0x19190819, 0x08190808, 0x19190819, 0x08191919, 0x19190819, 0x19080808, 0x19190819, + 0x1908082b, 0x19190819, 0x08080808, 0x1919082b, 0x19081908, 0x1919082b, 0x2b2b2b2b, 0x1919082b, + 0x08080819, 0x19191908, 0x08081908, 0x19191908, 0x08190808, 0x19191908, 0x082b0819, 0x19191908, + 0x19080808, 0x19191908, 0x192b0808, 0x19191908, 0x2b080819, 0x19191908, 0x2b2b0819, 0x19191908, + 0x08080808, 0x19191919, 0x08082b08, 0x19191919, 0x2b080808, 0x19191919, 0x2b082b08, 0x19191919, + 0x082b0819, 0x1919192b, 0x192b2b08, 0x1919192b, 0x2b2b0819, 0x1919192b, 0x08080808, 0x19192b08, + 0x08191908, 0x19192b08, 0x19080819, 0x19192b08, 0x19190808, 0x19192b08, 0x2b192b19, 0x19192b08, + 0x08192b2b, 0x19192b19, 0x19080808, 0x19192b19, 0x1908082b, 0x19192b19, 0x2b081919, 0x19192b2b, + 0x08080819, 0x192b0808, 0x08081908, 0x192b0808, 0x08190808, 0x192b0808, 0x19080808, 0x192b0808, + 0x19191908, 0x192b0808, 0x192b082b, 0x192b0808, 0x2b08192b, 0x192b0808, 0x2b2b2b19, 0x192b0808, + 0x08080808, 0x192b0819, 0x082b1908, 0x192b082b, 0x19082b2b, 0x192b082b, 0x2b19082b, 0x192b082b, + 0x08080808, 0x192b1908, 0x0819192b, 0x192b1908, 0x08190808, 0x192b1919, 0x19080808, 0x192b1919, + 0x19081919, 0x192b1919, 0x2b2b1908, 0x192b1919, 0x08080819, 0x192b2b08, 0x192b2b2b, 0x192b2b08, + 0x082b1919, 0x192b2b19, 0x0808192b, 0x192b2b2b, 0x19191908, 0x192b2b2b, 0x192b082b, 0x192b2b2b, + 0x08080808, 0x2b080808, 0x0808082b, 0x2b080808, 0x08081919, 0x2b080808, 0x08082b08, 0x2b080808, + 0x08190819, 0x2b080808, 0x08191908, 0x2b080808, 0x082b0808, 0x2b080808, 0x082b2b2b, 0x2b080808, + 0x19080819, 0x2b080808, 0x19081908, 0x2b080808, 0x19190808, 0x2b080808, 0x2b080808, 0x2b080808, + 0x2b08082b, 0x2b080808, 0x2b2b2b08, 0x2b080808, 0x2b2b2b2b, 0x2b080808, 0x08080819, 0x2b080819, + 0x08081908, 0x2b080819, 0x0808192b, 0x2b080819, 0x08190808, 0x2b080819, 0x19080808, 0x2b080819, + 0x19190819, 0x2b080819, 0x19192b19, 0x2b080819, 0x08080808, 0x2b08082b, 0x082b0808, 0x2b08082b, + 0x2b080808, 0x2b08082b, 0x2b08082b, 0x2b08082b, 0x2b2b0808, 0x2b08082b, 0x2b2b2b08, 0x2b08082b, + 0x08080819, 0x2b081908, 0x08081908, 0x2b081908, 0x08190808, 0x2b081908, 0x0819082b, 0x2b081908, + 0x08191919, 0x2b081908, 0x19080808, 0x2b081908, 0x192b0808, 0x2b081908, 0x2b082b19, 0x2b081908, + 0x08080808, 0x2b081919, 0x19081908, 0x2b081919, 0x2b2b1919, 0x2b081919, 0x08192b08, 0x2b08192b, + 0x192b2b2b, 0x2b08192b, 0x08080808, 0x2b082b08, 0x08082b08, 0x2b082b08, 0x082b1919, 0x2b082b08, + 0x19192b2b, 0x2b082b08, 0x2b080808, 0x2b082b08, 0x2b08082b, 0x2b082b08, 0x2b2b2b08, 0x2b082b08, + 0x0808192b, 0x2b082b19, 0x082b082b, 0x2b082b2b, 0x2b080808, 0x2b082b2b, 0x2b082b08, 0x2b082b2b, + 0x2b19192b, 0x2b082b2b, 0x2b2b2b08, 0x2b082b2b, 0x08080819, 0x2b190808, 0x08081908, 0x2b190808, + 0x08190808, 0x2b190808, 0x19080808, 0x2b190808, 0x1919192b, 0x2b190808, 0x2b081908, 0x2b190808, + 0x08080808, 0x2b190819, 0x082b082b, 0x2b190819, 0x192b1908, 0x2b190819, 0x1919192b, 0x2b19082b, + 0x2b082b19, 0x2b19082b, 0x08080808, 0x2b191908, 0x08081919, 0x2b191908, 0x19081908, 0x2b191908, + 0x19190808, 0x2b191908, 0x19192b08, 0x2b191908, 0x082b2b19, 0x2b191919, 0x2b190808, 0x2b191919, + 0x2b19082b, 0x2b191919, 0x19080819, 0x2b19192b, 0x19190819, 0x2b192b08, 0x2b2b192b, 0x2b192b08, + 0x19082b19, 0x2b192b19, 0x08191919, 0x2b192b2b, 0x192b0808, 0x2b192b2b, 0x08080808, 0x2b2b0808, + 0x0808082b, 0x2b2b0808, 0x08082b08, 0x2b2b0808, 0x08082b2b, 0x2b2b0808, 0x082b0808, 0x2b2b0808, + 0x082b2b2b, 0x2b2b0808, 0x2b2b0808, 0x2b2b0808, 0x19190819, 0x2b2b0819, 0x19192b19, 0x2b2b0819, + 0x2b2b192b, 0x2b2b0819, 0x08080808, 0x2b2b082b, 0x0808082b, 0x2b2b082b, 0x08082b08, 0x2b2b082b, + 0x082b2b2b, 0x2b2b082b, 0x2b080808, 0x2b2b082b, 0x2b2b0808, 0x2b2b082b, 0x19080808, 0x2b2b1908, + 0x2b191919, 0x2b2b1908, 0x192b1919, 0x2b2b192b, 0x2b192b08, 0x2b2b192b, 0x08082b2b, 0x2b2b2b08, + 0x082b0808, 0x2b2b2b08, 0x082b082b, 0x2b2b2b08, 0x082b2b08, 0x2b2b2b08, 0x2b2b0808, 0x2b2b2b08, + 0x2b2b2b08, 0x2b2b2b08, 0x08081908, 0x2b2b2b19, 0x2b081908, 0x2b2b2b19, 0x2b08192b, 0x2b2b2b19, + 0x082b2b08, 0x2b2b2b2b, 0x082b2b2b, 0x2b2b2b2b, 0x2b190819, 0x2b2b2b2b, 0x2b2b2b2b, 0x2b2b2b2b +); +#enddecl(IQ2_XS_GRID) + +#decl(IQ2_S_GRID) +const iq2s_grid = array<u32, 2048>( + 0x08080808, 0x08080808, 0x0808082b, 0x08080808, 0x08081919, 0x08080808, 0x08082b08, 0x08080808, + 0x08082b2b, 0x08080808, 0x08190819, 0x08080808, 0x08191908, 0x08080808, 0x0819192b, 0x08080808, + 0x08192b19, 0x08080808, 0x082b0808, 0x08080808, 0x082b082b, 0x08080808, 0x082b1919, 0x08080808, + 0x082b2b08, 0x08080808, 0x19080819, 0x08080808, 0x19081908, 0x08080808, 0x1908192b, 0x08080808, + 0x19082b19, 0x08080808, 0x19190808, 0x08080808, 0x1919082b, 0x08080808, 0x19191919, 0x08080808, + 0x19192b08, 0x08080808, 0x192b0819, 0x08080808, 0x192b1908, 0x08080808, 0x192b192b, 0x08080808, + 0x192b2b19, 0x08080808, 0x2b080808, 0x08080808, 0x2b08082b, 0x08080808, 0x2b081919, 0x08080808, + 0x2b082b08, 0x08080808, 0x2b190819, 0x08080808, 0x2b191908, 0x08080808, 0x2b2b0808, 0x08080808, + 0x2b2b1919, 0x08080808, 0x2b2b2b2b, 0x08080808, 0x08080819, 0x08080819, 0x08081908, 0x08080819, + 0x0808192b, 0x08080819, 0x08082b19, 0x08080819, 0x08190808, 0x08080819, 0x0819082b, 0x08080819, + 0x08191919, 0x08080819, 0x08192b08, 0x08080819, 0x082b0819, 0x08080819, 0x082b1908, 0x08080819, + 0x19080808, 0x08080819, 0x1908082b, 0x08080819, 0x19081919, 0x08080819, 0x19082b08, 0x08080819, + 0x19190819, 0x08080819, 0x19191908, 0x08080819, 0x1919192b, 0x08080819, 0x19192b19, 0x08080819, + 0x192b0808, 0x08080819, 0x192b1919, 0x08080819, 0x192b2b08, 0x08080819, 0x2b080819, 0x08080819, + 0x2b081908, 0x08080819, 0x2b190808, 0x08080819, 0x2b19082b, 0x08080819, 0x2b191919, 0x08080819, + 0x2b2b0819, 0x08080819, 0x2b2b1908, 0x08080819, 0x08080808, 0x0808082b, 0x0808082b, 0x0808082b, + 0x08081919, 0x0808082b, 0x08082b08, 0x0808082b, 0x08190819, 0x0808082b, 0x08191908, 0x0808082b, + 0x082b0808, 0x0808082b, 0x082b2b2b, 0x0808082b, 0x19080819, 0x0808082b, 0x19081908, 0x0808082b, + 0x1908192b, 0x0808082b, 0x19082b19, 0x0808082b, 0x19190808, 0x0808082b, 0x19191919, 0x0808082b, + 0x2b080808, 0x0808082b, 0x2b081919, 0x0808082b, 0x2b082b2b, 0x0808082b, 0x2b191908, 0x0808082b, + 0x2b2b082b, 0x0808082b, 0x08080819, 0x08081908, 0x08081908, 0x08081908, 0x0808192b, 0x08081908, + 0x08082b19, 0x08081908, 0x08190808, 0x08081908, 0x0819082b, 0x08081908, 0x08191919, 0x08081908, + 0x08192b08, 0x08081908, 0x082b0819, 0x08081908, 0x082b1908, 0x08081908, 0x082b192b, 0x08081908, + 0x082b2b19, 0x08081908, 0x19080808, 0x08081908, 0x1908082b, 0x08081908, 0x19081919, 0x08081908, + 0x19082b08, 0x08081908, 0x19082b2b, 0x08081908, 0x19190819, 0x08081908, 0x19191908, 0x08081908, + 0x1919192b, 0x08081908, 0x19192b19, 0x08081908, 0x192b0808, 0x08081908, 0x192b082b, 0x08081908, + 0x192b1919, 0x08081908, 0x2b080819, 0x08081908, 0x2b081908, 0x08081908, 0x2b08192b, 0x08081908, + 0x2b082b19, 0x08081908, 0x2b190808, 0x08081908, 0x2b191919, 0x08081908, 0x2b192b08, 0x08081908, + 0x2b2b0819, 0x08081908, 0x2b2b1908, 0x08081908, 0x08080808, 0x08081919, 0x0808082b, 0x08081919, + 0x08081919, 0x08081919, 0x08082b08, 0x08081919, 0x08082b2b, 0x08081919, 0x08190819, 0x08081919, + 0x08191908, 0x08081919, 0x0819192b, 0x08081919, 0x08192b19, 0x08081919, 0x082b0808, 0x08081919, + 0x082b1919, 0x08081919, 0x082b2b08, 0x08081919, 0x19080819, 0x08081919, 0x19081908, 0x08081919, + 0x1908192b, 0x08081919, 0x19082b19, 0x08081919, 0x19190808, 0x08081919, 0x1919082b, 0x08081919, + 0x19191919, 0x08081919, 0x19192b08, 0x08081919, 0x192b0819, 0x08081919, 0x192b1908, 0x08081919, + 0x2b080808, 0x08081919, 0x2b08082b, 0x08081919, 0x2b081919, 0x08081919, 0x2b082b08, 0x08081919, + 0x2b190819, 0x08081919, 0x2b191908, 0x08081919, 0x2b2b0808, 0x08081919, 0x08080819, 0x0808192b, + 0x08081908, 0x0808192b, 0x0808192b, 0x0808192b, 0x08082b19, 0x0808192b, 0x08190808, 0x0808192b, + 0x08191919, 0x0808192b, 0x19080808, 0x0808192b, 0x19081919, 0x0808192b, 0x19082b08, 0x0808192b, + 0x19190819, 0x0808192b, 0x19191908, 0x0808192b, 0x192b0808, 0x0808192b, 0x2b080819, 0x0808192b, + 0x2b081908, 0x0808192b, 0x2b190808, 0x0808192b, 0x08080808, 0x08082b08, 0x0808082b, 0x08082b08, + 0x08081919, 0x08082b08, 0x08082b08, 0x08082b08, 0x08190819, 0x08082b08, 0x08191908, 0x08082b08, + 0x0819192b, 0x08082b08, 0x08192b19, 0x08082b08, 0x082b0808, 0x08082b08, 0x082b1919, 0x08082b08, + 0x082b2b2b, 0x08082b08, 0x19080819, 0x08082b08, 0x19081908, 0x08082b08, 0x1908192b, 0x08082b08, + 0x19082b19, 0x08082b08, 0x19190808, 0x08082b08, 0x1919082b, 0x08082b08, 0x19191919, 0x08082b08, + 0x19192b08, 0x08082b08, 0x192b0819, 0x08082b08, 0x192b1908, 0x08082b08, 0x2b080808, 0x08082b08, + 0x2b081919, 0x08082b08, 0x2b191908, 0x08082b08, 0x2b2b2b2b, 0x08082b08, 0x08080819, 0x08082b19, + 0x08081908, 0x08082b19, 0x08190808, 0x08082b19, 0x0819082b, 0x08082b19, 0x08191919, 0x08082b19, + 0x08192b08, 0x08082b19, 0x082b0819, 0x08082b19, 0x19080808, 0x08082b19, 0x19081919, 0x08082b19, + 0x19082b08, 0x08082b19, 0x19190819, 0x08082b19, 0x19191908, 0x08082b19, 0x192b0808, 0x08082b19, + 0x2b080819, 0x08082b19, 0x2b190808, 0x08082b19, 0x08080808, 0x08082b2b, 0x08190819, 0x08082b2b, + 0x08191908, 0x08082b2b, 0x082b082b, 0x08082b2b, 0x082b2b08, 0x08082b2b, 0x082b2b2b, 0x08082b2b, + 0x19190808, 0x08082b2b, 0x2b192b19, 0x08082b2b, 0x08080819, 0x08190808, 0x08081908, 0x08190808, + 0x0808192b, 0x08190808, 0x08082b19, 0x08190808, 0x08190808, 0x08190808, 0x0819082b, 0x08190808, + 0x08191919, 0x08190808, 0x08192b08, 0x08190808, 0x082b0819, 0x08190808, 0x082b1908, 0x08190808, + 0x082b192b, 0x08190808, 0x19080808, 0x08190808, 0x1908082b, 0x08190808, 0x19081919, 0x08190808, + 0x19082b08, 0x08190808, 0x19190819, 0x08190808, 0x19191908, 0x08190808, 0x1919192b, 0x08190808, + 0x19192b19, 0x08190808, 0x192b0808, 0x08190808, 0x192b082b, 0x08190808, 0x192b1919, 0x08190808, + 0x192b2b08, 0x08190808, 0x2b080819, 0x08190808, 0x2b081908, 0x08190808, 0x2b08192b, 0x08190808, + 0x2b190808, 0x08190808, 0x2b191919, 0x08190808, 0x2b192b08, 0x08190808, 0x2b2b0819, 0x08190808, + 0x2b2b1908, 0x08190808, 0x08080808, 0x08190819, 0x0808082b, 0x08190819, 0x08081919, 0x08190819, + 0x08082b08, 0x08190819, 0x08082b2b, 0x08190819, 0x08190819, 0x08190819, 0x08191908, 0x08190819, + 0x0819192b, 0x08190819, 0x08192b19, 0x08190819, 0x082b0808, 0x08190819, 0x082b082b, 0x08190819, + 0x082b1919, 0x08190819, 0x082b2b08, 0x08190819, 0x19080819, 0x08190819, 0x19081908, 0x08190819, + 0x1908192b, 0x08190819, 0x19082b19, 0x08190819, 0x19190808, 0x08190819, 0x1919082b, 0x08190819, + 0x19191919, 0x08190819, 0x19192b08, 0x08190819, 0x192b0819, 0x08190819, 0x192b1908, 0x08190819, + 0x2b080808, 0x08190819, 0x2b08082b, 0x08190819, 0x2b081919, 0x08190819, 0x2b082b08, 0x08190819, + 0x2b190819, 0x08190819, 0x2b191908, 0x08190819, 0x08080819, 0x0819082b, 0x08081908, 0x0819082b, + 0x08082b19, 0x0819082b, 0x08190808, 0x0819082b, 0x08191919, 0x0819082b, 0x082b0819, 0x0819082b, + 0x082b1908, 0x0819082b, 0x19080808, 0x0819082b, 0x19081919, 0x0819082b, 0x19190819, 0x0819082b, + 0x19191908, 0x0819082b, 0x2b080819, 0x0819082b, 0x2b081908, 0x0819082b, 0x2b190808, 0x0819082b, + 0x08080808, 0x08191908, 0x0808082b, 0x08191908, 0x08081919, 0x08191908, 0x08082b08, 0x08191908, + 0x08190819, 0x08191908, 0x08191908, 0x08191908, 0x0819192b, 0x08191908, 0x08192b19, 0x08191908, + 0x082b0808, 0x08191908, 0x082b1919, 0x08191908, 0x082b2b08, 0x08191908, 0x19080819, 0x08191908, + 0x19081908, 0x08191908, 0x1908192b, 0x08191908, 0x19082b19, 0x08191908, 0x19190808, 0x08191908, + 0x1919082b, 0x08191908, 0x19191919, 0x08191908, 0x19192b08, 0x08191908, 0x192b0819, 0x08191908, + 0x192b1908, 0x08191908, 0x2b080808, 0x08191908, 0x2b08082b, 0x08191908, 0x2b081919, 0x08191908, + 0x2b082b08, 0x08191908, 0x2b190819, 0x08191908, 0x2b191908, 0x08191908, 0x2b2b0808, 0x08191908, + 0x08080819, 0x08191919, 0x08081908, 0x08191919, 0x0808192b, 0x08191919, 0x08082b19, 0x08191919, + 0x08190808, 0x08191919, 0x0819082b, 0x08191919, 0x08191919, 0x08191919, 0x08192b08, 0x08191919, + 0x082b0819, 0x08191919, 0x082b1908, 0x08191919, 0x19080808, 0x08191919, 0x1908082b, 0x08191919, + 0x19081919, 0x08191919, 0x19082b08, 0x08191919, 0x19190819, 0x08191919, 0x19191908, 0x08191919, + 0x192b0808, 0x08191919, 0x2b080819, 0x08191919, 0x2b081908, 0x08191919, 0x2b190808, 0x08191919, + 0x08080808, 0x0819192b, 0x08081919, 0x0819192b, 0x08082b08, 0x0819192b, 0x08190819, 0x0819192b, + 0x08191908, 0x0819192b, 0x082b0808, 0x0819192b, 0x19080819, 0x0819192b, 0x19081908, 0x0819192b, + 0x19190808, 0x0819192b, 0x2b080808, 0x0819192b, 0x2b2b2b2b, 0x0819192b, 0x08080819, 0x08192b08, + 0x08081908, 0x08192b08, 0x0808192b, 0x08192b08, 0x08082b19, 0x08192b08, 0x08190808, 0x08192b08, + 0x08191919, 0x08192b08, 0x08192b08, 0x08192b08, 0x082b0819, 0x08192b08, 0x19080808, 0x08192b08, + 0x1908082b, 0x08192b08, 0x19081919, 0x08192b08, 0x19082b08, 0x08192b08, 0x19190819, 0x08192b08, + 0x19191908, 0x08192b08, 0x192b0808, 0x08192b08, 0x2b080819, 0x08192b08, 0x2b081908, 0x08192b08, + 0x08080808, 0x08192b19, 0x0808082b, 0x08192b19, 0x08081919, 0x08192b19, 0x08082b08, 0x08192b19, + 0x08190819, 0x08192b19, 0x08191908, 0x08192b19, 0x082b0808, 0x08192b19, 0x19080819, 0x08192b19, + 0x19081908, 0x08192b19, 0x19190808, 0x08192b19, 0x192b2b19, 0x08192b19, 0x2b2b082b, 0x08192b19, + 0x08081908, 0x08192b2b, 0x08190808, 0x08192b2b, 0x19080808, 0x08192b2b, 0x1919192b, 0x08192b2b, + 0x08080808, 0x082b0808, 0x0808082b, 0x082b0808, 0x08081919, 0x082b0808, 0x08082b08, 0x082b0808, + 0x08190819, 0x082b0808, 0x08191908, 0x082b0808, 0x0819192b, 0x082b0808, 0x08192b19, 0x082b0808, + 0x082b0808, 0x082b0808, 0x082b1919, 0x082b0808, 0x082b2b2b, 0x082b0808, 0x19080819, 0x082b0808, + 0x19081908, 0x082b0808, 0x19190808, 0x082b0808, 0x1919082b, 0x082b0808, 0x19191919, 0x082b0808, + 0x192b1908, 0x082b0808, 0x2b080808, 0x082b0808, 0x2b082b2b, 0x082b0808, 0x2b191908, 0x082b0808, + 0x2b2b2b2b, 0x082b0808, 0x08080819, 0x082b0819, 0x08081908, 0x082b0819, 0x08190808, 0x082b0819, + 0x0819082b, 0x082b0819, 0x08191919, 0x082b0819, 0x082b0819, 0x082b0819, 0x19080808, 0x082b0819, + 0x1908082b, 0x082b0819, 0x19081919, 0x082b0819, 0x19190819, 0x082b0819, 0x19191908, 0x082b0819, + 0x192b0808, 0x082b0819, 0x2b080819, 0x082b0819, 0x2b081908, 0x082b0819, 0x2b190808, 0x082b0819, + 0x08080808, 0x082b082b, 0x08082b2b, 0x082b082b, 0x082b082b, 0x082b082b, 0x082b2b08, 0x082b082b, + 0x082b2b2b, 0x082b082b, 0x19081908, 0x082b082b, 0x19190808, 0x082b082b, 0x2b082b08, 0x082b082b, + 0x2b082b2b, 0x082b082b, 0x2b2b2b08, 0x082b082b, 0x08080819, 0x082b1908, 0x08081908, 0x082b1908, + 0x0808192b, 0x082b1908, 0x08082b19, 0x082b1908, 0x08190808, 0x082b1908, 0x08191919, 0x082b1908, + 0x08192b08, 0x082b1908, 0x082b0819, 0x082b1908, 0x082b1908, 0x082b1908, 0x19080808, 0x082b1908, + 0x1908082b, 0x082b1908, 0x19081919, 0x082b1908, 0x19082b08, 0x082b1908, 0x19190819, 0x082b1908, + 0x19191908, 0x082b1908, 0x192b0808, 0x082b1908, 0x2b080819, 0x082b1908, 0x2b081908, 0x082b1908, + 0x2b190808, 0x082b1908, 0x08080808, 0x082b1919, 0x08081919, 0x082b1919, 0x08082b08, 0x082b1919, + 0x08190819, 0x082b1919, 0x08191908, 0x082b1919, 0x082b0808, 0x082b1919, 0x19080819, 0x082b1919, + 0x19081908, 0x082b1919, 0x19190808, 0x082b1919, 0x192b192b, 0x082b1919, 0x2b080808, 0x082b1919, + 0x08080819, 0x082b192b, 0x08081908, 0x082b192b, 0x08190808, 0x082b192b, 0x19080808, 0x082b192b, + 0x19192b19, 0x082b192b, 0x08080808, 0x082b2b08, 0x08081919, 0x082b2b08, 0x08190819, 0x082b2b08, + 0x08191908, 0x082b2b08, 0x19080819, 0x082b2b08, 0x19081908, 0x082b2b08, 0x19190808, 0x082b2b08, + 0x2b082b2b, 0x082b2b08, 0x2b2b2b2b, 0x082b2b08, 0x08080819, 0x082b2b19, 0x08081908, 0x082b2b19, + 0x08190808, 0x082b2b19, 0x2b191919, 0x082b2b19, 0x08082b2b, 0x082b2b2b, 0x082b082b, 0x082b2b2b, + 0x192b1908, 0x082b2b2b, 0x2b082b08, 0x082b2b2b, 0x2b082b2b, 0x082b2b2b, 0x08080819, 0x19080808, + 0x08081908, 0x19080808, 0x0808192b, 0x19080808, 0x08082b19, 0x19080808, 0x08190808, 0x19080808, + 0x0819082b, 0x19080808, 0x08191919, 0x19080808, 0x08192b08, 0x19080808, 0x08192b2b, 0x19080808, + 0x082b0819, 0x19080808, 0x082b1908, 0x19080808, 0x082b192b, 0x19080808, 0x19080808, 0x19080808, + 0x1908082b, 0x19080808, 0x19081919, 0x19080808, 0x19082b08, 0x19080808, 0x19082b2b, 0x19080808, + 0x19190819, 0x19080808, 0x19191908, 0x19080808, 0x1919192b, 0x19080808, 0x19192b19, 0x19080808, + 0x192b0808, 0x19080808, 0x192b082b, 0x19080808, 0x192b1919, 0x19080808, 0x2b080819, 0x19080808, + 0x2b081908, 0x19080808, 0x2b190808, 0x19080808, 0x2b191919, 0x19080808, 0x2b192b08, 0x19080808, + 0x2b2b0819, 0x19080808, 0x2b2b1908, 0x19080808, 0x08080808, 0x19080819, 0x0808082b, 0x19080819, + 0x08081919, 0x19080819, 0x08082b08, 0x19080819, 0x08190819, 0x19080819, 0x08191908, 0x19080819, + 0x0819192b, 0x19080819, 0x08192b19, 0x19080819, 0x082b0808, 0x19080819, 0x082b082b, 0x19080819, + 0x082b1919, 0x19080819, 0x19080819, 0x19080819, 0x19081908, 0x19080819, 0x1908192b, 0x19080819, + 0x19082b19, 0x19080819, 0x19190808, 0x19080819, 0x1919082b, 0x19080819, 0x19191919, 0x19080819, + 0x19192b08, 0x19080819, 0x192b0819, 0x19080819, 0x192b1908, 0x19080819, 0x2b080808, 0x19080819, + 0x2b08082b, 0x19080819, 0x2b081919, 0x19080819, 0x2b082b08, 0x19080819, 0x2b190819, 0x19080819, + 0x2b191908, 0x19080819, 0x2b2b0808, 0x19080819, 0x08080819, 0x1908082b, 0x08081908, 0x1908082b, + 0x08190808, 0x1908082b, 0x0819082b, 0x1908082b, 0x08191919, 0x1908082b, 0x08192b08, 0x1908082b, + 0x082b1908, 0x1908082b, 0x19080808, 0x1908082b, 0x19081919, 0x1908082b, 0x19082b08, 0x1908082b, + 0x19190819, 0x1908082b, 0x19191908, 0x1908082b, 0x192b0808, 0x1908082b, 0x2b080819, 0x1908082b, + 0x2b081908, 0x1908082b, 0x08080808, 0x19081908, 0x0808082b, 0x19081908, 0x08081919, 0x19081908, + 0x08082b08, 0x19081908, 0x08082b2b, 0x19081908, 0x08190819, 0x19081908, 0x08191908, 0x19081908, + 0x0819192b, 0x19081908, 0x08192b19, 0x19081908, 0x082b0808, 0x19081908, 0x082b082b, 0x19081908, + 0x082b1919, 0x19081908, 0x082b2b08, 0x19081908, 0x19080819, 0x19081908, 0x19081908, 0x19081908, + 0x1908192b, 0x19081908, 0x19082b19, 0x19081908, 0x19190808, 0x19081908, 0x1919082b, 0x19081908, + 0x19191919, 0x19081908, 0x19192b08, 0x19081908, 0x192b0819, 0x19081908, 0x192b1908, 0x19081908, + 0x2b080808, 0x19081908, 0x2b08082b, 0x19081908, 0x2b081919, 0x19081908, 0x2b082b08, 0x19081908, + 0x2b190819, 0x19081908, 0x2b191908, 0x19081908, 0x2b2b0808, 0x19081908, 0x08080819, 0x19081919, + 0x08081908, 0x19081919, 0x0808192b, 0x19081919, 0x08082b19, 0x19081919, 0x08190808, 0x19081919, + 0x0819082b, 0x19081919, 0x08191919, 0x19081919, 0x08192b08, 0x19081919, 0x082b0819, 0x19081919, + 0x082b1908, 0x19081919, 0x19080808, 0x19081919, 0x1908082b, 0x19081919, 0x19081919, 0x19081919, + 0x19082b08, 0x19081919, 0x19190819, 0x19081919, 0x19191908, 0x19081919, 0x192b0808, 0x19081919, + 0x192b2b2b, 0x19081919, 0x2b080819, 0x19081919, 0x2b081908, 0x19081919, 0x2b190808, 0x19081919, + 0x08080808, 0x1908192b, 0x0808082b, 0x1908192b, 0x08081919, 0x1908192b, 0x08082b08, 0x1908192b, + 0x08190819, 0x1908192b, 0x08191908, 0x1908192b, 0x082b0808, 0x1908192b, 0x19080819, 0x1908192b, + 0x19081908, 0x1908192b, 0x19190808, 0x1908192b, 0x2b080808, 0x1908192b, 0x2b2b1919, 0x1908192b, + 0x08080819, 0x19082b08, 0x08081908, 0x19082b08, 0x08082b19, 0x19082b08, 0x08190808, 0x19082b08, + 0x0819082b, 0x19082b08, 0x08191919, 0x19082b08, 0x08192b08, 0x19082b08, 0x082b0819, 0x19082b08, + 0x082b1908, 0x19082b08, 0x19080808, 0x19082b08, 0x1908082b, 0x19082b08, 0x19081919, 0x19082b08, + 0x19082b08, 0x19082b08, 0x19190819, 0x19082b08, 0x19191908, 0x19082b08, 0x192b0808, 0x19082b08, + 0x2b081908, 0x19082b08, 0x2b190808, 0x19082b08, 0x08080808, 0x19082b19, 0x0808082b, 0x19082b19, + 0x08081919, 0x19082b19, 0x08082b08, 0x19082b19, 0x08190819, 0x19082b19, 0x08191908, 0x19082b19, + 0x082b0808, 0x19082b19, 0x19080819, 0x19082b19, 0x19081908, 0x19082b19, 0x19190808, 0x19082b19, + 0x2b080808, 0x19082b19, 0x2b19192b, 0x19082b19, 0x08080819, 0x19082b2b, 0x08081908, 0x19082b2b, + 0x08190808, 0x19082b2b, 0x19080808, 0x19082b2b, 0x08080808, 0x19190808, 0x0808082b, 0x19190808, + 0x08081919, 0x19190808, 0x08082b08, 0x19190808, 0x08190819, 0x19190808, 0x08191908, 0x19190808, + 0x0819192b, 0x19190808, 0x08192b19, 0x19190808, 0x082b0808, 0x19190808, 0x082b082b, 0x19190808, + 0x082b1919, 0x19190808, 0x082b2b08, 0x19190808, 0x19080819, 0x19190808, 0x19081908, 0x19190808, + 0x1908192b, 0x19190808, 0x19082b19, 0x19190808, 0x19190808, 0x19190808, 0x1919082b, 0x19190808, + 0x19191919, 0x19190808, 0x19192b08, 0x19190808, 0x192b0819, 0x19190808, 0x192b1908, 0x19190808, + 0x2b080808, 0x19190808, 0x2b08082b, 0x19190808, 0x2b081919, 0x19190808, 0x2b082b08, 0x19190808, + 0x2b190819, 0x19190808, 0x2b191908, 0x19190808, 0x08080819, 0x19190819, 0x08081908, 0x19190819, + 0x0808192b, 0x19190819, 0x08082b19, 0x19190819, 0x08190808, 0x19190819, 0x0819082b, 0x19190819, + 0x08191919, 0x19190819, 0x08192b08, 0x19190819, 0x082b0819, 0x19190819, 0x082b1908, 0x19190819, + 0x19080808, 0x19190819, 0x1908082b, 0x19190819, 0x19081919, 0x19190819, 0x19082b08, 0x19190819, + 0x19190819, 0x19190819, 0x19191908, 0x19190819, 0x192b0808, 0x19190819, 0x2b080819, 0x19190819, + 0x2b081908, 0x19190819, 0x2b190808, 0x19190819, 0x08080808, 0x1919082b, 0x08081919, 0x1919082b, + 0x08082b08, 0x1919082b, 0x08190819, 0x1919082b, 0x08191908, 0x1919082b, 0x082b0808, 0x1919082b, + 0x19080819, 0x1919082b, 0x19081908, 0x1919082b, 0x19190808, 0x1919082b, 0x192b2b19, 0x1919082b, + 0x2b080808, 0x1919082b, 0x08080819, 0x19191908, 0x08081908, 0x19191908, 0x0808192b, 0x19191908, + 0x08082b19, 0x19191908, 0x08190808, 0x19191908, 0x0819082b, 0x19191908, 0x08191919, 0x19191908, + 0x08192b08, 0x19191908, 0x082b0819, 0x19191908, 0x082b1908, 0x19191908, 0x19080808, 0x19191908, + 0x1908082b, 0x19191908, 0x19081919, 0x19191908, 0x19082b08, 0x19191908, 0x19190819, 0x19191908, + 0x19191908, 0x19191908, 0x192b0808, 0x19191908, 0x2b080819, 0x19191908, 0x2b081908, 0x19191908, + 0x2b190808, 0x19191908, 0x08080808, 0x19191919, 0x0808082b, 0x19191919, 0x08081919, 0x19191919, + 0x08082b08, 0x19191919, 0x08190819, 0x19191919, 0x08191908, 0x19191919, 0x082b0808, 0x19191919, + 0x19080819, 0x19191919, 0x19081908, 0x19191919, 0x19190808, 0x19191919, 0x2b080808, 0x19191919, + 0x08080819, 0x1919192b, 0x08081908, 0x1919192b, 0x08190808, 0x1919192b, 0x082b192b, 0x1919192b, + 0x19080808, 0x1919192b, 0x08080808, 0x19192b08, 0x0808082b, 0x19192b08, 0x08081919, 0x19192b08, + 0x08082b08, 0x19192b08, 0x08190819, 0x19192b08, 0x08191908, 0x19192b08, 0x082b0808, 0x19192b08, + 0x19080819, 0x19192b08, 0x19081908, 0x19192b08, 0x19190808, 0x19192b08, 0x19192b2b, 0x19192b08, + 0x2b080808, 0x19192b08, 0x08080819, 0x19192b19, 0x08081908, 0x19192b19, 0x08190808, 0x19192b19, + 0x19080808, 0x19192b19, 0x08080808, 0x19192b2b, 0x08192b19, 0x19192b2b, 0x2b081919, 0x19192b2b, + 0x2b2b2b08, 0x19192b2b, 0x08080819, 0x192b0808, 0x08081908, 0x192b0808, 0x0808192b, 0x192b0808, + 0x08190808, 0x192b0808, 0x0819082b, 0x192b0808, 0x08191919, 0x192b0808, 0x08192b08, 0x192b0808, + 0x082b0819, 0x192b0808, 0x082b1908, 0x192b0808, 0x19080808, 0x192b0808, 0x19081919, 0x192b0808, + 0x19082b08, 0x192b0808, 0x19190819, 0x192b0808, 0x19191908, 0x192b0808, 0x192b0808, 0x192b0808, + 0x2b081908, 0x192b0808, 0x2b190808, 0x192b0808, 0x08080808, 0x192b0819, 0x0808082b, 0x192b0819, + 0x08081919, 0x192b0819, 0x08082b08, 0x192b0819, 0x08190819, 0x192b0819, 0x08191908, 0x192b0819, + 0x082b0808, 0x192b0819, 0x19080819, 0x192b0819, 0x19081908, 0x192b0819, 0x19190808, 0x192b0819, + 0x2b080808, 0x192b0819, 0x2b192b19, 0x192b0819, 0x08081908, 0x192b082b, 0x08190808, 0x192b082b, + 0x19080808, 0x192b082b, 0x1919192b, 0x192b082b, 0x2b2b0819, 0x192b082b, 0x08080808, 0x192b1908, + 0x08081919, 0x192b1908, 0x08082b08, 0x192b1908, 0x08190819, 0x192b1908, 0x08191908, 0x192b1908, + 0x082b0808, 0x192b1908, 0x19080819, 0x192b1908, 0x19081908, 0x192b1908, 0x19190808, 0x192b1908, + 0x2b080808, 0x192b1908, 0x08080819, 0x192b1919, 0x08081908, 0x192b1919, 0x08190808, 0x192b1919, + 0x19080808, 0x192b1919, 0x19082b2b, 0x192b1919, 0x192b2b08, 0x192b1919, 0x2b19082b, 0x192b1919, + 0x08080808, 0x192b192b, 0x2b191908, 0x192b192b, 0x08080819, 0x192b2b08, 0x08081908, 0x192b2b08, + 0x08190808, 0x192b2b08, 0x192b1919, 0x192b2b08, 0x2b192b08, 0x192b2b08, 0x08080808, 0x192b2b19, + 0x082b2b2b, 0x192b2b19, 0x1908082b, 0x192b2b2b, 0x2b2b0819, 0x192b2b2b, 0x08080808, 0x2b080808, + 0x0808082b, 0x2b080808, 0x08081919, 0x2b080808, 0x08082b08, 0x2b080808, 0x08190819, 0x2b080808, + 0x08191908, 0x2b080808, 0x08192b19, 0x2b080808, 0x082b0808, 0x2b080808, 0x082b1919, 0x2b080808, + 0x19080819, 0x2b080808, 0x19081908, 0x2b080808, 0x19190808, 0x2b080808, 0x1919082b, 0x2b080808, + 0x19191919, 0x2b080808, 0x19192b08, 0x2b080808, 0x192b0819, 0x2b080808, 0x2b080808, 0x2b080808, + 0x2b081919, 0x2b080808, 0x2b190819, 0x2b080808, 0x2b191908, 0x2b080808, 0x08080819, 0x2b080819, + 0x08081908, 0x2b080819, 0x08082b19, 0x2b080819, 0x08190808, 0x2b080819, 0x0819082b, 0x2b080819, + 0x08191919, 0x2b080819, 0x08192b08, 0x2b080819, 0x082b0819, 0x2b080819, 0x082b1908, 0x2b080819, + 0x19080808, 0x2b080819, 0x1908082b, 0x2b080819, 0x19081919, 0x2b080819, 0x19082b08, 0x2b080819, + 0x19190819, 0x2b080819, 0x19191908, 0x2b080819, 0x2b080819, 0x2b080819, 0x2b081908, 0x2b080819, + 0x2b190808, 0x2b080819, 0x2b2b2b19, 0x2b080819, 0x08080808, 0x2b08082b, 0x08081919, 0x2b08082b, + 0x08082b2b, 0x2b08082b, 0x08190819, 0x2b08082b, 0x08191908, 0x2b08082b, 0x19080819, 0x2b08082b, + 0x19081908, 0x2b08082b, 0x19190808, 0x2b08082b, 0x08080819, 0x2b081908, 0x08081908, 0x2b081908, + 0x0808192b, 0x2b081908, 0x08082b19, 0x2b081908, 0x08190808, 0x2b081908, 0x0819082b, 0x2b081908, + 0x08191919, 0x2b081908, 0x08192b08, 0x2b081908, 0x082b0819, 0x2b081908, 0x19080808, 0x2b081908, + 0x1908082b, 0x2b081908, 0x19081919, 0x2b081908, 0x19082b08, 0x2b081908, 0x19190819, 0x2b081908, + 0x19191908, 0x2b081908, 0x192b0808, 0x2b081908, 0x2b080819, 0x2b081908, 0x2b081908, 0x2b081908, + 0x2b190808, 0x2b081908, 0x08080808, 0x2b081919, 0x0808082b, 0x2b081919, 0x08081919, 0x2b081919, + 0x08082b08, 0x2b081919, 0x08190819, 0x2b081919, 0x08191908, 0x2b081919, 0x082b0808, 0x2b081919, + 0x19080819, 0x2b081919, 0x19081908, 0x2b081919, 0x19190808, 0x2b081919, 0x2b080808, 0x2b081919, + 0x2b082b2b, 0x2b081919, 0x08080819, 0x2b08192b, 0x08081908, 0x2b08192b, 0x08190808, 0x2b08192b, + 0x082b2b19, 0x2b08192b, 0x19080808, 0x2b08192b, 0x08080808, 0x2b082b08, 0x08081919, 0x2b082b08, + 0x08190819, 0x2b082b08, 0x08191908, 0x2b082b08, 0x19080819, 0x2b082b08, 0x19081908, 0x2b082b08, + 0x19190808, 0x2b082b08, 0x2b2b082b, 0x2b082b08, 0x08080819, 0x2b082b19, 0x08081908, 0x2b082b19, + 0x19080808, 0x2b082b19, 0x192b1919, 0x2b082b19, 0x082b082b, 0x2b082b2b, 0x19192b08, 0x2b082b2b, + 0x19192b2b, 0x2b082b2b, 0x2b08082b, 0x2b082b2b, 0x2b2b082b, 0x2b082b2b, 0x08080819, 0x2b190808, + 0x08081908, 0x2b190808, 0x08082b19, 0x2b190808, 0x08190808, 0x2b190808, 0x0819082b, 0x2b190808, + 0x08191919, 0x2b190808, 0x08192b08, 0x2b190808, 0x082b1908, 0x2b190808, 0x19080808, 0x2b190808, + 0x1908082b, 0x2b190808, 0x19081919, 0x2b190808, 0x19082b08, 0x2b190808, 0x19190819, 0x2b190808, + 0x19191908, 0x2b190808, 0x192b0808, 0x2b190808, 0x2b080819, 0x2b190808, 0x2b081908, 0x2b190808, + 0x2b190808, 0x2b190808, 0x08080808, 0x2b190819, 0x08081919, 0x2b190819, 0x08190819, 0x2b190819, + 0x08191908, 0x2b190819, 0x19080819, 0x2b190819, 0x19081908, 0x2b190819, 0x19190808, 0x2b190819, + 0x19192b2b, 0x2b190819, 0x08080819, 0x2b19082b, 0x08081908, 0x2b19082b, 0x08190808, 0x2b19082b, + 0x19080808, 0x2b19082b, 0x2b2b192b, 0x2b19082b, 0x08080808, 0x2b191908, 0x0808082b, 0x2b191908, + 0x08081919, 0x2b191908, 0x08082b08, 0x2b191908, 0x08190819, 0x2b191908, 0x08191908, 0x2b191908, + 0x082b0808, 0x2b191908, 0x19080819, 0x2b191908, 0x19081908, 0x2b191908, 0x19190808, 0x2b191908, + 0x2b080808, 0x2b191908, 0x2b19192b, 0x2b191908, 0x08080819, 0x2b191919, 0x08081908, 0x2b191919, + 0x08190808, 0x2b191919, 0x19080808, 0x2b191919, 0x2b192b08, 0x2b191919, 0x2b2b0819, 0x2b191919, + 0x08080808, 0x2b19192b, 0x1908192b, 0x2b19192b, 0x192b1908, 0x2b19192b, 0x08080819, 0x2b192b08, + 0x08081908, 0x2b192b08, 0x08190808, 0x2b192b08, 0x082b192b, 0x2b192b08, 0x19080808, 0x2b192b08, + 0x2b2b2b19, 0x2b192b08, 0x08080808, 0x2b192b19, 0x19082b19, 0x2b192b19, 0x1919082b, 0x2b192b19, + 0x2b190808, 0x2b192b2b, 0x08080808, 0x2b2b0808, 0x08081919, 0x2b2b0808, 0x08082b2b, 0x2b2b0808, + 0x08191908, 0x2b2b0808, 0x082b082b, 0x2b2b0808, 0x082b2b2b, 0x2b2b0808, 0x19080819, 0x2b2b0808, + 0x19081908, 0x2b2b0808, 0x19190808, 0x2b2b0808, 0x2b2b082b, 0x2b2b0808, 0x2b2b2b2b, 0x2b2b0808, + 0x19080808, 0x2b2b0819, 0x192b1919, 0x2b2b0819, 0x0808082b, 0x2b2b082b, 0x08082b2b, 0x2b2b082b, + 0x082b082b, 0x2b2b082b, 0x082b2b08, 0x2b2b082b, 0x082b2b2b, 0x2b2b082b, 0x2b08082b, 0x2b2b082b, + 0x2b082b08, 0x2b2b082b, 0x2b082b2b, 0x2b2b082b, 0x2b2b2b08, 0x2b2b082b, 0x08080819, 0x2b2b1908, + 0x08081908, 0x2b2b1908, 0x08190808, 0x2b2b1908, 0x19080808, 0x2b2b1908, 0x2b082b19, 0x2b2b1908, + 0x2b2b1908, 0x2b2b1908, 0x08080808, 0x2b2b1919, 0x08192b19, 0x2b2b1919, 0x19190819, 0x2b2b192b, + 0x08082b2b, 0x2b2b2b08, 0x082b2b08, 0x2b2b2b08, 0x2b2b082b, 0x2b2b2b08, 0x19191908, 0x2b2b2b19, + 0x2b08192b, 0x2b2b2b19, 0x08082b08, 0x2b2b2b2b, 0x08082b2b, 0x2b2b2b2b, 0x082b0808, 0x2b2b2b2b, + 0x082b082b, 0x2b2b2b2b, 0x082b2b08, 0x2b2b2b2b, 0x2b082b08, 0x2b2b2b2b, 0x2b2b2b2b, 0x2b2b2b2b +); +#enddecl(IQ2_S_GRID) + +#decl(IQ3_XSS_GRID) + +const iq3xxs_grid = array<u32, 256>( + 0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414, + 0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14, + 0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404, + 0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e, + 0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c, + 0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c, + 0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34, + 0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c, + 0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c, + 0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04, + 0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c, + 0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414, + 0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434, + 0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c, + 0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e, + 0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24, + 0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24, + 0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c, + 0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c, + 0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14, + 0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414, + 0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e, + 0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404, + 0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c, + 0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c, + 0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14, + 0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c, + 0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c, + 0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14, + 0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14, + 0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c, + 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04 +); +#enddecl(IQ3_XSS_GRID) + +#decl(IQ3_S_GRID) + +const iq3s_grid = array<u32, 512>( + 0x01010101, 0x01010103, 0x01010105, 0x0101010b, 0x0101010f, 0x01010301, 0x01010303, 0x01010305, + 0x01010309, 0x0101030d, 0x01010501, 0x01010503, 0x0101050b, 0x01010707, 0x01010901, 0x01010905, + 0x0101090b, 0x0101090f, 0x01010b03, 0x01010b07, 0x01010d01, 0x01010d05, 0x01010f03, 0x01010f09, + 0x01010f0f, 0x01030101, 0x01030103, 0x01030105, 0x01030109, 0x01030301, 0x01030303, 0x0103030b, + 0x01030501, 0x01030507, 0x0103050f, 0x01030703, 0x0103070b, 0x01030909, 0x01030d03, 0x01030d0b, + 0x01030f05, 0x01050101, 0x01050103, 0x0105010b, 0x0105010f, 0x01050301, 0x01050307, 0x0105030d, + 0x01050503, 0x0105050b, 0x01050701, 0x01050709, 0x01050905, 0x0105090b, 0x0105090f, 0x01050b03, + 0x01050b07, 0x01050f01, 0x01050f07, 0x01070107, 0x01070303, 0x0107030b, 0x01070501, 0x01070505, + 0x01070703, 0x01070707, 0x0107070d, 0x01070909, 0x01070b01, 0x01070b05, 0x01070d0f, 0x01070f03, + 0x01070f0b, 0x01090101, 0x01090307, 0x0109030f, 0x01090503, 0x01090509, 0x01090705, 0x01090901, + 0x01090907, 0x01090b03, 0x01090f01, 0x010b0105, 0x010b0109, 0x010b0501, 0x010b0505, 0x010b050d, + 0x010b0707, 0x010b0903, 0x010b090b, 0x010b090f, 0x010b0d0d, 0x010b0f07, 0x010d010d, 0x010d0303, + 0x010d0307, 0x010d0703, 0x010d0b05, 0x010d0f03, 0x010f0101, 0x010f0105, 0x010f0109, 0x010f0501, + 0x010f0505, 0x010f050d, 0x010f0707, 0x010f0b01, 0x010f0b09, 0x03010101, 0x03010103, 0x03010105, + 0x03010109, 0x03010301, 0x03010303, 0x03010307, 0x0301030b, 0x0301030f, 0x03010501, 0x03010505, + 0x03010703, 0x03010709, 0x0301070d, 0x03010b09, 0x03010b0d, 0x03010d03, 0x03010f05, 0x03030101, + 0x03030103, 0x03030107, 0x0303010d, 0x03030301, 0x03030309, 0x03030503, 0x03030701, 0x03030707, + 0x03030903, 0x03030b01, 0x03030b05, 0x03030f01, 0x03030f0d, 0x03050101, 0x03050305, 0x0305030b, + 0x0305030f, 0x03050501, 0x03050509, 0x03050705, 0x03050901, 0x03050907, 0x03050b0b, 0x03050d01, + 0x03050f05, 0x03070103, 0x03070109, 0x0307010f, 0x03070301, 0x03070307, 0x03070503, 0x0307050f, + 0x03070701, 0x03070709, 0x03070903, 0x03070d05, 0x03070f01, 0x03090107, 0x0309010b, 0x03090305, + 0x03090309, 0x03090703, 0x03090707, 0x03090905, 0x0309090d, 0x03090b01, 0x03090b09, 0x030b0103, + 0x030b0301, 0x030b0307, 0x030b0503, 0x030b0701, 0x030b0705, 0x030b0b03, 0x030d0501, 0x030d0509, + 0x030d050f, 0x030d0909, 0x030d090d, 0x030f0103, 0x030f0107, 0x030f0301, 0x030f0305, 0x030f0503, + 0x030f070b, 0x030f0903, 0x030f0d05, 0x030f0f01, 0x05010101, 0x05010103, 0x05010107, 0x0501010b, + 0x0501010f, 0x05010301, 0x05010305, 0x05010309, 0x0501030d, 0x05010503, 0x05010507, 0x0501050f, + 0x05010701, 0x05010705, 0x05010903, 0x05010907, 0x0501090b, 0x05010b01, 0x05010b05, 0x05010d0f, + 0x05010f01, 0x05010f07, 0x05010f0b, 0x05030101, 0x05030105, 0x05030301, 0x05030307, 0x0503030f, + 0x05030505, 0x0503050b, 0x05030703, 0x05030709, 0x05030905, 0x05030b03, 0x05050103, 0x05050109, + 0x0505010f, 0x05050503, 0x05050507, 0x05050701, 0x0505070f, 0x05050903, 0x05050b07, 0x05050b0f, + 0x05050f03, 0x05050f09, 0x05070101, 0x05070105, 0x0507010b, 0x05070303, 0x05070505, 0x05070509, + 0x05070703, 0x05070707, 0x05070905, 0x05070b01, 0x05070d0d, 0x05090103, 0x0509010f, 0x05090501, + 0x05090507, 0x05090705, 0x0509070b, 0x05090903, 0x05090f05, 0x05090f0b, 0x050b0109, 0x050b0303, + 0x050b0505, 0x050b070f, 0x050b0901, 0x050b0b07, 0x050b0f01, 0x050d0101, 0x050d0105, 0x050d010f, + 0x050d0503, 0x050d0b0b, 0x050d0d03, 0x050f010b, 0x050f0303, 0x050f050d, 0x050f0701, 0x050f0907, + 0x050f0b01, 0x07010105, 0x07010303, 0x07010307, 0x0701030b, 0x0701030f, 0x07010505, 0x07010703, + 0x07010707, 0x0701070b, 0x07010905, 0x07010909, 0x0701090f, 0x07010b03, 0x07010d07, 0x07010f03, + 0x07030103, 0x07030107, 0x0703010b, 0x07030309, 0x07030503, 0x07030507, 0x07030901, 0x07030d01, + 0x07030f05, 0x07030f0d, 0x07050101, 0x07050305, 0x07050501, 0x07050705, 0x07050709, 0x07050b01, + 0x07070103, 0x07070301, 0x07070309, 0x07070503, 0x07070507, 0x0707050f, 0x07070701, 0x07070903, + 0x07070907, 0x0707090f, 0x07070b0b, 0x07070f07, 0x07090107, 0x07090303, 0x0709030d, 0x07090505, + 0x07090703, 0x07090b05, 0x07090d01, 0x07090d09, 0x070b0103, 0x070b0301, 0x070b0305, 0x070b050b, + 0x070b0705, 0x070b0909, 0x070b0b0d, 0x070b0f07, 0x070d030d, 0x070d0903, 0x070f0103, 0x070f0107, + 0x070f0501, 0x070f0505, 0x070f070b, 0x09010101, 0x09010109, 0x09010305, 0x09010501, 0x09010509, + 0x0901050f, 0x09010705, 0x09010903, 0x09010b01, 0x09010f01, 0x09030105, 0x0903010f, 0x09030303, + 0x09030307, 0x09030505, 0x09030701, 0x0903070b, 0x09030907, 0x09030b03, 0x09030b0b, 0x09050103, + 0x09050107, 0x09050301, 0x0905030b, 0x09050503, 0x09050707, 0x09050901, 0x09050b0f, 0x09050d05, + 0x09050f01, 0x09070109, 0x09070303, 0x09070307, 0x09070501, 0x09070505, 0x09070703, 0x0907070b, + 0x09090101, 0x09090105, 0x09090509, 0x0909070f, 0x09090901, 0x09090f03, 0x090b010b, 0x090b010f, + 0x090b0503, 0x090b0d05, 0x090d0307, 0x090d0709, 0x090d0d01, 0x090f0301, 0x090f030b, 0x090f0701, + 0x090f0907, 0x090f0b03, 0x0b010105, 0x0b010301, 0x0b010309, 0x0b010505, 0x0b010901, 0x0b010909, + 0x0b01090f, 0x0b010b05, 0x0b010d0d, 0x0b010f09, 0x0b030103, 0x0b030107, 0x0b03010b, 0x0b030305, + 0x0b030503, 0x0b030705, 0x0b030f05, 0x0b050101, 0x0b050303, 0x0b050507, 0x0b050701, 0x0b05070d, + 0x0b050b07, 0x0b070105, 0x0b07010f, 0x0b070301, 0x0b07050f, 0x0b070909, 0x0b070b03, 0x0b070d0b, + 0x0b070f07, 0x0b090103, 0x0b090109, 0x0b090501, 0x0b090705, 0x0b09090d, 0x0b0b0305, 0x0b0b050d, + 0x0b0b0b03, 0x0b0b0b07, 0x0b0d0905, 0x0b0f0105, 0x0b0f0109, 0x0b0f0505, 0x0d010303, 0x0d010307, + 0x0d01030b, 0x0d010703, 0x0d010707, 0x0d010d01, 0x0d030101, 0x0d030501, 0x0d03050f, 0x0d030d09, + 0x0d050305, 0x0d050709, 0x0d050905, 0x0d050b0b, 0x0d050d05, 0x0d050f01, 0x0d070101, 0x0d070309, + 0x0d070503, 0x0d070901, 0x0d09050b, 0x0d090907, 0x0d090d05, 0x0d0b0101, 0x0d0b0107, 0x0d0b0709, + 0x0d0b0d01, 0x0d0d010b, 0x0d0d0901, 0x0d0f0303, 0x0d0f0307, 0x0f010101, 0x0f010109, 0x0f01010f, + 0x0f010501, 0x0f010505, 0x0f01070d, 0x0f010901, 0x0f010b09, 0x0f010d05, 0x0f030105, 0x0f030303, + 0x0f030509, 0x0f030907, 0x0f03090b, 0x0f050103, 0x0f050109, 0x0f050301, 0x0f05030d, 0x0f050503, + 0x0f050701, 0x0f050b03, 0x0f070105, 0x0f070705, 0x0f07070b, 0x0f070b07, 0x0f090103, 0x0f09010b, + 0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101 +); +#enddecl(IQ3_S_GRID) + +#decl(IQ1_GRID) + +const IQ1_DELTA: f32 = 0.125; + +const iq1_grid = array<u32, 1024>( + 0xfffdffff, 0xfff7fff0, 0xffccfff5, 0xffdfffc0, 0xffd7ffdd, 0xff30ffd5, 0xff03ff0c, 0xff10ff01, + 0xff7dff7f, 0xff75ff77, 0xff5fff40, 0xff57ff5d, 0xfcf3ff55, 0xfcccfcf0, 0xfcc1fcc3, 0xfcc5fcc4, + 0xfc3cfcd0, 0xfc34fc31, 0xfc00fc0d, 0xfc1cfc05, 0xfc11fc13, 0xfc70fc17, 0xfc43fc4c, 0xfc50fc41, + 0xfdfdfdff, 0xfdf5fdf7, 0xfddffdc0, 0xfdd7fddd, 0xfd30fdd5, 0xfd04fd0c, 0xfd14fd13, 0xfd7dfd7f, + 0xfd75fd77, 0xfd40fd4c, 0xfd5ffd44, 0xfd57fd5d, 0xf3ccfd55, 0xf3c1f3c3, 0xf33cf3d0, 0xf300f334, + 0xf313f305, 0xf34cf310, 0xf350f344, 0xf0f3f0fc, 0xf0f1f0f0, 0xf0c7f0c0, 0xf0d4f0c5, 0xf030f03f, + 0xf00ff035, 0xf003f00c, 0xf001f000, 0xf01ff004, 0xf010f01d, 0xf015f017, 0xf04cf07c, 0xf047f040, + 0xf05cf045, 0xf050f053, 0xf054f051, 0xf1c4f1c3, 0xf133f13c, 0xf10df10f, 0xf107f100, 0xf11cf11f, + 0xf114f111, 0xf14cf170, 0xf144f143, 0xf7fdf7ff, 0xf7f5f7f7, 0xf7dff7c0, 0xf7d7f7dd, 0xf730f7d5, + 0xf701f70c, 0xf77ff710, 0xf777f77d, 0xf740f775, 0xf75df75f, 0xf755f757, 0xf4ccf4f0, 0xf4c4f4c3, + 0xf4d0f4d3, 0xf40ff43c, 0xf400f40c, 0xf413f41c, 0xf44cf414, 0xf441f443, 0xf450f444, 0xf5fdf5ff, + 0xf5f5f5f7, 0xf5dff5c0, 0xf5d7f5dd, 0xf530f5d5, 0xf504f50c, 0xf510f51c, 0xf57df57f, 0xf577f570, + 0xf540f575, 0xf55df55f, 0xf555f557, 0xcfcccfcf, 0xcfc4cfc3, 0xcfd0cfd3, 0xcf33cf3c, 0xcf00cf0f, + 0xcf1ccf07, 0xcf10cf13, 0xcf4ccf14, 0xcf41cf43, 0xcf50cf5c, 0xccf3ccfc, 0xccf4ccf1, 0xcccdcccf, + 0xccc7ccc0, 0xccd3ccdc, 0xcc30ccd4, 0xcc0fcc35, 0xcc0dcc0c, 0xcc00cc03, 0xcc04cc01, 0xcc10cc1f, + 0xcc4dcc73, 0xcc5ccc40, 0xcdcccc53, 0xcdc1cdc3, 0xcd3fcdd0, 0xcd34cd31, 0xcd00cd0d, 0xcd05cd07, + 0xcd11cd13, 0xcd4ccd70, 0xcd41cd43, 0xc3fccd50, 0xc3f4c3f1, 0xc3c0c3c3, 0xc3c4c3c7, 0xc3d1c3dc, + 0xc330c33c, 0xc337c331, 0xc30cc335, 0xc300c303, 0xc304c301, 0xc310c31d, 0xc373c317, 0xc34fc374, + 0xc340c343, 0xc344c347, 0xc35cc345, 0xc350c353, 0xc0fdc354, 0xc0f5c0f0, 0xc0c3c0cc, 0xc0c1c0c0, + 0xc0dfc0c4, 0xc0d0c0dd, 0xc0d5c0d7, 0xc033c03c, 0xc031c030, 0xc00dc00c, 0xc000c003, 0xc004c001, + 0xc01cc005, 0xc010c013, 0xc014c011, 0xc07dc07f, 0xc070c073, 0xc075c077, 0xc04cc04f, 0xc040c043, + 0xc044c041, 0xc05fc045, 0xc050c05d, 0xc1f3c1fc, 0xc1f1c1f0, 0xc1c1c1c0, 0xc1c5c1c7, 0xc1d1c1dc, + 0xc13dc13f, 0xc130c133, 0xc135c137, 0xc100c10c, 0xc107c101, 0xc11cc104, 0xc110c113, 0xc114c117, + 0xc171c115, 0xc14dc175, 0xc153c140, 0xc7ccc154, 0xc7d0c7c1, 0xc733c73c, 0xc734c731, 0xc700c70f, + 0xc705c707, 0xc71cc71f, 0xc711c713, 0xc770c714, 0xc743c74c, 0xc4cfc750, 0xc4c0c4cd, 0xc4dcc4c5, + 0xc43dc4d0, 0xc430c433, 0xc40cc437, 0xc400c403, 0xc404c401, 0xc41fc405, 0xc415c410, 0xc44cc474, + 0xc440c44d, 0xc45cc447, 0xc454c451, 0xc5c1c5f4, 0xc5d1c5d3, 0xc531c533, 0xc50fc534, 0xc500c50d, + 0xc51cc507, 0xc514c511, 0xc54cc570, 0xc545c541, 0xdffddfff, 0xdff5dff7, 0xdfdfdfc0, 0xdfd0dfdd, + 0xdfd5dfd7, 0xdf0cdf30, 0xdf1cdf04, 0xdf7fdf10, 0xdf77df7d, 0xdf40df75, 0xdf5ddf5f, 0xdf57df50, + 0xdcf0df55, 0xdcc3dccc, 0xdcd0dcc4, 0xdc33dc3d, 0xdc00dc34, 0xdc05dc07, 0xdc13dc1c, 0xdc11dc10, + 0xdc4fdc70, 0xdc44dc41, 0xddfcdc50, 0xddf5ddf7, 0xddc0ddcc, 0xdddddddf, 0xddd5ddd7, 0xdd0cdd30, + 0xdd04dd01, 0xdd7cdd10, 0xdd75dd77, 0xdd40dd4c, 0xdd5ddd5f, 0xdd55dd57, 0xd3c3d3f0, 0xd3c4d3c1, + 0xd333d3d0, 0xd331d330, 0xd30dd334, 0xd307d300, 0xd311d305, 0xd34cd370, 0xd344d343, 0xd350d35c, + 0xd0c0d0f4, 0xd0d4d0dc, 0xd030d03f, 0xd00cd037, 0xd000d003, 0xd01dd004, 0xd017d010, 0xd04fd074, + 0xd040d043, 0xd045d047, 0xd053d05c, 0xd054d051, 0xd1cfd1f0, 0xd1c4d1cd, 0xd13cd1d0, 0xd100d134, + 0xd11cd11f, 0xd173d114, 0xd14fd171, 0xd7ffd145, 0xd7f7d7fd, 0xd7c0d7f5, 0xd7ddd7df, 0xd7d5d7d7, + 0xd70cd730, 0xd710d703, 0xd77dd77f, 0xd775d777, 0xd75dd75f, 0xd755d757, 0xd4ccd4f4, 0xd4c4d4c3, + 0xd431d4d0, 0xd40dd434, 0xd41cd400, 0xd411d413, 0xd470d414, 0xd441d44f, 0xd453d444, 0xd5ffd450, + 0xd5f7d5fd, 0xd5dfd5f5, 0xd5d7d5dd, 0xd530d5d5, 0xd501d50c, 0xd510d504, 0xd57dd57f, 0xd575d577, + 0xd55fd540, 0xd557d55d, 0x3ff0d555, 0x3fc13fcc, 0x3f343fd0, 0x3f003f0d, 0x3f053f07, 0x3f133f1c, + 0x3f433f11, 0x3f5c3f44, 0x3cff3f51, 0x3cf33cfc, 0x3cf43cf1, 0x3cc03ccd, 0x3cc73cc1, 0x3cdc3cc5, + 0x3cd43cd1, 0x3c373c30, 0x3c0c3c35, 0x3c003c03, 0x3c043c01, 0x3c103c05, 0x3c153c17, 0x3c733c7c, + 0x3c4f3c71, 0x3c403c4d, 0x3c5c3c5f, 0x3df03c5d, 0x3dc33dcc, 0x3dd03dc1, 0x3d0d3d3c, 0x3d053d00, + 0x3d143d13, 0x3d433d74, 0x33fc3d50, 0x33c433c0, 0x333033d4, 0x33353337, 0x3303330c, 0x33013300, + 0x331d331c, 0x33173310, 0x337c3315, 0x33743371, 0x334d334f, 0x335f3340, 0x3354335c, 0x30fd30fc, + 0x30f530f0, 0x30c330cc, 0x30c130c0, 0x30df30c4, 0x30d530d0, 0x3033303c, 0x30313030, 0x300f3034, + 0x3003300c, 0x30013000, 0x30043007, 0x3013301c, 0x30113010, 0x307d3014, 0x30703073, 0x304c3077, + 0x30403043, 0x30443041, 0x30503045, 0x30553057, 0x31f031fc, 0x31c331f4, 0x31c731c0, 0x31dc31c5, + 0x31d431d3, 0x313d313f, 0x31373130, 0x310c310f, 0x3100310d, 0x31043101, 0x3110311d, 0x317c3117, + 0x31753170, 0x31403143, 0x3153315c, 0x37f03151, 0x37c037cc, 0x37d037c5, 0x3734373d, 0x3700370f, + 0x371c3707, 0x37113713, 0x37703714, 0x3743374c, 0x37443741, 0x34fc3750, 0x34f134f0, 0x34cf34f5, + 0x34c034c3, 0x34dc34c7, 0x34d134d3, 0x3430343f, 0x340c3435, 0x3403340d, 0x34013400, 0x341f3404, + 0x3410341d, 0x34153411, 0x34743471, 0x3440344d, 0x34473441, 0x3453345c, 0x34543451, 0x353335c1, + 0x35343531, 0x35073500, 0x35133505, 0x35433514, 0x0ffc3550, 0x0ff00ff3, 0x0ff40ff1, 0x0fc00fcd, + 0x0fdc0fc5, 0x0fd40fd3, 0x0f300f3f, 0x0f0c0f37, 0x0f000f03, 0x0f040f01, 0x0f170f10, 0x0f740f71, + 0x0f470f40, 0x0f5c0f5f, 0x0f540f51, 0x0cf70cf0, 0x0cf50cf4, 0x0cc30ccc, 0x0cc10cc0, 0x0cc40cc7, + 0x0cd00cdf, 0x0cd70cd1, 0x0c3c0cd5, 0x0c300c33, 0x0c340c31, 0x0c0c0c0f, 0x0c030c0d, 0x0c010c00, + 0x0c040c07, 0x0c1c0c05, 0x0c100c13, 0x0c140c11, 0x0c700c7d, 0x0c430c4c, 0x0c410c40, 0x0c5f0c44, + 0x0c550c50, 0x0df10dfc, 0x0dc00dcd, 0x0ddc0dc5, 0x0d3d0dd3, 0x0d350d30, 0x0d030d0c, 0x0d010d00, + 0x0d1d0d04, 0x0d700d10, 0x0d4d0d4f, 0x0d440d40, 0x0d530d45, 0x03f003f3, 0x03c303cc, 0x03c103c0, + 0x03c403c7, 0x03d003dc, 0x03d503d7, 0x0333033c, 0x03310330, 0x03350334, 0x030c030f, 0x03000303, + 0x03070301, 0x03050304, 0x031d031c, 0x03100313, 0x03140311, 0x0377037f, 0x034c0375, 0x03400343, + 0x03440341, 0x0353035c, 0x03550350, 0x00fd00fc, 0x00f000f3, 0x00f400f1, 0x00cc00cf, 0x00c300cd, + 0x00c100c0, 0x00c500c4, 0x00d300dc, 0x00d100d0, 0x003f00d4, 0x003d003c, 0x00300033, 0x00370031, + 0x000f0034, 0x000d000c, 0x00000003, 0x00070001, 0x00050004, 0x001c001f, 0x00100013, 0x00170011, + 0x00150014, 0x0073007c, 0x00740070, 0x004f0075, 0x0043004c, 0x00410040, 0x00440047, 0x0053005c, + 0x00510050, 0x01ff0054, 0x01fd01fc, 0x01f101f3, 0x01f401f7, 0x01c301cc, 0x01c701c0, 0x01df01c4, + 0x01dd01dc, 0x01d001d3, 0x01d701d1, 0x013c01d4, 0x01310130, 0x01340137, 0x010f0135, 0x010d010c, + 0x01000103, 0x01070101, 0x01050104, 0x0113011c, 0x01140110, 0x0170017d, 0x01770171, 0x01750174, + 0x0140014c, 0x015d0145, 0x01510150, 0x01540157, 0x07f007f3, 0x07f407f1, 0x07c007cf, 0x07dc07c7, + 0x073007d5, 0x07350737, 0x0703070c, 0x07010700, 0x07040707, 0x071d071f, 0x07100713, 0x0774077d, + 0x074d074f, 0x07470740, 0x0754075c, 0x04fd04fc, 0x04f504f0, 0x04c304cc, 0x04c104c0, 0x04d004c4, + 0x0433043c, 0x04310430, 0x040f0434, 0x040d040c, 0x04000403, 0x04070401, 0x04050404, 0x0413041c, + 0x04110410, 0x047c0414, 0x04740470, 0x0443044c, 0x04410440, 0x04440447, 0x05f30450, 0x05c005f7, + 0x05df05c5, 0x05d105d0, 0x053005d4, 0x05340537, 0x0500050c, 0x05070501, 0x051d0504, 0x05170510, + 0x057c0515, 0x054d0575, 0x05410540, 0x05450547, 0x1ff0055c, 0x1fc11fc3, 0x1fd01fc4, 0x1f0f1f33, + 0x1f011f00, 0x1f051f07, 0x1f131f1c, 0x1f141f11, 0x1f411f7c, 0x1cfc1f50, 0x1cf11cf3, 0x1ccd1cf4, + 0x1cdc1cc0, 0x1cd11cdd, 0x1c301cd4, 0x1c0c1c34, 0x1c011c00, 0x1c101c04, 0x1c151c11, 0x1c751c73, + 0x1c401c4d, 0x1c511c5c, 0x1dcc1c54, 0x1dc41dc1, 0x1d3c1d3f, 0x1d001d31, 0x1d071d01, 0x1d701d1f, + 0x1d411d4c, 0x13cc1d50, 0x13c013cd, 0x13c513c1, 0x13d113dc, 0x133f13d4, 0x1330133d, 0x13351337, + 0x1303130c, 0x13011300, 0x13051304, 0x131d131f, 0x13731310, 0x13741370, 0x134d134f, 0x13401343, + 0x13471341, 0x135c1345, 0x13541353, 0x10f710f0, 0x10cc10f5, 0x10c110c0, 0x103310c4, 0x10311030, + 0x100f1034, 0x1003100c, 0x10011000, 0x101c1004, 0x10101013, 0x10141011, 0x10741071, 0x104c1075, + 0x10411040, 0x10451044, 0x1050105d, 0x10571051, 0x11f411fd, 0x11df11c0, 0x11d711d1, 0x113f11d4, + 0x11371130, 0x110c1135, 0x11001103, 0x11071101, 0x111f1105, 0x11171110, 0x117d117f, 0x11751170, + 0x11411143, 0x11441147, 0x1153115f, 0x11551151, 0x17c417c1, 0x173c17d0, 0x1700170d, 0x171c1705, + 0x17701714, 0x1747174c, 0x14fc1751, 0x14cf14f3, 0x14dc14c0, 0x14d114d3, 0x143f14d4, 0x1430143c, + 0x14371431, 0x1403140c, 0x14011400, 0x141f1404, 0x14151410, 0x1473147d, 0x14401475, 0x1453145c, + 0x14541450, 0x15c115cc, 0x153c15c7, 0x15341533, 0x1500150f, 0x15051507, 0x15101513, 0x15711514, + 0x15471543, 0x15511545, 0x7ffd7fff, 0x7ff57ff7, 0x7fdd7fdf, 0x7fd57fd7, 0x7f0f7f30, 0x7f037f0c, + 0x7f047f01, 0x7f7f7f10, 0x7f777f7d, 0x7f407f75, 0x7f5d7f5f, 0x7f557f57, 0x7ccc7cf0, 0x7cc17cc3, + 0x7cd07cc4, 0x7c337c3c, 0x7c0f7c34, 0x7c007c0d, 0x7c077c01, 0x7c137c04, 0x7c147c11, 0x7c747c70, + 0x7c417c43, 0x7c507c44, 0x7dfd7dff, 0x7df57df7, 0x7ddf7dc0, 0x7dd77ddd, 0x7d0c7dd5, 0x7d047d03, + 0x7d7f7d10, 0x7d777d7d, 0x7d407d75, 0x7d5d7d5f, 0x7d557d57, 0x73c473c3, 0x7333733c, 0x7300730c, + 0x731c7305, 0x73147313, 0x73447343, 0x70f470fc, 0x70c070cd, 0x70d170c5, 0x703f70d4, 0x7030703c, + 0x700c7037, 0x70007003, 0x70047001, 0x70107005, 0x70177011, 0x707c7015, 0x70717073, 0x704f7074, + 0x7040704d, 0x70517047, 0x71c171cc, 0x71d071c4, 0x7133713c, 0x71357134, 0x7100710f, 0x71057104, + 0x7111711c, 0x71707115, 0x7145714c, 0x77ff7153, 0x77f777fd, 0x77c077f5, 0x77dd77df, 0x77d577d7, + 0x7730773c, 0x7703770c, 0x77107704, 0x777f7714, 0x7777777d, 0x77407775, 0x775d775f, 0x77557757, + 0x74f174f0, 0x74c374cc, 0x74d074c1, 0x7433743c, 0x74347431, 0x740d740f, 0x74057400, 0x7413741c, + 0x74417470, 0x74507444, 0x75fd75ff, 0x75f575f7, 0x75df75c0, 0x75d775dd, 0x753075d5, 0x7503750c, + 0x757f7501, 0x7577757d, 0x75407575, 0x755d755f, 0x75557557, 0x4fcc4ff0, 0x4fc74fc1, 0x4fd04fc4, + 0x4f314f3c, 0x4f004f34, 0x4f054f07, 0x4f154f14, 0x4f4c4f70, 0x4f414f43, 0x4f504f44, 0x4cf34cfc, + 0x4cf44cf1, 0x4cc04ccf, 0x4cc54cc7, 0x4cd34cdc, 0x4cd44cd1, 0x4c304c3f, 0x4c0c4c0f, 0x4c004c03, + 0x4c044c01, 0x4c104c1d, 0x4c714c73, 0x4c404c4d, 0x4c5c4c47, 0x4c514c53, 0x4df04c54, 0x4dc34dcc, + 0x4dd04dc4, 0x4d314d33, 0x4d0f4d34, 0x4d004d0d, 0x4d114d07, 0x4d704d14, 0x4d414d43, 0x43fc4d54, + 0x43f143f3, 0x43c043cf, 0x43d143c7, 0x4335433f, 0x4303430c, 0x43014300, 0x43044307, 0x431c431f, + 0x4310431d, 0x43714373, 0x4343434d, 0x43474340, 0x4354435c, 0x40f040ff, 0x40f540f7, 0x40cc40cf, + 0x40c040c3, 0x40c440c1, 0x40d040dc, 0x40d540d4, 0x4033403c, 0x40314030, 0x400f4034, 0x400d400c, + 0x40004003, 0x40074001, 0x40054004, 0x4013401c, 0x40114010, 0x407c4014, 0x40774070, 0x404d404c, + 0x40404043, 0x40444041, 0x405f4045, 0x4050405d, 0x40554057, 0x41f341fc, 0x41c041cf, 0x41df41c4, + 0x41d441d1, 0x41374130, 0x410c4134, 0x4100410d, 0x41044101, 0x41174110, 0x4173417d, 0x41754174, + 0x4143414d, 0x41534140, 0x41544151, 0x47c147f0, 0x47d047c4, 0x4731473c, 0x470d470f, 0x47014700, + 0x47134705, 0x47704710, 0x4741474c, 0x47504744, 0x44f144f3, 0x44cf44f4, 0x44c044cd, 0x44c544c7, + 0x44dc44df, 0x44d144d3, 0x443d443f, 0x44374430, 0x440c4435, 0x44004403, 0x44044401, 0x4410441d, + 0x44154411, 0x4473447c, 0x444d444f, 0x44454440, 0x4451445c, 0x45c045f0, 0x453345d0, 0x45344531, + 0x4500450f, 0x451c4507, 0x454c4570, 0x45404543, 0x5fff4541, 0x5ff75ffd, 0x5fc05ff5, 0x5fdd5fdf, + 0x5fd55fd7, 0x5f0c5f30, 0x5f015f03, 0x5f7f5f04, 0x5f775f7d, 0x5f405f75, 0x5f5d5f5f, 0x5f555f57, + 0x5cf45cf0, 0x5cc35ccc, 0x5cc45cc1, 0x5c315cc5, 0x5c0c5c34, 0x5c075c00, 0x5c1c5c05, 0x5c705c13, + 0x5c4d5c4f, 0x5c445c41, 0x5df75dfd, 0x5dcf5df5, 0x5ddd5dc4, 0x5dd55dd7, 0x5d0c5d30, 0x5d045d01, + 0x5d7f5d10, 0x5d775d7d, 0x5d405d75, 0x5d5d5d5f, 0x5d555d57, 0x53d053c4, 0x5333533c, 0x5303530f, + 0x53075300, 0x531c5305, 0x53115310, 0x53145317, 0x50f15370, 0x50cf50f4, 0x50c050cd, 0x50d150c7, + 0x503d50d4, 0x500c5030, 0x50005003, 0x50045001, 0x50155010, 0x5073507c, 0x50715070, 0x504d5074, + 0x50475040, 0x51cc51f0, 0x51c551c1, 0x51d051dc, 0x51315133, 0x510d5135, 0x51015100, 0x511f5107, + 0x5171511d, 0x5140514f, 0x51445141, 0x5153515c, 0x57ff5151, 0x57f757fd, 0x57df57f5, 0x57d757dd, + 0x570c57d5, 0x57015703, 0x577f5704, 0x5777577d, 0x57405775, 0x575d575f, 0x57555757, 0x54c354f0, + 0x54dc54c4, 0x543c54d0, 0x5400540f, 0x541c5405, 0x54145411, 0x5441544f, 0x55fd55ff, 0x55f555f7, + 0x55dd55df, 0x55d555d7, 0x5503550c, 0x557f5501, 0x5577557d, 0x55405575, 0x555d555f, 0x55555557 +); + +#enddecl(IQ1_GRID) + +#decl(IQ4_GRID) + +const kvalues_iq4nl = array<i32, 16>( + -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113 +); + +#enddecl(IQ4_GRID) diff --git a/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl new file mode 100644 index 0000000..b5e93b8 --- /dev/null +++ b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl @@ -0,0 +1,107 @@ +#define(VARIANTS) + +[ + { + "REPLS": { + "SRC_TYPE": "f32", + "DST_TYPE": "f32" + } + }, + { + "REPLS": { + "SRC_TYPE": "f32", + "DST_TYPE": "i32" + } + }, + { + "REPLS": { + "SRC_TYPE": "f32", + "DST_TYPE": "f16" + } + }, + { + "REPLS": { + "SRC_TYPE": "f16", + "DST_TYPE": "f16" + } + }, + { + "REPLS": { + "SRC_TYPE": "f16", + "DST_TYPE": "f32" + } + } +] + +#end(VARIANTS) + +#define(SHADER) +enable f16; + +@group(0) @binding(0) +var<storage, read_write> src: array<{{SRC_TYPE}}>; + +@group(0) @binding(1) +var<storage, read_write> dst: array<{{DST_TYPE}}>; + +struct Params { + ne: u32, // total number of elements + offset_src: u32, // in elements + offset_dst: u32, // in elements + + // Strides (in elements) — may be permuted + stride_src0: u32, + stride_src1: u32, + stride_src2: u32, + stride_src3: u32, + + stride_dst0: u32, + stride_dst1: u32, + stride_dst2: u32, + stride_dst3: u32, + + // Logical shapes + src_ne0: u32, + src_ne1: u32, + src_ne2: u32, + + dst_ne0: u32, + dst_ne1: u32, + dst_ne2: u32 +}; + +@group(0) @binding(2) +var<uniform> params: Params; + +override wg_size: u32; +@compute @workgroup_size(wg_size) +fn main(@builtin(global_invocation_id) gid: vec3<u32>) { + if (gid.x >= params.ne) { + return; + } + + var i = gid.x; + let i3 = i / (params.src_ne2 * params.src_ne1 * params.src_ne0); + i = i % (params.src_ne2 * params.src_ne1 * params.src_ne0); + let i2 = i / (params.src_ne1 * params.src_ne0); + i = i % (params.src_ne1 * params.src_ne0); + let i1 = i / params.src_ne0; + let i0 = i % params.src_ne0; + + var j = gid.x; + let j3 = j / (params.dst_ne2 * params.dst_ne1 * params.dst_ne0); + j = j % (params.dst_ne2 * params.dst_ne1 * params.dst_ne0); + let j2 = j / (params.dst_ne1 * params.dst_ne0); + j = j % (params.dst_ne1 * params.dst_ne0); + let j1 = j / params.dst_ne0; + let j0 = j % params.dst_ne0; + + let src_idx = i0 * params.stride_src0 + i1 * params.stride_src1 + + i2 * params.stride_src2 + i3 * params.stride_src3; + + let dst_idx = j0 * params.stride_dst0 + j1 * params.stride_dst1 + + j2 * params.stride_dst2 + j3 * params.stride_dst3; + + dst[params.offset_dst + dst_idx] = {{DST_TYPE}}((src[params.offset_src + src_idx])); +} +#end(SHADER) diff --git a/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl new file mode 100644 index 0000000..e622552 --- /dev/null +++ b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl @@ -0,0 +1,66 @@ +@group(0) @binding(0) +var<storage, read_write> src: array<f32>; + +@group(0) @binding(1) +var<storage, read_write> dst: array<f32>; + +struct Params { + offset_src: u32, // in elements + offset_dst: u32, // in elements + ne0: u32, +}; + +@group(0) @binding(2) +var<uniform> params: Params; + +var<workgroup> shared_sum: array<f32, WG_SIZE>; + +@compute @workgroup_size(WG_SIZE) +fn main(@builtin(workgroup_id) wid: vec3<u32>, + @builtin(local_invocation_id) lid: vec3<u32>) { + let row_idx = params.offset_src + wid.x * params.ne0; + let elems = (params.ne0 + WG_SIZE - 1) / WG_SIZE; + var local_sum: f32 = 0.0; + for (var col = lid.x * elems; col < (lid.x + 1) * elems && col < params.ne0; col ++) { + local_sum += src[row_idx + col]; + } + shared_sum[lid.x] = local_sum; + workgroupBarrier(); + + // upsweep + var offset = 1u; + while (offset < WG_SIZE) { + let idx = (lid.x + 1) * offset * 2 - 1; + if (idx < WG_SIZE) { + shared_sum[idx] = shared_sum[idx] + shared_sum[idx - offset]; + } + workgroupBarrier(); + offset <<= 1; + } + + // set last to 0 for exclusive sum + if (lid.x == 0) { + shared_sum[WG_SIZE - 1] = 0.0; + } + workgroupBarrier(); + + // downsweep + offset = WG_SIZE >> 1; + while (offset > 0) { + let idx = (lid.x + 1) * offset * 2 - 1; + if (idx < WG_SIZE) { + let t = shared_sum[idx - offset]; + shared_sum[idx - offset] = shared_sum[idx]; + shared_sum[idx] = shared_sum[idx] + t; + } + workgroupBarrier(); + offset = offset >> 1; + } + + // shared_sum[lid] is exclusive prefix sum up to this thread. + var running_sum = shared_sum[lid.x]; + for (var col = lid.x * elems; col < (lid.x + 1) * elems && col < params.ne0; col ++) { + running_sum += src[row_idx + col]; + dst[params.offset_dst + wid.x * params.ne0 + col] = running_sum; + } +} diff --git a/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py new file mode 100755 index 0000000..d61df5b --- /dev/null +++ b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py @@ -0,0 +1,147 @@ +import os +import re +import ast +import argparse + + +def extract_block(text, name): + pattern = rf'#define\({name}\)\s*(.*?)#end\({name}\)' + match = re.search(pattern, text, re.DOTALL) + if not match: + raise ValueError(f"Missing block: {name}") + return match.group(1).strip() + + +def parse_decls(decls_text): + decls = {} + for name, code in re.findall(r'#decl\((.*?)\)\s*(.*?)#enddecl\(\1\)', decls_text, re.DOTALL): + decls[name.strip()] = code.strip() + return decls + + +def replace_repl_placeholders(variant, template_map): + for repl, code in variant["REPLS"].items(): + for key, val in template_map.items(): + # Match "key" and avoid matching subsequences using by using \b + code = re.sub(rf'\b{re.escape(str(key))}\b', str(val), code) + variant["REPLS"][repl] = code + return variant + + +def replace_placeholders(shader_text, replacements): + for key, val in replacements.items(): + # Match {{KEY}} literally, where KEY is escaped + pattern = r'{{\s*' + re.escape(key) + r'\s*}}' + shader_text = re.sub(pattern, str(val), shader_text) + return shader_text + + +def expand_includes(shader, input_dir): + """ + Replace #include "file" lines in the text with the contents of that file. + Searches for files relative to input_dir. + """ + include_pattern = re.compile(r'^\s*#include\s+"([^"]+)"\s*$', re.MULTILINE) + + def replacer(match): + fname = match.group(1) + file_path = os.path.join(input_dir, fname) + if not os.path.exists(file_path): + raise FileNotFoundError(f"Included file not found: {file_path}") + with open(file_path, "r", encoding="utf-8") as f: + included_code = f.read() + # Recursively expand includes inside the included file + return expand_includes(included_code, input_dir) + + return include_pattern.sub(replacer, shader) + + +def write_shader(shader_name, shader_code, output_dir, outfile): + if output_dir: + wgsl_filename = os.path.join(output_dir, f"{shader_name}.wgsl") + with open(wgsl_filename, "w", encoding="utf-8") as f_out: + f_out.write(shader_code) + outfile.write(f'const char* wgsl_{shader_name} = R"({shader_code})";\n\n') + + +def generate_variants(fname, input_dir, output_dir, outfile): + shader_path = os.path.join(input_dir, fname) + shader_base_name = fname.split(".")[0] + + with open(shader_path, "r", encoding="utf-8") as f: + text = f.read() + + try: + variants = ast.literal_eval(extract_block(text, "VARIANTS")) + except ValueError: + write_shader(shader_base_name, text, output_dir, outfile) + else: + try: + decls_map = parse_decls(extract_block(text, "DECLS")) + except ValueError: + decls_map = {} + try: + templates_map = ast.literal_eval(extract_block(text, "REPL_TEMPLATES")) + except ValueError: + templates_map = {} + + for fname in sorted(os.listdir(input_dir)): + if fname.endswith(".tmpl"): + tmpl_path = os.path.join(input_dir, fname) + with open(tmpl_path, "r", encoding="utf-8") as f_tmpl: + decls = f_tmpl.read() + decls_map.update(parse_decls(decls)) + + shader_template = extract_block(text, "SHADER") + for variant in variants: + if "DECLS" in variant: + decls = variant["DECLS"] + else: + decls = [] + decls_code = "" + for key in decls: + if key not in decls_map: + raise ValueError(f"DECLS key '{key}' not found.") + decls_code += decls_map[key] + "\n\n" + final_shader = re.sub(r'\bDECLS\b', decls_code, shader_template) + if "REPLS" in variant: + variant = replace_repl_placeholders(variant, templates_map) + final_shader = replace_placeholders(final_shader, variant["REPLS"]) + # second run to expand placeholders in repl_template + final_shader = replace_placeholders(final_shader, variant["REPLS"]) + final_shader = expand_includes(final_shader, input_dir) + + if "SHADER_NAME" in variant: + output_name = variant["SHADER_NAME"] + elif "SHADER_SUFFIX" in variant: + output_name = f"{shader_base_name}_" + variant["SHADER_SUFFIX"] + elif "REPLS" in variant and "SRC0_TYPE" in variant["REPLS"] and "SRC1_TYPE" in variant["REPLS"]: + output_name = f"{shader_base_name}_" + "_".join([variant["REPLS"]["SRC0_TYPE"], variant["REPLS"]["SRC1_TYPE"]]) + elif "REPLS" in variant and "SRC_TYPE" in variant["REPLS"] and "DST_TYPE" in variant["REPLS"]: + output_name = f"{shader_base_name}_" + "_".join([variant["REPLS"]["SRC_TYPE"], variant["REPLS"]["DST_TYPE"]]) + elif "REPLS" in variant and "TYPE" in variant["REPLS"]: + output_name = f"{shader_base_name}_" + variant["REPLS"]["TYPE"] + else: + output_name = shader_base_name + write_shader(output_name, final_shader, output_dir, outfile) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--input_dir", required=True) + parser.add_argument("--output_file", required=True) + parser.add_argument("--output_dir") + args = parser.parse_args() + + if args.output_dir: + os.makedirs(args.output_dir, exist_ok=True) + + with open(args.output_file, "w", encoding="utf-8") as out: + out.write("// Auto-generated shader embedding\n\n") + for fname in sorted(os.listdir(args.input_dir)): + if fname.endswith(".wgsl"): + generate_variants(fname, args.input_dir, args.output_dir, out) + + +if __name__ == "__main__": + main() diff --git a/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl new file mode 100644 index 0000000..b682216 --- /dev/null +++ b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl @@ -0,0 +1,636 @@ +diagnostic(off, chromium.subgroup_matrix_uniformity); +diagnostic(off, subgroup_uniformity); +enable f16; +enable subgroups; +enable chromium_experimental_subgroup_matrix; + +#ifdef KV_F32 +#define KV_TYPE f32 +#else +#define KV_TYPE f16 +#endif + +// Default values +#define HEAD_DIM_QK 64 +#define HEAD_DIM_V 64 + +// The number of rows/columns/k in a subgroup matrix. MxK * KxN = MxN +// Note that the "K" here does not correspond to the K in attention's Q/K/V, it's just the common dimension. +#define SG_MAT_M 8 +#define SG_MAT_N 8 +#define SG_MAT_K 8 + +// Each workgroup processes one subgroup matrix of Q rows +#define Q_TILE SG_MAT_M +#define KV_TILE 16 +#define WG_SIZE 64 + +// Number of subgroup-matrix-width blocks that span the KV tile. SG_MAT_N must divide KV_TILE. +#define KV_BLOCKS (KV_TILE / SG_MAT_N) + +// Quantization constants/helpers +#define BLOCK_SIZE 32 +#define BLOCKS_K ((HEAD_DIM_QK + BLOCK_SIZE - 1) / BLOCK_SIZE) +#define BLOCKS_V ((HEAD_DIM_V + BLOCK_SIZE - 1) / BLOCK_SIZE) +// number of quantized elements processed per thread +#if defined(KV_Q4_0) +#define NQ 16 +// Q4_0 has 32 elements, 1 f16 for scale, 8 f16 for 4-bit weights +#define F16_PER_BLOCK 9 +#define WEIGHTS_PER_F16 4 +#elif defined(KV_Q8_0) +#define NQ 8 +// Q8_0 has 32 elements, 1 f16 for scale, 16 f16 for 8-bit weights +#define F16_PER_BLOCK 17 +#define WEIGHTS_PER_F16 2 +#endif +#define F16_PER_THREAD (NQ / WEIGHTS_PER_F16) + +// Ok not to put these in a define block, compiler will remove if unused +fn get_byte(value: u32, index: u32) -> u32 { + return (value >> (index * 8)) & 0xFF; +} + +fn get_byte_i32(value: u32, index: u32) -> i32 { + return bitcast<i32>(((value >> (index * 8)) & 0xFF) << 24) >> 24; +} + +struct Params { + offset_q: u32, + offset_k: u32, + offset_v: u32, + offset_mask: u32, + offset_sinks: u32, + offset_dst: u32, + + // shapes of Q/K/V + n_heads: u32, + seq_len_q: u32, + seq_len_kv: u32, + + // strides (in elements) + stride_q1: u32, + stride_q2: u32, + stride_q3: u32, + stride_k1: u32, + stride_k2: u32, + stride_k3: u32, + stride_v1: u32, + stride_v2: u32, + stride_v3: u32, + stride_mask3: u32, + + // repeat factors for K/V, e.g., MHA vs. MQA vs. GQA + q_per_kv: u32, + + // softmax params + scale: f32, + max_bias: f32, + logit_softcap: f32, + n_head_log2: f32, + m0: f32, + m1: f32, +}; + +@group(0) @binding(0) var<storage, read_write> Q: array<f32>; +@group(0) @binding(1) var<storage, read_write> K: array<KV_TYPE>; +@group(0) @binding(2) var<storage, read_write> V: array<KV_TYPE>; + +#if defined(MASK) && defined(SINKS) +@group(0) @binding(3) var<storage, read_write> mask: array<f16>; +@group(0) @binding(4) var<storage, read_write> sinks: array<f32>; +#define DST_BINDING 5 +#define PARAMS_BINDING 6 +#elif defined(MASK) +@group(0) @binding(3) var<storage, read_write> mask: array<f16>; +#define DST_BINDING 4 +#define PARAMS_BINDING 5 +#elif defined(SINKS) +@group(0) @binding(3) var<storage, read_write> sinks: array<f32>; +#define DST_BINDING 4 +#define PARAMS_BINDING 5 +#else +#define DST_BINDING 3 +#define PARAMS_BINDING 4 +#endif + +@group(0) @binding(DST_BINDING) var<storage, read_write> dst: array<vec4<f32>>; +@group(0) @binding(PARAMS_BINDING) var<uniform> params: Params; + +// Just a very small float value. +const FLOAT_MIN: f32 = -1.0e9; + +// The number of Q rows processed per workgroup +var<workgroup> q_shmem: array<f16, Q_TILE * HEAD_DIM_QK>; + +#ifndef KV_DIRECT +const kv_shmem_size = KV_TILE * max(HEAD_DIM_QK, HEAD_DIM_V); +// we can reuse the same shmem for K and V since we only need one at a time +var<workgroup> kv_shmem: array<f16, kv_shmem_size>; +#endif + +var<workgroup> o_shmem: array<f16, Q_TILE * HEAD_DIM_V>; // output shmem + +#ifdef MASK +// storage for mask values +var<workgroup> mask_shmem: array<f16, Q_TILE * KV_TILE>; +#endif + +// storage for output of Q*K^T scores for online softmax (S matrix from paper) +// also storage for diagonal matrix during online softmax (P matrix from paper) +// note that we reuse the same storage for both since we only need one at a time +var<workgroup> inter_shmem: array<f16, Q_TILE * KV_TILE>; + +// Storage for row max and exp sum during online softmax +var<workgroup> row_max_shmem: array<f32, Q_TILE>; +var<workgroup> exp_sum_shmem: array<f32, Q_TILE>; + +fn calc_softmax_term(kv_idx: u32, q_tile_row: u32, slope: f32) -> f32 { + var v = select(FLOAT_MIN, + f32(inter_shmem[kv_idx + q_tile_row * KV_TILE]) * params.scale, + kv_idx < KV_TILE); +#ifdef LOGIT_SOFTCAP + v = params.logit_softcap * tanh(v); +#endif +#ifdef MASK + let mask_val = select(0.0, f32(mask_shmem[q_tile_row * KV_TILE + kv_idx]), kv_idx < KV_TILE); + let mask_term = slope * mask_val; + v += mask_term; +#endif + return v; +} + +fn load_f32x4(buf: ptr<storage, array<vec4<f32>>, read_write>, scalar_index: u32) -> vec4<f32> { + return (*buf)[scalar_index >> 2u]; +} + +fn load_kvx4(buf: ptr<storage, array<vec4<KV_TYPE>>, read_write>, scalar_index: u32) -> vec4<KV_TYPE> { + return (*buf)[scalar_index >> 2u]; +} + +@compute @workgroup_size(WG_SIZE) +fn main(@builtin(workgroup_id) wg_id: vec3<u32>, + @builtin(local_invocation_id) local_id: vec3<u32>, + @builtin(subgroup_id) subgroup_id: u32, + @builtin(subgroup_size) subgroup_size: u32, + @builtin(num_subgroups) num_subgroups: u32, + @builtin(subgroup_invocation_id) sg_inv_id: u32) { + + // initialize row max for online softmax + for (var i = local_id.x; i < Q_TILE; i += WG_SIZE) { + row_max_shmem[i] = FLOAT_MIN; + exp_sum_shmem[i] = 0.0; + } + + for (var i = local_id.x; i < Q_TILE * HEAD_DIM_V; i += WG_SIZE) { + o_shmem[i] = 0.0; + } + + // workgroups per head/batch + let wg_per_head = (params.seq_len_q + Q_TILE - 1u) / Q_TILE; + let wg_per_batch = wg_per_head * params.n_heads; + + let dst2_stride = HEAD_DIM_V * params.n_heads; + let dst3_stride = dst2_stride * params.seq_len_q; + + // batch index + let batch_idx = wg_id.x / wg_per_batch; + let q_batch_offset = params.offset_q + batch_idx * params.stride_q3; + let k_batch_offset = params.offset_k + batch_idx * params.stride_k3; + let v_batch_offset = params.offset_v + batch_idx * params.stride_v3; + let dst_batch_offset = params.offset_dst + batch_idx * dst3_stride; + let wg_in_batch = wg_id.x % wg_per_batch; + + // head index + let head_idx = wg_in_batch / wg_per_head; + let q_head_offset = q_batch_offset + head_idx * params.stride_q2; + let k_head_idx = head_idx / params.q_per_kv; + let v_head_idx = k_head_idx; + let k_head_offset = k_batch_offset + k_head_idx * params.stride_k2; + let v_head_offset = v_batch_offset + v_head_idx * params.stride_v2; + + // starting Q row for this workgroup + let wg_in_head = wg_in_batch % wg_per_head; + let q_row_start = wg_in_head * Q_TILE; + +#ifdef MASK + // mask offset + let mask_global_offset = params.offset_mask + batch_idx * params.stride_mask3 + q_row_start * params.seq_len_kv; +#endif + + // note that the output is permuted, the layout is [head_dim_v, n_heads, seq_len_q, batch_size] + let dst_global_offset = dst_batch_offset + q_row_start * dst2_stride + head_idx * HEAD_DIM_V; + + let head = f32(head_idx); + let slope = select(1.0, select(pow(params.m1, 2.0 * (head - params.n_head_log2) + 1.0), pow(params.m0, head + 1.0), head < params.n_head_log2), params.max_bias > 0); + + // load q tile into shared memory + for (var elem_idx = local_id.x; elem_idx < Q_TILE * HEAD_DIM_QK; elem_idx += WG_SIZE) { + let q_row = elem_idx / HEAD_DIM_QK; + let q_col = elem_idx % HEAD_DIM_QK; + let head_q_row = q_row_start + q_row; + let global_q_row_offset = q_head_offset + head_q_row * params.stride_q1; + q_shmem[elem_idx] = f16(select( + 0.0, + Q[global_q_row_offset + q_col], + head_q_row < params.seq_len_q && q_col < HEAD_DIM_QK)); + } + + for (var kv_tile = 0u; kv_tile < params.seq_len_kv; kv_tile += KV_TILE) { + // clear inter_shmem to ensure zero-initialized accumulators + for (var elem_idx = local_id.x; elem_idx < Q_TILE * KV_TILE; elem_idx += WG_SIZE) { + inter_shmem[elem_idx] = 0.0; + } + + // load k tile into shared memory +#if defined(KV_Q4_0) + for (var elem_idx = local_id.x * NQ; elem_idx < KV_TILE * HEAD_DIM_QK; elem_idx += WG_SIZE * NQ) { + let blck_idx = elem_idx / BLOCK_SIZE; + let block_offset = (elem_idx % BLOCK_SIZE) / WEIGHTS_PER_F16; + let k_row = blck_idx / BLOCKS_K; + let global_k_row = kv_tile + k_row; + let block_k = blck_idx % BLOCKS_K; + let row_offset = k_row * HEAD_DIM_QK; + + if (global_k_row < params.seq_len_kv) { + let global_block_idx = k_head_offset + global_k_row * params.stride_k1 + block_k; + let base_idx = global_block_idx * F16_PER_BLOCK; + let d = K[base_idx]; // scale + for (var j = 0u; j < F16_PER_THREAD; j += 2) { + let q_0 = K[base_idx + 1u + block_offset + j]; + let q_1 = K[base_idx + 1u + block_offset + j + 1]; + let q_packed = bitcast<u32>(vec2(q_0, q_1)); + for (var k = 0u; k < 4u; k++) { + let q_byte = get_byte(q_packed, k); + let q_hi = (f16((q_byte >> 4) & 0xF) - 8.0) * d; + let q_lo = (f16(q_byte & 0xF) - 8.0) * d; + let idx = block_k * BLOCK_SIZE + block_offset * 2u + j * 2u + k; + kv_shmem[row_offset + idx] = q_lo; + kv_shmem[row_offset + idx + 16u] = q_hi; + } + } + } + } +#elif defined(KV_Q8_0) + for (var elem_idx = local_id.x * NQ; elem_idx < KV_TILE * HEAD_DIM_QK; elem_idx += WG_SIZE * NQ) { + let blck_idx = elem_idx / BLOCK_SIZE; + let block_offset = (elem_idx % BLOCK_SIZE) / WEIGHTS_PER_F16; + let k_row = blck_idx / BLOCKS_K; + let global_k_row = kv_tile + k_row; + let block_k = blck_idx % BLOCKS_K; + let row_offset = k_row * HEAD_DIM_QK; + + if (global_k_row < params.seq_len_kv) { + let global_block_idx = k_head_offset + global_k_row * params.stride_k1 + block_k; + let base_idx = global_block_idx * F16_PER_BLOCK; + let d = K[base_idx]; // scale + for (var j = 0u; j < F16_PER_THREAD; j += 2) { + let q_0 = K[base_idx + 1u + block_offset + j]; + let q_1 = K[base_idx + 1u + block_offset + j + 1]; + let q_packed = bitcast<u32>(vec2(q_0, q_1)); + for (var k = 0u; k < 4u; k++) { + let q_byte = get_byte_i32(q_packed, k); + let q_val = f16(q_byte) * d; + let idx = block_k * BLOCK_SIZE + block_offset * 2u + j * 2u + k; + kv_shmem[row_offset + idx] = q_val; + } + } + } + } +#elif defined(KV_DIRECT) + // Direct global loads for KV +#else + for (var elem_idx = local_id.x; elem_idx < KV_TILE * HEAD_DIM_QK; elem_idx += WG_SIZE) { + let k_row = elem_idx / HEAD_DIM_QK; + let k_col = elem_idx % HEAD_DIM_QK; + let global_k_row = kv_tile + k_row; + let global_k_row_offset = k_head_offset + global_k_row * params.stride_k1; + kv_shmem[elem_idx] = f16(select( + 0.0, + K[global_k_row_offset + k_col], + global_k_row < params.seq_len_kv && k_col < HEAD_DIM_QK)); + } +#endif + + workgroupBarrier(); + + // accumulate q block * k block into registers across the entire KV tile + // TODO: this loop seems to be the current largest bottleneck + // this bracket exists to scope the lifetime of variables, reducing register pressure + { +#ifdef KV_DIRECT + let k_block_row = kv_tile + subgroup_id * SG_MAT_N; + var k_global_offset = k_head_offset + k_block_row * params.stride_k1; +#else + var k_block_offset = subgroup_id * SG_MAT_N * HEAD_DIM_QK; +#endif + for (var kv_block = subgroup_id; kv_block < KV_BLOCKS; kv_block += num_subgroups) { + let inter_offset = kv_block * SG_MAT_N; + var acc: subgroup_matrix_result<f16, SG_MAT_M, SG_MAT_N> = subgroupMatrixLoad<subgroup_matrix_result<f16, SG_MAT_M, SG_MAT_N>>(&inter_shmem, inter_offset, false, KV_TILE); + + var q_cur = subgroupMatrixLoad<subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K>>(&q_shmem, 0u, false, HEAD_DIM_QK); + +#ifdef KV_DIRECT + var k_cur = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&K, k_global_offset + 0u, true, params.stride_k1); +#else + var k_cur = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&kv_shmem, k_block_offset + 0u, true, HEAD_DIM_QK); +#endif + + var t: u32 = 1u; + for (; t + 1u < HEAD_DIM_QK / SG_MAT_K; t += 2u) { + let h0 = t * SG_MAT_K; + var q0 = subgroupMatrixLoad<subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K>>(&q_shmem, h0, false, HEAD_DIM_QK); +#ifdef KV_DIRECT + var k0 = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&K, k_global_offset + h0, true, params.stride_k1); +#else + var k0 = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&kv_shmem, k_block_offset + h0, true, HEAD_DIM_QK); +#endif + acc = subgroupMatrixMultiplyAccumulate(q_cur, k_cur, acc); + q_cur = q0; + k_cur = k0; + + let h1 = (t + 1u) * SG_MAT_K; + var q1g = subgroupMatrixLoad<subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K>>(&q_shmem, h1, false, HEAD_DIM_QK); +#ifdef KV_DIRECT + var k1g = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&K, k_global_offset + h1, true, params.stride_k1); +#else + var k1g = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&kv_shmem, k_block_offset + h1, true, HEAD_DIM_QK); +#endif + acc = subgroupMatrixMultiplyAccumulate(q_cur, k_cur, acc); + q_cur = q1g; + k_cur = k1g; + } + + // handle odd tail + if (t < HEAD_DIM_QK / SG_MAT_K) { + let h = t * SG_MAT_K; + var qn = subgroupMatrixLoad<subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K>>(&q_shmem, h, false, HEAD_DIM_QK); +#ifdef KV_DIRECT + var kn = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&K, k_global_offset + h, true, params.stride_k1); +#else + var kn = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>(&kv_shmem, k_block_offset + h, true, HEAD_DIM_QK); +#endif + acc = subgroupMatrixMultiplyAccumulate(q_cur, k_cur, acc); + q_cur = qn; + k_cur = kn; + } + + acc = subgroupMatrixMultiplyAccumulate(q_cur, k_cur, acc); + +#ifdef KV_DIRECT + k_global_offset += num_subgroups * SG_MAT_N * params.stride_k1; +#else + k_block_offset += num_subgroups * SG_MAT_N * HEAD_DIM_QK; +#endif + subgroupMatrixStore(&inter_shmem, inter_offset, acc, false, KV_TILE); + } + } + + +#ifdef MASK + // load mask tile into shared memory for this KV block + // TODO: optimize and skip if mask is -INF for the entire tile + for (var elem_idx = local_id.x; elem_idx < Q_TILE * KV_TILE; elem_idx += WG_SIZE) { + let mask_row = elem_idx / KV_TILE; + let mask_col = elem_idx % KV_TILE; + let global_q_row = q_row_start + mask_row; + let global_k_col = kv_tile + mask_col; + let mask_in_bounds = global_q_row < params.seq_len_q && global_k_col < params.seq_len_kv; + let mask_idx = mask_global_offset + mask_row * params.seq_len_kv + global_k_col; + mask_shmem[elem_idx] = select(0.0, mask[mask_idx], mask_in_bounds); + } +#endif + + workgroupBarrier(); + + // online softmax + for (var q_tile_row = subgroup_id; q_tile_row < Q_TILE; q_tile_row += num_subgroups) { + let global_q_row = q_row_start + q_tile_row; + if (global_q_row >= params.seq_len_q) { + break; + } + + // initialize running max for this row + var prev_max = row_max_shmem[q_tile_row]; + var final_max = prev_max; + // pass 1: compute final max across the full KV tile in chunks + for (var kv_offset = 0u; kv_offset < KV_TILE; kv_offset += subgroup_size) { + let kv_idx = kv_offset + sg_inv_id; + let softmax_term = calc_softmax_term(kv_idx, q_tile_row, slope); + final_max = subgroupMax(max(final_max, softmax_term)); + } + + var total_exp_term: f32 = 0.0; + // pass 2: compute exp sum and write P using final_max + for (var kv_offset = 0u; kv_offset < KV_TILE; kv_offset += subgroup_size) { + let kv_idx = kv_offset + sg_inv_id; + let softmax_term = calc_softmax_term(kv_idx, q_tile_row, slope); + let cur_p = select(0.0, + exp(softmax_term - final_max), + kv_tile + kv_idx < params.seq_len_kv && kv_idx < KV_TILE); + total_exp_term += subgroupAdd(cur_p); + if (kv_idx < KV_TILE) { + inter_shmem[kv_idx + q_tile_row * KV_TILE] = f16(cur_p); + } + } + + let cur_exp = exp(prev_max - final_max); + + if (sg_inv_id == 0) { + row_max_shmem[q_tile_row] = final_max; + exp_sum_shmem[q_tile_row] = exp_sum_shmem[q_tile_row] * cur_exp + total_exp_term; + } + + for (var elem_idx = sg_inv_id; elem_idx < HEAD_DIM_V; elem_idx += subgroup_size) { + let idx = q_tile_row * HEAD_DIM_V + elem_idx; + o_shmem[idx] = f16(f32(o_shmem[idx]) * cur_exp); + } + } + + // load v tile into shared memory +#if defined(KV_Q4_0) + for (var elem_idx = local_id.x * NQ; elem_idx < KV_TILE * HEAD_DIM_V; elem_idx += WG_SIZE * NQ) { + let blck_idx = elem_idx / BLOCK_SIZE; + let block_offset = (elem_idx % BLOCK_SIZE) / WEIGHTS_PER_F16; + let v_row = blck_idx / BLOCKS_V; + let global_v_row = kv_tile + v_row; + let block_k = blck_idx % BLOCKS_V; + let row_offset = v_row * HEAD_DIM_V; + + if (global_v_row < params.seq_len_kv) { + let global_block_idx = v_head_offset + global_v_row * params.stride_v1 + block_k; + let base_idx = global_block_idx * F16_PER_BLOCK; + let d = V[base_idx]; // scale + for (var j = 0u; j < F16_PER_THREAD; j += 2) { + let q_0 = V[base_idx + 1u + block_offset + j]; + let q_1 = V[base_idx + 1u + block_offset + j + 1]; + let q_packed = bitcast<u32>(vec2(q_0, q_1)); + for (var k = 0u; k < 4u; k++) { + let q_byte = get_byte(q_packed, k); + let q_hi = (f16((q_byte >> 4) & 0xF) - 8.0) * d; + let q_lo = (f16(q_byte & 0xF) - 8.0) * d; + let idx = block_k * BLOCK_SIZE + block_offset * 2u + j * 2u + k; + kv_shmem[row_offset + idx] = q_lo; + kv_shmem[row_offset + idx + 16u] = q_hi; + } + } + } + } +#elif defined(KV_Q8_0) + for (var elem_idx = local_id.x * NQ; elem_idx < KV_TILE * HEAD_DIM_V; elem_idx += WG_SIZE * NQ) { + let blck_idx = elem_idx / BLOCK_SIZE; + let block_offset = (elem_idx % BLOCK_SIZE) / WEIGHTS_PER_F16; + let v_row = blck_idx / BLOCKS_V; + let global_v_row = kv_tile + v_row; + let block_k = blck_idx % BLOCKS_V; + let row_offset = v_row * HEAD_DIM_V; + + if (global_v_row < params.seq_len_kv) { + let global_block_idx = v_head_offset + global_v_row * params.stride_v1 + block_k; + let base_idx = global_block_idx * F16_PER_BLOCK; + let d = V[base_idx]; // scale + for (var j = 0u; j < F16_PER_THREAD; j += 2) { + let q_0 = V[base_idx + 1u + block_offset + j]; + let q_1 = V[base_idx + 1u + block_offset + j + 1]; + let q_packed = bitcast<u32>(vec2(q_0, q_1)); + for (var k = 0u; k < 4u; k++) { + let q_byte = get_byte_i32(q_packed, k); + let q_val = f16(q_byte) * d; + let idx = block_k * BLOCK_SIZE + block_offset * 2u + j * 2u + k; + kv_shmem[row_offset + idx] = q_val; + } + } + } + } +#elif defined(KV_DIRECT) + // Direct global loads for KV +#else + for (var elem_idx = local_id.x; elem_idx < KV_TILE * HEAD_DIM_V; elem_idx += WG_SIZE) { + let v_row = elem_idx / HEAD_DIM_V; + let v_col = elem_idx % HEAD_DIM_V; + let global_v_row = kv_tile + v_row; + let global_v_row_offset = v_head_offset + global_v_row * params.stride_v1; + kv_shmem[elem_idx] = f16(select( + 0.0, + V[global_v_row_offset + v_col], + global_v_row < params.seq_len_kv && v_col < HEAD_DIM_V)); + } +#endif + + workgroupBarrier(); + + // we have P (Q_TILE x KV_TILE) in inter_shmem and V (KV_TILE x head_dim_v) in kv_shmem + // we want to compute O += P * V across the full KV tile + for (var head_dim_block = subgroup_id * SG_MAT_N; + head_dim_block < HEAD_DIM_V; + head_dim_block += num_subgroups * SG_MAT_N) { + // load O submatrix from shared memory + var o_sg_mat: subgroup_matrix_result<f16, SG_MAT_M, SG_MAT_N> = subgroupMatrixLoad<subgroup_matrix_result<f16, SG_MAT_M, SG_MAT_N>>( + &o_shmem, + head_dim_block, + false, + HEAD_DIM_V + ); + for (var kv_block = 0u; kv_block < KV_BLOCKS; kv_block++) { + let p_offset = kv_block * SG_MAT_N; + var p_sg_mat: subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K> = subgroupMatrixLoad<subgroup_matrix_left<f16, SG_MAT_M, SG_MAT_K>>( + &inter_shmem, + p_offset, + false, + KV_TILE + ); + + // load V submatrix from global or shared memory +#ifdef KV_DIRECT + let v_block_row = kv_tile + kv_block * SG_MAT_N; + let v_global_offset = v_head_offset + v_block_row * params.stride_v1 + head_dim_block; + var v_sg_mat: subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N> = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>( + &V, + v_global_offset, + false, + params.stride_v1 + ); +#else + let v_block_offset = kv_block * SG_MAT_N * HEAD_DIM_V; + var v_sg_mat: subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N> = subgroupMatrixLoad<subgroup_matrix_right<f16, SG_MAT_K, SG_MAT_N>>( + &kv_shmem, + v_block_offset + head_dim_block, + false, + HEAD_DIM_V + ); +#endif + // O += P * V + o_sg_mat = subgroupMatrixMultiplyAccumulate(p_sg_mat, v_sg_mat, o_sg_mat); + } + // store O back to shared memory + subgroupMatrixStore(&o_shmem, head_dim_block, o_sg_mat, false, HEAD_DIM_V); + } + workgroupBarrier(); + } + +#ifdef SINKS + // add sinks (applied once after processing all KV tiles) + for (var q_tile_row = subgroup_id; + q_tile_row < Q_TILE; + q_tile_row += num_subgroups) { + // no need to process rows beyond seq_len_q + let global_q_row = q_row_start + q_tile_row; + if (global_q_row >= params.seq_len_q) { + break; + } + + var prev_max = row_max_shmem[q_tile_row]; + + // for non-sink threads, exp(FLOAT_MIN) effectively zeroes out their contribution to the sum + let sink_val = select(FLOAT_MIN, sinks[params.offset_sinks + head_idx], sg_inv_id == 0); + let new_max = subgroupMax(max(prev_max, sink_val)); + let max_exp = exp(prev_max - new_max); + let sink_exp = exp(sink_val - new_max); + + let sink_exp_sum = subgroupAdd(sink_exp); + + if (sg_inv_id == 0) { + exp_sum_shmem[q_tile_row] = exp_sum_shmem[q_tile_row] * max_exp + sink_exp_sum; + } + + for (var elem_idx = sg_inv_id; elem_idx < HEAD_DIM_V; elem_idx += subgroup_size) { + let idx = q_tile_row * HEAD_DIM_V + elem_idx; + let val = f32(o_shmem[idx]) * max_exp; + o_shmem[idx] = f16(val); + } + } + workgroupBarrier(); +#endif + for (var q_tile_row = subgroup_id; + q_tile_row < Q_TILE; + q_tile_row += num_subgroups) { + + let global_q_row = q_row_start + q_tile_row; + if (global_q_row >= params.seq_len_q) { break; } + + let exp_sum = exp_sum_shmem[q_tile_row]; + let scale = select(0.0, 1.0 / exp_sum, exp_sum != 0.0); + + let row_base: u32 = dst_global_offset + q_tile_row * dst2_stride; + + for (var elem_base = sg_inv_id * 4u; + elem_base < HEAD_DIM_V; + elem_base += subgroup_size * 4u) { + + let i0 = q_tile_row * HEAD_DIM_V + (elem_base + 0u); + let i1 = q_tile_row * HEAD_DIM_V + (elem_base + 1u); + let i2 = q_tile_row * HEAD_DIM_V + (elem_base + 2u); + let i3 = q_tile_row * HEAD_DIM_V + (elem_base + 3u); + + let v = vec4<f32>( + f32(o_shmem[i0]) * scale, + f32(o_shmem[i1]) * scale, + f32(o_shmem[i2]) * scale, + f32(o_shmem[i3]) * scale + ); + + let dst_vec_index: u32 = (row_base + elem_base) >> 2u; + dst[dst_vec_index] = v; + } + } +} diff --git a/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl new file mode 100644 index 0000000..f80ce1f --- /dev/null +++ b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl @@ -0,0 +1,874 @@ +#define(VARIANTS) + +[ + { + "SHADER_SUFFIX": "f32_vec", + "REPLS": { + "TYPE" : "vec4<f32>", + "DST_TYPE": "vec4<f32>", + "BLOCK_SIZE": 4 + }, + "DECLS": ["F32_VEC"] + }, + { + "REPLS": { + "TYPE" : "f32", + "DST_TYPE": "f32", + "BLOCK_SIZE": 1 + }, + "DECLS": ["F32"] + }, + { + "REPLS": { + "TYPE" : "f16", + "DST_TYPE": "f32", + "BLOCK_SIZE": 1 + }, + "DECLS": ["F16"] + }, + { + "REPLS": { + "TYPE" : "i32", + "DST_TYPE": "i32", + "BLOCK_SIZE": 1 + }, + "DECLS": ["I32"] + }, + { + "REPLS": { + "TYPE" : "q4_0", + "DST_TYPE": "f32", + "BLOCK_SIZE": 32 + }, + "DECLS": ["BYTE_HELPERS", "Q4_0_T", "Q4_0"] + }, + { + "REPLS": { + "TYPE" : "q4_1", + "DST_TYPE": "f32", + "BLOCK_SIZE": 32 + }, + "DECLS": ["BYTE_HELPERS", "Q4_1_T", "Q4_1"] + }, + { + "REPLS": { + "TYPE" : "q5_0", + "DST_TYPE": "f32", + "BLOCK_SIZE": 32 + }, + "DECLS": ["BYTE_HELPERS", "Q5_0_T", "Q5_0"] + }, + { + "REPLS": { + "TYPE" : "q5_1", + "DST_TYPE": "f32", + "BLOCK_SIZE": 32 + }, + "DECLS": ["BYTE_HELPERS", "Q5_1_T", "Q5_1"] + }, + { + "REPLS": { + "TYPE" : "q8_0", + "DST_TYPE": "f32", + "BLOCK_SIZE": 32 + }, + "DECLS": ["BYTE_HELPERS", "Q8_0_T", "Q8_0"] + }, + { + "REPLS": { + "TYPE" : "q2_k", + "DST_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "Q2_K_T", "Q2_K"] + }, + { + "REPLS": { + "TYPE" : "q3_k", + "DST_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "Q3_K_T", "Q3_K"] + }, + { + "REPLS": { + "TYPE" : "q4_k", + "DST_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["Q45_K_SCALE_MIN", "BYTE_HELPERS", "Q4_K_T", "Q4_K"] + }, + { + "REPLS": { + "TYPE" : "q5_k", + "DST_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["Q45_K_SCALE_MIN", "BYTE_HELPERS", "Q5_K_T", "Q5_K"] + }, + { + "REPLS": { + "TYPE" : "q6_k", + "DST_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "Q6_K_T", "Q6_K"] + }, + { + "REPLS": { + "TYPE" : "iq2_xxs", + "DST_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "IQ23_TABLES", "IQ2_XXS_GRID", "IQ2_XXS_T", "IQ2_XXS"] + }, + { + "REPLS": { + "TYPE" : "iq2_xs", + "DST_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "IQ23_TABLES", "IQ2_XS_GRID", "IQ2_XS_T", "IQ2_XS"] + }, + { + "REPLS": { + "TYPE": "iq2_s", + "DST_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "IQ23_TABLES", "IQ2_S_GRID", "IQ2_S_T", "IQ2_S"] + }, + { + "REPLS": { + "TYPE": "iq3_xxs", + "DST_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "IQ23_TABLES", "IQ3_XSS_GRID", "IQ3_XSS_T", "IQ3_XSS"] + }, + { + "REPLS": { + "TYPE": "iq3_s", + "DST_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "IQ23_TABLES", "IQ3_S_GRID", "IQ3_S_T", "IQ3_S"] + }, + { + "REPLS": { + "TYPE": "iq1_s", + "DST_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "IQ1_GRID", "IQ1_S_T", "IQ1_S"] + }, + { + "REPLS": { + "TYPE": "iq1_m", + "DST_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "IQ1_GRID", "IQ1_M_T", "IQ1_M"] + }, + { + "REPLS": { + "TYPE": "iq4_nl", + "DST_TYPE": "f32", + "BLOCK_SIZE": 32, + }, + "DECLS": ["BYTE_HELPERS", "IQ4_GRID", "IQ4_NL_T", "IQ4_NL"] + }, + { + "REPLS": { + "TYPE": "iq4_xs", + "DST_TYPE": "f32", + "BLOCK_SIZE": 256, + }, + "DECLS": ["BYTE_HELPERS", "IQ4_GRID", "IQ4_XS_T", "IQ4_XS"] + } +] + +#end(VARIANTS) + +#define(DECLS) + +#decl(F32_VEC) +fn copy_elements(src_base: u32, dst_base: u32, offset: u32) { + dst[(dst_base / 4) + offset] = src[(src_base / 4) + offset]; +} +#enddecl(F32_VEC) + +#decl(F32) +fn copy_elements(src_base: u32, dst_base: u32, offset: u32) { + dst[dst_base + offset] = src[src_base + offset]; +} +#enddecl(F32) + +#decl(F16) +fn copy_elements(src_base: u32, dst_base: u32, offset: u32) { + dst[dst_base + offset] = f32(src[src_base + offset]); +} +#enddecl(F16) + +#decl(I32) +fn copy_elements(src_base: u32, dst_base: u32, offset: u32) { + dst[dst_base + offset] = src[src_base + offset]; +} +#enddecl(I32) + +#decl(Q4_0) +fn copy_elements(src_base: u32, dst_base: u32, offset: u32) { + let block_q4_0 = src[src_base + offset]; + let d = f32(block_q4_0.d); + for (var j: u32 = 0; j < 4; j++) { + let q_packed = bitcast<u32>(vec2(block_q4_0.qs[2 * j], block_q4_0.qs[2 * j + 1])); + for (var k: u32 = 0; k < 4; k++) { + let q_byte = get_byte(q_packed, k); + let q_hi = (f32((q_byte >> 4) & 0xF) - 8.0f) * d; + let q_lo = (f32(q_byte & 0xF) - 8.0f) * d; + let dst_offset = dst_base + offset * 32 + j * 4 + k; + dst[dst_offset] = q_lo; + dst[dst_offset + 16] = q_hi; + } + } +} +#enddecl(Q4_0) + +#decl(Q4_1) +fn copy_elements(src_base: u32, dst_base: u32, offset: u32) { + let block_q4_1 = src[src_base + offset]; + let d = f32(block_q4_1.d); + let m = f32(block_q4_1.m); + for (var j: u32 = 0; j < 4; j++) { + let q_packed = block_q4_1.qs[j]; + for (var k: u32 = 0; k < 4; k++) { + let q_byte = get_byte(q_packed, k); + let q_hi = f32((q_byte >> 4) & 0xF) * d + m; + let q_lo = f32(q_byte & 0xF) * d + m; + let dst_offset = dst_base + offset * 32 + j * 4 + k; + dst[dst_offset] = q_lo; + dst[dst_offset + 16] = q_hi; + } + } +} +#enddecl(Q4_1) + +#decl(Q5_0) +fn copy_elements(src_base: u32, dst_base: u32, offset: u32) { + let block_q5_0 = src[src_base + offset]; + let d = f32(block_q5_0.d); + let qh_packed = bitcast<u32>(vec2(block_q5_0.qh[0], block_q5_0.qh[1])); + for (var j: u32 = 0; j < 4; j++) { + let q_packed = bitcast<u32>(vec2(block_q5_0.qs[2 * j], block_q5_0.qs[2 * j + 1])); + for (var k: u32 = 0; k < 4; k++) { + let q_byte = get_byte(q_packed, k); + let qh_hi = (qh_packed >> (j * 4 + k + 12)) & 0x10; + let q_hi = (f32(((q_byte >> 4) & 0xF) | qh_hi) - 16.0) * d; + let qh_lo = ((qh_packed >> (j * 4 + k)) << 4) & 0x10; + let q_lo = (f32((q_byte & 0xF) | qh_lo) - 16.0) * d; + let dst_offset = dst_base + offset * 32 + j * 4 + k; + dst[dst_offset] = q_lo; + dst[dst_offset + 16] = q_hi; + } + } +} + +#enddecl(Q5_0) + +#decl(Q5_1) +fn copy_elements(src_base: u32, dst_base: u32, offset: u32) { + let block_q5_1 = src[src_base + offset]; + let d = f32(block_q5_1.d); + let m = f32(block_q5_1.m); + for (var j: u32 = 0; j < 4; j++) { + let q_packed = block_q5_1.qs[j]; + for (var k: u32 = 0; k < 4; k++) { + let q_byte = get_byte(q_packed, k); + let qh_hi = (block_q5_1.qh >> (j * 4 + k + 12)) & 0x10; + let q_hi = f32(((q_byte >> 4) & 0xF) | qh_hi) * d + m; + let qh_lo = ((block_q5_1.qh >> (j * 4 + k)) << 4) & 0x10; + let q_lo = f32((q_byte & 0xF) | qh_lo) * d + m; + let dst_offset = dst_base + offset * 32 + j * 4 + k; + dst[dst_offset] = q_lo; + dst[dst_offset + 16] = q_hi; + } + } +} +#enddecl(Q5_1) + +#decl(Q8_0) +fn copy_elements(src_base: u32, dst_base: u32, offset: u32) { + let block_q8_0 = src[src_base + offset]; + let d = f32(block_q8_0.d); + for (var j: u32 = 0; j < 8; j++) { + let q_packed = bitcast<u32>(vec2(block_q8_0.qs[2 * j], block_q8_0.qs[2 * j + 1])); + for (var k: u32 = 0; k < 4; k++) { + let q_byte = get_byte_i32(q_packed, k); + let q_val = f32(q_byte) * d; + let dst_offset = dst_base + offset * 32 + j * 4 + k; + dst[dst_offset] = q_val; + } + } +} +#enddecl(Q8_0) + +#decl(Q2_K) +fn copy_elements(src_base: u32, dst_base: u32, offset: u32) { + let block = src[src_base + offset]; + let d = f32(block.d); + let m = f32(block.dmin); + var dst_i = dst_base + offset * 256; + var is: u32 = 0; + // 2 halves of the block (128 elements each) + for (var q_b_idx: u32 = 0; q_b_idx < 64; q_b_idx += 32) { + // 4 groups (each group has 2 blocks of 16 elements) + for (var shift: u32 = 0; shift < 8; shift += 2) { + // 2 blocks + for (var k: u32 = 0; k < 32; k += 16) { + let sc = get_byte(block.scales[is / 4], is % 4); + is++; + let dl = d * f32(sc & 0xF); + let ml = m * f32(sc >> 4); + for (var l: u32 = 0u; l < 16; l++) { + let q_idx = q_b_idx + k + l; + let q_byte = get_byte(block.qs[q_idx / 4], q_idx % 4); + let qs_val = (q_byte >> shift) & 3; + dst[dst_i] = (f32(qs_val) * dl - ml); + dst_i++; + } + } + } + } +} +#enddecl(Q2_K) + +#decl(Q3_K) +fn copy_elements(src_base: u32, dst_base: u32, offset: u32) { + let block = src[src_base + offset]; + let d = f32(block.d); + + // extract 6-bit scales, which consist of 4-bits from first 8 bytes of scale, + // and 2-bits from the last 4 bytes + let kmask1: u32 = 0x03030303; + let kmask2: u32 = 0x0f0f0f0f; + var scale_vals: array<u32, 4>; + for (var i: u32 = 0; i < 4; i++) { + scale_vals[i] = bitcast<u32>(vec2(block.scales[2 * i], block.scales[2 * i + 1])); + } + var tmp: u32 = scale_vals[2]; + scale_vals[2] = ((scale_vals[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + scale_vals[3] = ((scale_vals[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + scale_vals[0] = (scale_vals[0] & kmask2) | ((tmp & kmask1) << 4); + scale_vals[1] = (scale_vals[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + + // convert arrays of f16 -> u32 + var hmask_vals: array<u32, 8>; + for (var i: u32 = 0; i < 8; i++) { + hmask_vals[i] = bitcast<u32>(vec2(block.hmask[2 * i], block.hmask[2 * i + 1])); + } + var qs_vals: array<u32, 16>; + for (var i: u32 = 0; i < 16; i++) { + qs_vals[i] = bitcast<u32>(vec2(block.qs[2 * i], block.qs[2 * i + 1])); + } + + var dst_i = dst_base + offset * 256; + var is: u32 = 0; + var m: u32 = 1; + // 2 halves of the block (128 elements each) + for (var q_b_idx: u32 = 0; q_b_idx < 64; q_b_idx += 32) { + // 4 groups (each group has 2 blocks of 16 elements) + for (var shift: u32 = 0; shift < 8; shift += 2) { + // 2 blocks + for (var k: u32 = 0; k < 32; k += 16) { + let sc = get_byte(scale_vals[is / 4], is % 4); + is++; + let dl = d * (f32(sc) - 32.0); + for (var l: u32 = 0u; l < 16u; l++) { + let q_idx = q_b_idx + k + l; + let hm_idx = k + l; + let q_byte = get_byte(qs_vals[q_idx / 4], q_idx % 4); + let hmask_byte = get_byte(hmask_vals[hm_idx / 4], hm_idx % 4); + let hm = select(4.0, 0.0, (hmask_byte & m) != 0); + let qs_val = (q_byte >> shift) & 3; + dst[dst_i] = (f32(qs_val) - hm) * dl; + dst_i++; + } + } + m <<= 1; + } + } +} +#enddecl(Q3_K) + +#decl(Q4_K) +// 8 blocks of 32 elements each +fn copy_elements(src_base: u32, dst_base: u32, offset: u32) { + let block = src[src_base + offset]; + let d = f32(block.d); + let m = f32(block.dmin); + var dst_i = dst_base + offset * 256; + var is: u32 = 0; + // 2 blocks each iteration + for (var q_b_idx: u32 = 0; q_b_idx < 128; q_b_idx += 32) { + for (var shift: u32 = 0; shift < 8; shift += 4) { + let scale_min = get_scale_min(is, block.scales); + is++; + let dl = d * scale_min.x; + let ml = m * scale_min.y; + for (var l: u32 = 0; l < 32; l++) { + let q_idx = q_b_idx + l; + let q_byte = get_byte(block.qs[q_idx / 4], q_idx % 4); + let qs_val = (q_byte >> shift) & 0xF; + dst[dst_i] = (f32(qs_val) * dl - ml); + dst_i++; + } + } + } +} +#enddecl(Q4_K) + +#decl(Q5_K) +fn copy_elements(src_base: u32, dst_base: u32, offset: u32) { + let block = src[src_base + offset]; + let d = f32(block.d); + let m = f32(block.dmin); + var dst_i = dst_base + offset * 256; + var is: u32 = 0; + var u: u32 = 1; + // 2 blocks each iteration + for (var q_b_idx: u32 = 0; q_b_idx < 128; q_b_idx += 32) { + for (var shift: u32 = 0; shift < 8; shift += 4) { + let scale_min = get_scale_min(is, block.scales); + is++; + let dl = d * scale_min.x; + let ml = m * scale_min.y; + for (var l: u32 = 0; l < 32; l++) { + let q_idx = q_b_idx + l; + let q_byte = get_byte(block.qs[q_idx / 4], q_idx % 4); + let qh_byte = get_byte(block.qh[l / 4], l % 4); + let qs_val = (q_byte >> shift) & 0xF; + let qh_val = select(0.0, 16.0, (qh_byte & u) != 0); + dst[dst_i] = (f32(qs_val) + qh_val) * dl - ml; + dst_i++; + } + u <<= 1; + } + } +} +#enddecl(Q5_K) + +#decl(Q6_K) +// 16 blocks of 16 elements each +fn copy_elements(src_base: u32, dst_base: u32, offset: u32) { + let block = src[src_base + offset]; + let d = f32(block.d); + + // convert arrays of f16 -> u32 + var ql_vals: array<u32, 32>; + for (var i: u32 = 0; i < 32; i++) { + ql_vals[i] = bitcast<u32>(vec2(block.ql[2 * i], block.ql[2 * i + 1])); + } + var qh_vals: array<u32, 16>; + for (var i: u32 = 0; i < 16; i++) { + qh_vals[i] = bitcast<u32>(vec2(block.qh[2 * i], block.qh[2 * i + 1])); + } + var scale_vals: array<u32, 4>; + for (var i: u32 = 0; i < 4; i++) { + scale_vals[i] = bitcast<u32>(vec2(block.scales[2 * i], block.scales[2 * i + 1])); + } + + var dst_i = dst_base + offset * 256; + var qh_b_idx: u32 = 0; + var sc_b_idx: u32 = 0; + for (var ql_b_idx: u32 = 0; ql_b_idx < 128; ql_b_idx += 64) { + for (var l: u32 = 0; l < 32; l++) { + let ql13_b = get_byte(ql_vals[(ql_b_idx + l) / 4], (ql_b_idx + l) % 4); + let ql24_b = get_byte(ql_vals[(ql_b_idx + l + 32) / 4], (ql_b_idx + l + 32) % 4); + let qh_b = get_byte(qh_vals[(qh_b_idx + l) / 4], (qh_b_idx + l) % 4); + + let q1 = f32((ql13_b & 0xF) | ((qh_b & 3) << 4)) - 32.0; + let q2 = f32((ql24_b & 0xF) | (((qh_b >> 2) & 3) << 4)) - 32.0; + let q3 = f32((ql13_b >> 4) | (((qh_b >> 4) & 3) << 4)) - 32.0; + let q4 = f32((ql24_b >> 4) | (((qh_b >> 6) & 3) << 4)) - 32.0; + + let is = l/16; + let is1 = sc_b_idx + is; + let sc1 = get_byte_i32(scale_vals[is1 / 4], is1 % 4); + let is2 = sc_b_idx + is + 2; + let sc2 = get_byte_i32(scale_vals[is2 / 4], is2 % 4); + let is3 = sc_b_idx + is + 4; + let sc3 = get_byte_i32(scale_vals[is3 / 4], is3 % 4); + let is4 = sc_b_idx + is + 6; + let sc4 = get_byte_i32(scale_vals[is4 / 4], is4 % 4); + + dst[dst_i + l] = (q1 * f32(sc1)) * d; + dst[dst_i + l + 32] = (q2 * f32(sc2)) * d; + dst[dst_i + l + 64] = (q3 * f32(sc3)) * d; + dst[dst_i + l + 96] = (q4 * f32(sc4)) * d; + } + dst_i += 128; + qh_b_idx += 32; + sc_b_idx += 8; + } +} + +#enddecl(Q6_K) + +#decl(IQ2_XXS) +fn copy_elements(src_base: u32, dst_base: u32, offset: u32) { + let block = src[src_base + offset]; + let d = f32(block.d); + var dst_i = dst_base + offset * 256; + for (var ib: u32 = 0; ib < 32; ib += 4) { + let aux0 = bitcast<u32>(vec2(block.qs[ib], block.qs[ib + 1])); + let aux1 = bitcast<u32>(vec2(block.qs[ib + 2], block.qs[ib + 3])); + let db = d * (0.5 + f32(aux1 >> 28)) * 0.25; + for (var l: u32 = 0; l < 4; l++) { + let ig = get_byte(aux0, l) * 8; + let is = (aux1 >> (7 * l)) & 127; + let signs = get_byte(ksigns_iq2xs[is / 4], is % 4); + for (var j: u32 = 0; j < 8; j++) { + let g = get_byte(iq2xxs_grid[(ig + j) / 4], (ig + j) % 4); + let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4], j % 4) & signs) != 0); + dst[dst_i] = db * f32(g) * m; + dst_i++; + } + } + } +} +#enddecl(IQ2_XXS) + +#decl(IQ2_XS) +fn copy_elements(src_base: u32, dst_base: u32, offset: u32) { + let block = src[src_base + offset]; + let d = f32(block.d); + var dst_i = dst_base + offset * 256; + var scale_vals = array<u32, 2>( + bitcast<u32>(vec2(block.scales[0], block.scales[1])), + bitcast<u32>(vec2(block.scales[2], block.scales[3])) + ); + for (var ib: u32 = 0; ib < 32; ib += 4) { + let s = get_byte(scale_vals[ib / 16], (ib % 16) / 4); + let db = array<f32, 2>( + d * (0.5 + f32(s & 0xF)) * 0.25, + d * (0.5 + f32(s >> 4)) * 0.25 + ); + for (var l: u32 = 0; l < 4; l++) { + let qs_val = bitcast<u32>(vec2(block.qs[ib + l], 0.0)); + let ig = (qs_val & 511) * 8; + let is = qs_val >> 9; + let signs = get_byte(ksigns_iq2xs[is / 4], is % 4); + let dl = db[l/2]; + for (var j: u32 = 0; j < 8; j++) { + let g = get_byte(iq2xs_grid[(ig + j) / 4], (ig + j) % 4); + let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4], j % 4) & signs) != 0); + dst[dst_i] = dl * f32(g) * m; + dst_i++; + } + } + } +} +#enddecl(IQ2_XS) + +#decl(IQ2_S) +fn copy_elements(src_base: u32, dst_base: u32, offset: u32) { + let block = src[src_base + offset]; + let d = f32(block.d); + var dst_i = dst_base + offset * 256; + var qs_vals : array<u32, 16>; + for (var i: u32 = 0; i < 16; i++) { + qs_vals[i] = bitcast<u32>(vec2(block.qs[i * 2], block.qs[i * 2 + 1])); + } + var qh_vals = array<u32, 2>( + bitcast<u32>(vec2(block.qh[0], block.qh[1])), + bitcast<u32>(vec2(block.qh[2], block.qh[3])) + ); + var scale_vals = array<u32, 2>( + bitcast<u32>(vec2(block.scales[0], block.scales[1])), + bitcast<u32>(vec2(block.scales[2], block.scales[3])) + ); + for (var ib: u32 = 0; ib < 8; ib ++) { + let s = get_byte(scale_vals[ib / 4], ib % 4); + let db = array<f32, 2>( + d * (0.5 + f32(s & 0xF)) * 0.25, + d * (0.5 + f32(s >> 4)) * 0.25 + ); + let qs_w = qs_vals[ib]; + for (var l: u32 = 0; l < 4; l++) { + let qh_b = (get_byte(qh_vals[ib / 4], ib % 4) << (8 - 2 * l)) & 0x300; + let ig = (get_byte(qs_w, l) | qh_b) * 8; + let signs = get_byte(qs_vals[ib + 8], l); + let dl = db[l/2]; + for (var j: u32 = 0; j < 8; j++) { + let g = get_byte(iq2s_grid[(ig + j) / 4], (ig + j) % 4); + let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4], j % 4) & signs) != 0); + dst[dst_i] = dl * f32(g) * m; + dst_i++; + } + } + } +} + +#enddecl(IQ2_S) + +#decl(IQ3_XSS) +fn copy_elements(src_base: u32, dst_base: u32, offset: u32) { + let block = src[src_base + offset]; + let d = f32(block.d); + var dst_i = dst_base + offset * 256; + for (var ib: u32 = 0; ib < 16; ib += 2) { + let sc_sign = bitcast<u32>(vec2(block.qs[ib + 32], block.qs[ib + 33])); + let db = d * (0.5 + f32(sc_sign >> 28)) * 0.5; + for (var l: u32 = 0; l < 4; l++) { + let is = (sc_sign >> (7 * l)) & 127; + let signs = get_byte(ksigns_iq2xs[is / 4], is % 4); + let ig_val = bitcast<u32>(vec2(block.qs[ib * 2 + l], 0.0)); + let ig1 = get_byte(ig_val, 0); + let ig2 = get_byte(ig_val, 1); + for (var j: u32 = 0; j < 4; j++) { + let g1 = get_byte(iq3xxs_grid[ig1], j); + let g2 = get_byte(iq3xxs_grid[ig2], j); + let m1 = select(1.0, -1.0, (get_byte(kmask_iq2xs[0], j) & signs) != 0); + let m2 = select(1.0, -1.0, (get_byte(kmask_iq2xs[1], j) & signs) != 0); + dst[dst_i] = db * f32(g1) * m1; + dst[dst_i + 4] = db * f32(g2) * m2; + dst_i++; + } + dst_i += 4; + } + } +} +#enddecl(IQ3_XSS) + +#decl(IQ3_S) +fn copy_elements(src_base: u32, dst_base: u32, offset: u32) { + let block = src[src_base + offset]; + let d = f32(block.d); + var dst_i = dst_base + offset * 256; + var qh_vals = array<u32, 2>( + bitcast<u32>(vec2(block.qh[0], block.qh[1])), + bitcast<u32>(vec2(block.qh[2], block.qh[3])) + ); + var sign_vals: array<u32, 8>; + for (var i: u32 = 0; i < 8; i++) { + sign_vals[i] = bitcast<u32>(vec2(block.signs[i * 2], block.signs[i * 2 + 1])); + } + var scale_vals = bitcast<u32>(vec2(block.scales[0], block.scales[1])); + for (var ib: u32 = 0; ib < 4; ib++) { + let s = get_byte(scale_vals, ib); + let db = array<f32, 2>( + d * (1.0 + 2.0 * f32(s & 0xF)), + d * (1.0 + 2.0 * f32(s >> 4)) + ); + for (var k: u32 = 0; k < 2; k++) { + let dl = db[k]; + let qh_byte = get_byte(qh_vals[ib / 2], (ib % 2) * 2 + k); + let sign_w = sign_vals[ib * 2 + k]; + for (var l: u32 = 0; l < 4; l++) { + let signs = get_byte(sign_w, l); + let ig_val = bitcast<u32>(vec2(block.qs[ib * 8 + k * 4 + l], 0.0)); + let ig1 = get_byte(ig_val, 0) | ((qh_byte << ((8 - (2 * l)))) & 256); + let ig2 = get_byte(ig_val, 1) | ((qh_byte << ((7 - (2 * l)))) & 256); + for (var j: u32 = 0; j < 4; j++) { + let g1 = get_byte(iq3s_grid[ig1], j); + let g2 = get_byte(iq3s_grid[ig2], j); + let m1 = select(1.0, -1.0, (get_byte(kmask_iq2xs[0], j) & signs) != 0); + let m2 = select(1.0, -1.0, (get_byte(kmask_iq2xs[1], j) & signs) != 0); + dst[dst_i] = dl * f32(g1) * m1; + dst[dst_i + 4] = dl * f32(g2) * m2; + dst_i++; + } + dst_i += 4; + } + } + } +} +#enddecl(IQ3_S) + +#decl(IQ1_S) +fn copy_elements(src_base: u32, dst_base: u32, offset: u32) { + let block = src[src_base + offset]; + let d = f32(block.d); + var dst_i = dst_base + offset * 256; + for (var ib: u32 = 0; ib < 8; ib++) { + let qh = bitcast<u32>(vec2(block.qh[ib], 0.0)); + let dl = d * (2 * f32((qh >> 12) & 7) + 1); + let delta = select(IQ1_DELTA, -IQ1_DELTA, (qh & 0x8000) != 0); + let qs_w = bitcast<u32>(vec2(block.qs[ib * 2], block.qs[ib * 2 + 1])); + for (var l: u32 = 0; l < 4; l++) { + let ig = (get_byte(qs_w, l) | (((qh >> (3 * l)) & 7) << 8)) * 8; + for (var j: u32 = 0; j < 8; j++) { + let gw = iq1_grid[(ig + j) / 16]; + let g = (gw >> (((ig + j) % 16) * 2)) & 3; + let gs = bitcast<i32>(g << 30) >> 30; + dst[dst_i] = dl * (f32(gs) + delta); + dst_i++; + } + } + } +} + +#enddecl(IQ1_S) + +#decl(IQ1_M) +fn copy_elements(src_base: u32, dst_base: u32, offset: u32) { + let block = src[src_base + offset]; + + let scale = ((block.scales[0] >> 12) & 0xF) | ((block.scales[0] >> 24) & 0x00F0) | ((block.scales[1] >> 4) & 0x0F00) | ((block.scales[1] >> 16) & 0xF000); + let d = f32(bitcast<vec2<f16>>(scale).x); + var dst_i = dst_base + offset * 256; + for (var ib: u32 = 0; ib < 8; ib++) { + let sw = (block.scales[ib / 4] >> (16 * ((ib / 2) % 2))) & 0xFFFF; + let s1 : u32 = (sw >> (6 * (ib % 2))) & 0x7; + let s2 : u32 = (sw >> (6 * (ib % 2) + 3)) & 0x7; + var dl = array<f32, 2>( + d * f32(2 * s1 + 1), + d * f32(2 * s2 + 1) + ); + + let qh = block.qh[ib / 2] >> (16 * (ib % 2)); + var idx = array<u32, 4>( + get_byte(block.qs[ib], 0) | ((qh << 8) & 0x700), + get_byte(block.qs[ib], 1) | ((qh << 4) & 0x700), + get_byte(block.qs[ib], 2) | ((qh) & 0x700), + get_byte(block.qs[ib], 3) | ((qh >> 4) & 0x700) + ); + var delta = array<f32, 4>( + select(IQ1_DELTA, -IQ1_DELTA, (qh & 0x08) != 0), + select(IQ1_DELTA, -IQ1_DELTA, (qh & 0x80) != 0), + select(IQ1_DELTA, -IQ1_DELTA, ((qh >> 8) & 0x08) != 0), + select(IQ1_DELTA, -IQ1_DELTA, ((qh >> 8) & 0x80) != 0) + ); + for (var l: u32 = 0; l < 4; l++) { + let ig = idx[l] * 8; + for (var j: u32 = 0; j < 8; j++) { + let gw = iq1_grid[(ig + j) / 16]; + let g = (gw >> (((ig + j) % 16) * 2)) & 3; + let gs = bitcast<i32>(g << 30) >> 30; + dst[dst_i] = dl[l/2] * (f32(gs) + delta[l]); + dst_i++; + } + } + } +} + +#enddecl(IQ1_M) + +#decl(IQ4_NL) +fn copy_elements(src_base: u32, dst_base: u32, offset: u32) { + let block = src[src_base + offset]; + let d = f32(block.d); + var dst_i = dst_base + offset * 32; + var qs: array<u32, 4>; + for (var i: u32 = 0; i < 4; i++) { + qs[i] = bitcast<u32>(vec2(block.qs[i * 2], block.qs[i * 2 + 1])); + } + for (var j: u32 = 0; j < 16; j++) { + let qsb = get_byte(qs[j / 4], j % 4); + dst[dst_i] = d * f32(kvalues_iq4nl[qsb & 0xF]); + dst[dst_i + 16] = d * f32(kvalues_iq4nl[qsb >> 4]); + dst_i++; + } +} +#enddecl(IQ4_NL) + +#decl(IQ4_XS) +fn copy_elements(src_base: u32, dst_base: u32, offset: u32) { + let block = src[src_base + offset]; + let d = f32(block.d); + let scales_h = bitcast<u32>(vec2(block.scales_h, 0.0)); + var dst_i = dst_base + offset * 256; + for (var ib: u32 = 0; ib < 8; ib++) { + let ls = ((get_byte(block.scales_l, ib / 2) >> (4 * (ib % 2))) & 0xF) | (((scales_h >> (2 * ib)) & 3) << 4); + let dl = d * (f32(ls) - 32.0); + for (var j: u32 = 0; j < 16; j++) { + let iqs = ib * 16 + j; + let qsb = get_byte(block.qs[iqs / 4], iqs % 4); + dst[dst_i] = dl * f32(kvalues_iq4nl[qsb & 0xF]); + dst[dst_i + 16] = dl * f32(kvalues_iq4nl[qsb >> 4]); + dst_i++; + } + dst_i += 16; + } +} +#enddecl(IQ4_XS) + +#end(DECLS) + +#define(SHADER) + +enable f16; + +DECLS + +@group(0) @binding(0) +var<storage, read_write> src: array<{{TYPE}}>; + +@group(0) @binding(1) +var<storage, read_write> idx: array<i32>; + +@group(0) @binding(2) +var<storage, read_write> dst: array<{{DST_TYPE}}>; + +struct Params { + offset_src: u32, // in elements + offset_idx: u32, // in elements + offset_dst: u32, // in elements + + // Strides (in elements) + stride_src1: u32, + stride_src2: u32, + stride_src3: u32, + + stride_idx0: u32, + stride_idx1: u32, + stride_idx2: u32, + + stride_dst1: u32, + stride_dst2: u32, + stride_dst3: u32, + + // Shape of dst + ne0: u32, + n_rows: u32, + ne2: u32, + ne3: u32, + + // Shape of idx + idx1: u32, + idx2: u32, +}; + +@group(0) @binding(3) +var<uniform> params: Params; + +override wg_size: u32; +@compute @workgroup_size(wg_size) +fn main(@builtin(global_invocation_id) gid: vec3<u32>) { + if (gid.x >= params.n_rows * params.ne2 * params.ne3) { + return; + } + var i = gid.x; + let i_dst3 = i / (params.ne2 * params.n_rows); + + i = i % (params.ne2 * params.n_rows); + let i_dst2 = i / params.n_rows; + let i_dst1 = i % params.n_rows; + + let i_idx2 = i_dst3 % params.idx2; + let i_idx1 = i_dst2 % params.idx1; + let i_idx0 = i_dst1; + + let i_idx = params.offset_idx + i_idx0 * params.stride_idx0 + i_idx1 * params.stride_idx1 + i_idx2 * params.stride_idx2; + + let idx_val = u32(idx[i_idx]); + + let i_src_row = params.offset_src + idx_val * params.stride_src1 + i_dst2 * params.stride_src2 + i_dst3 * params.stride_src3; + let i_dst_row = params.offset_dst + i_dst1 * params.stride_dst1 + i_dst2 * params.stride_dst2 + i_dst3 * params.stride_dst3; + + for (var i: u32 = 0; i < params.ne0/{{BLOCK_SIZE}}; i++) { + copy_elements(i_src_row, i_dst_row, i); + } +} + +#end(SHADER) diff --git a/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl new file mode 100644 index 0000000..03fcd54 --- /dev/null +++ b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl @@ -0,0 +1,323 @@ +#define(VARIANTS) + +[ + { + "SHADER_NAME": "reglu_f32", + "REPLS": { + "TYPE" : "f32", + }, + "DECLS": ["NO_SPLIT", "REGLU"] + }, + { + "SHADER_NAME": "reglu_f32_split", + "REPLS": { + "TYPE" : "f32", + }, + "DECLS": ["SPLIT", "REGLU"] + }, + { + "SHADER_NAME": "reglu_f16", + "REPLS": { + "TYPE" : "f16", + }, + "DECLS": ["NO_SPLIT", "REGLU"] + }, + { + "SHADER_NAME": "reglu_f16_split", + "REPLS": { + "TYPE" : "f16", + }, + "DECLS": ["SPLIT", "REGLU"] + }, + { + "SHADER_NAME": "geglu_f32", + "REPLS": { + "TYPE" : "f32", + }, + "DECLS": ["NO_SPLIT", "GEGLU"] + }, + { + "SHADER_NAME": "geglu_f32_split", + "REPLS": { + "TYPE" : "f32", + }, + "DECLS": ["SPLIT", "GEGLU"] + }, + { + "SHADER_NAME": "geglu_f16", + "REPLS": { + "TYPE" : "f16", + }, + "DECLS": ["NO_SPLIT", "GEGLU"] + }, + { + "SHADER_NAME": "geglu_f16_split", + "REPLS": { + "TYPE" : "f16", + }, + "DECLS": ["SPLIT", "GEGLU"] + }, + { + "SHADER_NAME": "swiglu_f32", + "REPLS": { + "TYPE" : "f32", + }, + "DECLS": ["NO_SPLIT", "SWIGLU"] + }, + { + "SHADER_NAME": "swiglu_f32_split", + "REPLS": { + "TYPE" : "f32", + }, + "DECLS": ["SPLIT", "SWIGLU"] + }, + { + "SHADER_NAME": "swiglu_f16", + "REPLS": { + "TYPE" : "f16", + }, + "DECLS": ["NO_SPLIT", "SWIGLU"] + }, + { + "SHADER_NAME": "swiglu_f16_split", + "REPLS": { + "TYPE" : "f16", + }, + "DECLS": ["SPLIT", "SWIGLU"] + }, + { + "SHADER_NAME": "swiglu_oai_f32", + "REPLS": { + "TYPE" : "f32", + }, + "DECLS": ["NO_SPLIT", "SWIGLU_OAI"] + }, + { + "SHADER_NAME": "swiglu_oai_f32_split", + "REPLS": { + "TYPE" : "f32", + }, + "DECLS": ["SPLIT", "SWIGLU_OAI"] + }, + { + "SHADER_NAME": "geglu_erf_f32", + "REPLS": { + "TYPE" : "f32", + }, + "DECLS": ["NO_SPLIT", "GEGLU_ERF"] + }, + { + "SHADER_NAME": "geglu_erf_f32_split", + "REPLS": { + "TYPE" : "f32", + }, + "DECLS": ["SPLIT", "GEGLU_ERF"] + }, + { + "SHADER_NAME": "geglu_erf_f16", + "REPLS": { + "TYPE" : "f16", + }, + "DECLS": ["NO_SPLIT", "GEGLU_ERF"] + }, + { + "SHADER_NAME": "geglu_erf_f16_split", + "REPLS": { + "TYPE" : "f16", + }, + "DECLS": ["SPLIT", "GEGLU_ERF"] + }, + { + "SHADER_NAME": "geglu_quick_f32", + "REPLS": { + "TYPE" : "f32", + }, + "DECLS": ["NO_SPLIT", "GEGLU_QUICK"] + }, + { + "SHADER_NAME": "geglu_quick_f32_split", + "REPLS": { + "TYPE" : "f32", + }, + "DECLS": ["SPLIT", "GEGLU_QUICK"] + }, + { + "SHADER_NAME": "geglu_quick_f16", + "REPLS": { + "TYPE" : "f16", + }, + "DECLS": ["NO_SPLIT", "GEGLU_QUICK"] + }, + { + "SHADER_NAME": "geglu_quick_f16_split", + "REPLS": { + "TYPE" : "f16", + }, + "DECLS": ["SPLIT", "GEGLU_QUICK"] + }, +] + +#end(VARIANTS) + +#define(DECLS) + +#decl(REGLU) +fn op(a: {{TYPE}}, b: {{TYPE}}) -> {{TYPE}} { + return max(a, 0) * b; +} +#enddecl(REGLU) + +#decl(GEGLU) +const SQRT_2_OVER_PI: {{TYPE}} = 0.79788456080286535587989211986876; +const GELU_COEF_A: {{TYPE}} = 0.044715; + +fn op(a: {{TYPE}}, b: {{TYPE}}) -> {{TYPE}} { + let val = SQRT_2_OVER_PI * a * (1.0 + GELU_COEF_A * a * a); + return 0.5 * a * (2.0 - 2.0 / (exp(2 * val) + 1)) * b; +} +#enddecl(GEGLU) + +#decl(SWIGLU) +fn op(a: {{TYPE}}, b: {{TYPE}}) -> {{TYPE}} { + return a / (1.0 + exp(-a)) * b; +} +#enddecl(SWIGLU) + +#decl(SWIGLU_OAI) +fn op(a: f32, b: f32) -> f32 { + let xi = min(a, params.limit); + let gi = max(min(b, params.limit), -params.limit); + var out_glu = xi / (1.0 + exp(-xi * params.alpha)); + out_glu = out_glu * (1.0 + gi); + return out_glu; +} +#enddecl(SWIGLU_OAI) + +#decl(GEGLU_ERF) +const p_erf: {{TYPE}} = 0.3275911; +const a1_erf: {{TYPE}} = 0.254829592; +const a2_erf: {{TYPE}} = -0.284496736; +const a3_erf: {{TYPE}} = 1.421413741; +const a4_erf: {{TYPE}} = -1.453152027; +const a5_erf: {{TYPE}} = 1.061405429; +const SQRT_2_INV: {{TYPE}} = 0.7071067811865476; + +fn op(a: {{TYPE}}, b: {{TYPE}}) -> {{TYPE}} { + let a_div_sqr2 = a * SQRT_2_INV; + let sign_x = sign(a_div_sqr2); + let x = abs(a_div_sqr2); + let t = 1.0 / (1.0 + p_erf * x); + let y = 1.0 - (((((a5_erf * t + a4_erf) * t + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x)); + let erf_approx = sign_x * y; + return 0.5 * a * (1.0 + erf_approx) * b; +} +#enddecl(GEGLU_ERF) + +#decl(GEGLU_QUICK) +const GELU_QUICK_COEF: {{TYPE}} = -1.702; + +fn op(a: {{TYPE}}, b: {{TYPE}}) -> {{TYPE}} { + return a * (1.0 / (1.0 + exp(GELU_QUICK_COEF * a))) * b; +} +#enddecl(GEGLU_QUICK) + +#decl(NO_SPLIT) +@group(0) @binding(1) +var<storage, read_write> dst: array<{{TYPE}}>; + +@group(0) @binding(2) +var<uniform> params: Params; + +fn a_value(base: u32) -> {{TYPE}} { + let offset: u32 = select(0, params.ne0, params.swapped != 0); + return src0[base + offset]; +} + +fn b_value(base: u32) -> {{TYPE}} { + let offset: u32 = select(params.ne0, 0, params.swapped != 0); + return src0[base + offset]; +} +#enddecl(NO_SPLIT) + +#decl(SPLIT) +@group(0) @binding(1) +var<storage, read_write> src1: array<{{TYPE}}>; + +@group(0) @binding(2) +var<storage, read_write> dst: array<{{TYPE}}>; + +@group(0) @binding(3) +var<uniform> params: Params; + +fn a_value(base: u32) -> {{TYPE}} { + return src0[base]; +} + +fn b_value(base: u32) -> {{TYPE}} { + return src1[base]; +} +#enddecl(SPLIT) + +#end(DECLS) + +#define(SHADER) + +enable f16; + +struct Params { + offset_src0: u32, + offset_src1: u32, + offset_dst: u32, + + // Strides (in elements) + stride_src01: u32, + stride_src02: u32, + stride_src03: u32, + + stride_src11: u32, + stride_src12: u32, + stride_src13: u32, + + stride_dst1: u32, + stride_dst2: u32, + stride_dst3: u32, + + // shape of dst + ne: u32, + ne0: u32, + ne1: u32, + ne2: u32, + + swapped: u32, + alpha: f32, + limit: f32, +} + +@group(0) @binding(0) +var<storage, read_write> src0: array<{{TYPE}}>; + +DECLS + +override wg_size: u32; +@compute @workgroup_size(wg_size) +fn main(@builtin(global_invocation_id) gid: vec3<u32>) { + if (gid.x >= params.ne) { + return; + } + + var i = gid.x; + let i3 = i / (params.ne2 * params.ne1 * params.ne0); + i = i % (params.ne2 * params.ne1 * params.ne0); + let i2 = i / (params.ne1 * params.ne0); + i = i % (params.ne1 * params.ne0); + let i1 = i / params.ne0; + let i0 = i % params.ne0; + + let i_a = params.offset_src0 + i3 * params.stride_src03 + i2 * params.stride_src02 + i1 * params.stride_src01 + i0; + let i_b = params.offset_src1 + i3 * params.stride_src13 + i2 * params.stride_src12 + i1 * params.stride_src11 + i0; + let i_dst = params.offset_dst + i3 * params.stride_dst3 + i2 * params.stride_dst2 + i1 * params.stride_dst1 + i0; + + dst[i_dst] = op(a_value(i_a), b_value(i_b)); +} + +#end(SHADER) diff --git a/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl new file mode 100644 index 0000000..194d2d6 --- /dev/null +++ b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl @@ -0,0 +1,40 @@ +@group(0) @binding(0) +var<storage, read_write> output_buffer: array<u32>; + +struct Params { + offset: u32, // in bytes + size: u32, // in bytes + value: u32, // 4 8-bit values, which are either repeating (memset_tensor) or may be separate (cleaning up unaligned set_tensor operations) +}; + +@group(0) @binding(1) +var<uniform> params: Params; + +override wg_size: u32; +override bytes_per_thread: u32; + +@compute @workgroup_size(wg_size) +fn main(@builtin(global_invocation_id) gid: vec3<u32>) { + let i = gid.x * bytes_per_thread; + let start = params.offset; + let end = params.offset + params.size; + + for (var j: u32 = 0u; j < bytes_per_thread; j += 4) { + let byte_index = start + i + j; + if (byte_index + 4 <= end) { + output_buffer[byte_index >> 2] = params.value; + } else { + // Handle tail (unaligned) + for (var k: u32 = 0; k < 4; k++) { + let idx = byte_index + k; + if (idx < end) { + let word_idx = idx >> 2; + let bit_offset = (idx & 3) * 8u; + let mask = ~(0xffu << bit_offset); + let existing = output_buffer[word_idx]; + output_buffer[word_idx] = (existing & mask) | (params.value & (0xffu << bit_offset)); + } + } + } + } +} diff --git a/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl new file mode 100644 index 0000000..0f8e6e5 --- /dev/null +++ b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl @@ -0,0 +1,907 @@ +#define(VARIANTS) + +[ + { + "REPLS": { + "SRC0_TYPE" : "f32", + "SRC1_TYPE" : "f32", + "BLOCK_SIZE" : 1 + }, + "DECLS" : ["FLOAT"] + }, + { + "REPLS": { + "SRC0_TYPE" : "f16", + "SRC1_TYPE" : "f16", + "BLOCK_SIZE" : 1 + }, + "DECLS" : ["FLOAT"] + }, + { + "REPLS": { + "SRC0_TYPE" : "f16", + "SRC1_TYPE" : "f32", + "BLOCK_SIZE" : 1 + }, + "DECLS" : ["FLOAT"] + }, + { + "REPLS": { + "SRC0_TYPE": "q4_0", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 32 + }, + "DECLS": ["BYTE_HELPERS", "Q4_0_T", "Q4_0"] + }, + { + "REPLS": { + "SRC0_TYPE": "q4_1", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 32 + }, + "DECLS": ["BYTE_HELPERS", "Q4_1_T", "Q4_1"] + }, + { + "REPLS": { + "SRC0_TYPE": "q5_0", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 32 + }, + "DECLS": ["BYTE_HELPERS", "Q5_0_T", "Q5_0"] + }, + { + "REPLS": { + "SRC0_TYPE": "q5_1", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 32 + }, + "DECLS": ["BYTE_HELPERS", "Q5_1_T", "Q5_1"] + }, + { + "REPLS": { + "SRC0_TYPE": "q8_0", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 32 + }, + "DECLS": ["BYTE_HELPERS", "Q8_0_T", "Q8_0"] + }, + { + "REPLS": { + "SRC0_TYPE": "q2_k", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "Q2_K_T", "Q2_K"] + }, + { + "REPLS": { + "SRC0_TYPE": "q3_k", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "Q3_K_T", "Q3_K"] + }, + { + "REPLS": { + "SRC0_TYPE": "q4_k", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["Q45_K_SCALE_MIN", "BYTE_HELPERS", "Q4_K_T", "Q4_K"] + }, + { + "REPLS": { + "SRC0_TYPE": "q5_k", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["Q45_K_SCALE_MIN", "BYTE_HELPERS", "Q5_K_T", "Q5_K"] + }, + { + "REPLS": { + "SRC0_TYPE": "q6_k", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "Q6_K_T", "Q6_K"] + }, + { + "REPLS": { + "SRC0_TYPE": "iq2_xxs", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "IQ23_TABLES", "IQ2_XXS_GRID", "IQ2_XXS_T", "IQ2_XXS"] + }, + { + "REPLS": { + "SRC0_TYPE": "iq2_xs", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "IQ23_TABLES", "IQ2_XS_GRID", "IQ2_XS_T", "IQ2_XS"] + }, + { + "REPLS": { + "SRC0_TYPE": "iq2_s", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "IQ23_TABLES", "IQ2_S_GRID", "IQ2_S_T", "IQ2_S"] + }, + { + "REPLS": { + "SRC0_TYPE": "iq3_xxs", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "IQ23_TABLES", "IQ3_XSS_GRID", "IQ3_XSS_T", "IQ3_XSS"] + }, + { + "REPLS": { + "SRC0_TYPE": "iq3_s", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "IQ23_TABLES", "IQ3_S_GRID", "IQ3_S_T", "IQ3_S"] + }, + { + "REPLS": { + "SRC0_TYPE": "iq1_s", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "IQ1_GRID", "IQ1_S_T", "IQ1_S"] + }, + { + "REPLS": { + "SRC0_TYPE": "iq1_m", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "IQ1_GRID", "IQ1_M_T", "IQ1_M"] + }, + { + "REPLS": { + "SRC0_TYPE": "iq4_nl", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 32, + }, + "DECLS": ["BYTE_HELPERS", "IQ4_GRID", "IQ4_NL_T", "IQ4_NL"] + }, + { + "REPLS": { + "SRC0_TYPE": "iq4_xs", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256, + }, + "DECLS": ["BYTE_HELPERS", "IQ4_GRID", "IQ4_XS_T", "IQ4_XS"] + } +] + +#end(VARIANTS) + +#define(DECLS) + +#decl(FLOAT) +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + return f32(src0[src0_idx_base + offset]) * f32(src1[src1_idx_base + offset]); +} +#enddecl(FLOAT) + +#decl(Q4_0) +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block_q4_0 = src0[src0_idx_base + offset]; + let d = f32(block_q4_0.d); + var sum: f32 = 0.0; + for (var j: u32 = 0; j < 4; j++) { + let q_packed = bitcast<u32>(vec2(block_q4_0.qs[2 * j], block_q4_0.qs[2 * j + 1])); + for (var k: u32 = 0; k < 4; k++) { + let q_byte = get_byte(q_packed, k); + let q_hi = (f32((q_byte >> 4) & 0xF) - 8.0f) * d; + let q_lo = (f32(q_byte & 0xF) - 8.0f) * d; + let src1_offset = src1_idx_base + offset * 32 + j * 4 + k; + sum += q_lo * f32(src1[src1_offset]); + sum += q_hi * f32(src1[src1_offset + 16]); + } + } + return sum; +} +#enddecl(Q4_0) + +#decl(Q4_1) +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block_q4_1 = src0[src0_idx_base + offset]; + let d = f32(block_q4_1.d); + let m = f32(block_q4_1.m); + var sum: f32 = 0.0; + for (var j: u32 = 0; j < 4; j++) { + let q_packed = block_q4_1.qs[j]; + for (var k: u32 = 0; k < 4; k++) { + let q_byte = get_byte(q_packed, k); + let q_hi = f32((q_byte >> 4) & 0xF) * d + m; + let q_lo = f32(q_byte & 0xF) * d + m; + let src1_offset = src1_idx_base + offset * 32 + j * 4 + k; + sum += q_lo * f32(src1[src1_offset]); + sum += q_hi * f32(src1[src1_offset + 16]); + } + } + return sum; +} +#enddecl(Q4_1) + +#decl(Q5_0) +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block_q5_0 = src0[src0_idx_base + offset]; + let d = f32(block_q5_0.d); + var sum: f32 = 0.0; + let qh_packed = bitcast<u32>(vec2(block_q5_0.qh[0], block_q5_0.qh[1])); + for (var j: u32 = 0; j < 4; j++) { + let q_packed = bitcast<u32>(vec2(block_q5_0.qs[2 * j], block_q5_0.qs[2 * j + 1])); + for (var k: u32 = 0; k < 4; k++) { + let q_byte = get_byte(q_packed, k); + let qh_hi = (qh_packed >> (j * 4 + k + 12)) & 0x10; + let q_hi = (f32(((q_byte >> 4) & 0xF) | qh_hi) - 16.0) * d; + let qh_lo = ((qh_packed >> (j * 4 + k)) << 4) & 0x10; + let q_lo = (f32((q_byte & 0xF) | qh_lo) - 16.0) * d; + let src1_offset = src1_idx_base + offset * 32 + j * 4 + k; + sum += q_lo * f32(src1[src1_offset]); + sum += q_hi * f32(src1[src1_offset + 16]); + } + } + return sum; +} +#enddecl(Q5_0) + +#decl(Q5_1) +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block_q5_1 = src0[src0_idx_base + offset]; + let d = f32(block_q5_1.d); + let m = f32(block_q5_1.m); + var sum: f32 = 0.0; + for (var j: u32 = 0; j < 4; j++) { + let q_packed = block_q5_1.qs[j]; + for (var k: u32 = 0; k < 4; k++) { + let q_byte = get_byte(q_packed, k); + let qh_hi = (block_q5_1.qh >> (j * 4 + k + 12)) & 0x10; + let q_hi = f32(((q_byte >> 4) & 0xF) | qh_hi) * d + m; + let qh_lo = ((block_q5_1.qh >> (j * 4 + k)) << 4) & 0x10; + let q_lo = f32((q_byte & 0xF) | qh_lo) * d + m; + let src1_offset = src1_idx_base + offset * 32 + j * 4 + k; + sum += q_lo * f32(src1[src1_offset]); + sum += q_hi * f32(src1[src1_offset + 16]); + } + } + return sum; +} +#enddecl(Q5_1) + +#decl(Q8_0) +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block_q8_0 = src0[src0_idx_base + offset]; + let d = f32(block_q8_0.d); + var sum: f32 = 0.0; + for (var j: u32 = 0; j < 8; j++) { + let q_packed = bitcast<u32>(vec2(block_q8_0.qs[2 * j], block_q8_0.qs[2 * j + 1])); + for (var k: u32 = 0; k < 4; k++) { + let q_byte = get_byte_i32(q_packed, k); + let q_val = f32(q_byte) * d; + let src1_offset = src1_idx_base + offset * 32 + j * 4 + k; + sum += q_val * f32(src1[src1_offset]); + } + } + return sum; +} +#enddecl(Q8_0) + +#decl(Q8_1) +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block_q8_1 = src0[src0_idx_base + offset]; + let d = f32(block_q8_1.d); + let m = f32(block_q8_1.m); + var sum: f32 = 0.0; + for (var j: u32 = 0; j < 8; j++) { + let q_packed = block_q8_1.qs[j]; + for (var k: u32 = 0; k < 4; k++) { + let q_byte = get_byte_i32(q_packed, k); + let q_val = f32(q_byte) * d + m; + let src1_offset = src1_idx_base + offset * 32 + j * 4 + k; + sum += q_val * f32(src1[src1_offset]); + } + } + return sum; +} +#enddecl(Q8_1) + +#decl(Q2_K) +// 16 blocks of 16 elements each +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + let m = f32(block.dmin); + var sum = 0.0; + var src1_i = src1_idx_base + offset * 256; + var is: u32 = 0; + // 2 halves of the block (128 elements each) + for (var q_b_idx: u32 = 0; q_b_idx < 64; q_b_idx += 32) { + // 4 groups (each group has 2 blocks of 16 elements) + for (var shift: u32 = 0; shift < 8; shift += 2) { + // 2 blocks + for (var k: u32 = 0; k < 32; k += 16) { + let sc = get_byte(block.scales[is / 4], is % 4); + is++; + let dl = d * f32(sc & 0xF); + let ml = m * f32(sc >> 4); + for (var l: u32 = 0u; l < 16; l++) { + let q_idx = q_b_idx + k + l; + let q_byte = get_byte(block.qs[q_idx / 4], q_idx % 4); + let qs_val = (q_byte >> shift) & 3; + sum += (f32(qs_val) * dl - ml) * src1[src1_i]; + src1_i++; + } + } + } + } + return sum; +} + +#enddecl(Q2_K) + +#decl(Q3_K) +// 16 blocks of 16 elements each +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + + // extract 6-bit scales, which consist of 4-bits from first 8 bytes of scale, + // and 2-bits from the last 4 bytes + let kmask1: u32 = 0x03030303; + let kmask2: u32 = 0x0f0f0f0f; + var scale_vals: array<u32, 4>; + for (var i: u32 = 0; i < 4; i++) { + scale_vals[i] = bitcast<u32>(vec2(block.scales[2 * i], block.scales[2 * i + 1])); + } + var tmp: u32 = scale_vals[2]; + scale_vals[2] = ((scale_vals[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + scale_vals[3] = ((scale_vals[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + scale_vals[0] = (scale_vals[0] & kmask2) | ((tmp & kmask1) << 4); + scale_vals[1] = (scale_vals[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + + // convert arrays of f16 -> u32 + var hmask_vals: array<u32, 8>; + for (var i: u32 = 0; i < 8; i++) { + hmask_vals[i] = bitcast<u32>(vec2(block.hmask[2 * i], block.hmask[2 * i + 1])); + } + var qs_vals: array<u32, 16>; + for (var i: u32 = 0; i < 16; i++) { + qs_vals[i] = bitcast<u32>(vec2(block.qs[2 * i], block.qs[2 * i + 1])); + } + + var sum = 0.0; + var src1_i = src1_idx_base + offset * 256; + var is: u32 = 0; + var m: u32 = 1; + // 2 halves of the block (128 elements each) + for (var q_b_idx: u32 = 0; q_b_idx < 64; q_b_idx += 32) { + // 4 groups (each group has 2 blocks of 16 elements) + for (var shift: u32 = 0; shift < 8; shift += 2) { + // 2 blocks + for (var k: u32 = 0; k < 32; k += 16) { + let sc = get_byte(scale_vals[is / 4], is % 4); + is++; + let dl = d * (f32(sc) - 32.0); + for (var l: u32 = 0u; l < 16u; l++) { + let q_idx = q_b_idx + k + l; + let hm_idx = k + l; + let q_byte = get_byte(qs_vals[q_idx / 4], q_idx % 4); + let hmask_byte = get_byte(hmask_vals[hm_idx / 4], hm_idx % 4); + let hm = select(4.0, 0.0, (hmask_byte & m) != 0); + let qs_val = (q_byte >> shift) & 3; + sum += ((f32(qs_val) - hm) * dl) * src1[src1_i]; + src1_i++; + } + } + m <<= 1; + } + } + return sum; +} + +#enddecl(Q3_K) + +#decl(Q4_K) +// 8 blocks of 32 elements each +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + let m = f32(block.dmin); + var sum = 0.0; + var src1_i = src1_idx_base + offset * 256; + var is: u32 = 0; + // 2 blocks each iteration + for (var q_b_idx: u32 = 0; q_b_idx < 128; q_b_idx += 32) { + for (var shift: u32 = 0; shift < 8; shift += 4) { + let scale_min = get_scale_min(is, block.scales); + is++; + let dl = d * scale_min.x; + let ml = m * scale_min.y; + for (var l: u32 = 0; l < 32; l++) { + let q_idx = q_b_idx + l; + let q_byte = get_byte(block.qs[q_idx / 4], q_idx % 4); + let qs_val = (q_byte >> shift) & 0xF; + sum += (f32(qs_val) * dl - ml) * src1[src1_i]; + src1_i++; + } + } + } + return sum; +} + +#enddecl(Q4_K) + +#decl(Q5_K) +// 8 blocks of 32 elements each +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + let m = f32(block.dmin); + var sum = 0.0; + var src1_i = src1_idx_base + offset * 256; + var is: u32 = 0; + var u: u32 = 1; + // 2 blocks each iteration + for (var q_b_idx: u32 = 0; q_b_idx < 128; q_b_idx += 32) { + for (var shift: u32 = 0; shift < 8; shift += 4) { + let scale_min = get_scale_min(is, block.scales); + is++; + let dl = d * scale_min.x; + let ml = m * scale_min.y; + for (var l: u32 = 0; l < 32; l++) { + let q_idx = q_b_idx + l; + let q_byte = get_byte(block.qs[q_idx / 4], q_idx % 4); + let qh_byte = get_byte(block.qh[l / 4], l % 4); + let qs_val = (q_byte >> shift) & 0xF; + let qh_val = select(0.0, 16.0, (qh_byte & u) != 0); + sum += ((f32(qs_val) + qh_val) * dl - ml) * src1[src1_i]; + src1_i++; + } + u <<= 1; + } + } + return sum; +} + +#enddecl(Q5_K) + +#decl(Q6_K) +// 16 blocks of 16 elements each +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + + // convert arrays of f16 -> u32 + var ql_vals: array<u32, 32>; + for (var i: u32 = 0; i < 32; i++) { + ql_vals[i] = bitcast<u32>(vec2(block.ql[2 * i], block.ql[2 * i + 1])); + } + var qh_vals: array<u32, 16>; + for (var i: u32 = 0; i < 16; i++) { + qh_vals[i] = bitcast<u32>(vec2(block.qh[2 * i], block.qh[2 * i + 1])); + } + var scale_vals: array<u32, 4>; + for (var i: u32 = 0; i < 4; i++) { + scale_vals[i] = bitcast<u32>(vec2(block.scales[2 * i], block.scales[2 * i + 1])); + } + + var sum = 0.0; + var src1_i = src1_idx_base + offset * 256; + var qh_b_idx: u32 = 0; + var sc_b_idx: u32 = 0; + for (var ql_b_idx: u32 = 0; ql_b_idx < 128; ql_b_idx += 64) { + for (var l: u32 = 0; l < 32; l++) { + let ql13_b = get_byte(ql_vals[(ql_b_idx + l) / 4], (ql_b_idx + l) % 4); + let ql24_b = get_byte(ql_vals[(ql_b_idx + l + 32) / 4], (ql_b_idx + l + 32) % 4); + let qh_b = get_byte(qh_vals[(qh_b_idx + l) / 4], (qh_b_idx + l) % 4); + + let q1 = f32((ql13_b & 0xF) | ((qh_b & 3) << 4)) - 32.0; + let q2 = f32((ql24_b & 0xF) | (((qh_b >> 2) & 3) << 4)) - 32.0; + let q3 = f32((ql13_b >> 4) | (((qh_b >> 4) & 3) << 4)) - 32.0; + let q4 = f32((ql24_b >> 4) | (((qh_b >> 6) & 3) << 4)) - 32.0; + + let is = l/16; + let is1 = sc_b_idx + is; + let sc1 = get_byte_i32(scale_vals[is1 / 4], is1 % 4); + let is2 = sc_b_idx + is + 2; + let sc2 = get_byte_i32(scale_vals[is2 / 4], is2 % 4); + let is3 = sc_b_idx + is + 4; + let sc3 = get_byte_i32(scale_vals[is3 / 4], is3 % 4); + let is4 = sc_b_idx + is + 6; + let sc4 = get_byte_i32(scale_vals[is4 / 4], is4 % 4); + + sum += d * f32(sc1) * q1 * src1[src1_i + l]; + sum += d * f32(sc2) * q2 * src1[src1_i + l + 32]; + sum += d * f32(sc3) * q3 * src1[src1_i + l + 64]; + sum += d * f32(sc4) * q4 * src1[src1_i + l + 96]; + } + src1_i += 128; + qh_b_idx += 32; + sc_b_idx += 8; + } + return sum; +} + +#enddecl(Q6_K) + +#decl(IQ2_XXS) +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + var src1_i = src1_idx_base + offset * 256; + var sum = 0.0; + for (var ib: u32 = 0; ib < 32; ib += 4) { + let aux0 = bitcast<u32>(vec2(block.qs[ib], block.qs[ib + 1])); + let aux1 = bitcast<u32>(vec2(block.qs[ib + 2], block.qs[ib + 3])); + let db = d * (0.5 + f32(aux1 >> 28)) * 0.25; + for (var l: u32 = 0; l < 4; l++) { + let ig = get_byte(aux0, l) * 8; + let is = (aux1 >> (7 * l)) & 127; + let signs = get_byte(ksigns_iq2xs[is / 4], is % 4); + for (var j: u32 = 0; j < 8; j++) { + let g = get_byte(iq2xxs_grid[(ig + j) / 4], (ig + j) % 4); + let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4], j % 4) & signs) != 0); + sum += db * f32(g) * m * src1[src1_i]; + src1_i++; + } + } + } + return sum; +} + +#enddecl(IQ2_XXS) + +#decl(IQ2_XS) +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + var src1_i = src1_idx_base + offset * 256; + var scale_vals = array<u32, 2>( + bitcast<u32>(vec2(block.scales[0], block.scales[1])), + bitcast<u32>(vec2(block.scales[2], block.scales[3])) + ); + var sum = 0.0; + for (var ib: u32 = 0; ib < 32; ib += 4) { + let s = get_byte(scale_vals[ib / 16], (ib % 16) / 4); + let db = array<f32, 2>( + d * (0.5 + f32(s & 0xF)) * 0.25, + d * (0.5 + f32(s >> 4)) * 0.25 + ); + for (var l: u32 = 0; l < 4; l++) { + let qs_val = bitcast<u32>(vec2(block.qs[ib + l], 0.0)); + let ig = (qs_val & 511) * 8; + let is = qs_val >> 9; + let signs = get_byte(ksigns_iq2xs[is / 4], is % 4); + let dl = db[l/2]; + for (var j: u32 = 0; j < 8; j++) { + let g = get_byte(iq2xs_grid[(ig + j) / 4], (ig + j) % 4); + let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4], j % 4) & signs) != 0); + sum += dl * f32(g) * m * src1[src1_i]; + src1_i++; + } + } + } + return sum; +} + +#enddecl(IQ2_XS) + +#decl(IQ2_S) +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + var src1_i = src1_idx_base + offset * 256; + var qs_vals : array<u32, 16>; + for (var i: u32 = 0; i < 16; i++) { + qs_vals[i] = bitcast<u32>(vec2(block.qs[i * 2], block.qs[i * 2 + 1])); + } + var qh_vals = array<u32, 2>( + bitcast<u32>(vec2(block.qh[0], block.qh[1])), + bitcast<u32>(vec2(block.qh[2], block.qh[3])) + ); + var scale_vals = array<u32, 2>( + bitcast<u32>(vec2(block.scales[0], block.scales[1])), + bitcast<u32>(vec2(block.scales[2], block.scales[3])) + ); + var sum = 0.0; + for (var ib: u32 = 0; ib < 8; ib ++) { + let s = get_byte(scale_vals[ib / 4], ib % 4); + let db = array<f32, 2>( + d * (0.5 + f32(s & 0xF)) * 0.25, + d * (0.5 + f32(s >> 4)) * 0.25 + ); + let qs_w = qs_vals[ib]; + for (var l: u32 = 0; l < 4; l++) { + let qh_b = (get_byte(qh_vals[ib / 4], ib % 4) << (8 - 2 * l)) & 0x300; + let ig = (get_byte(qs_w, l) | qh_b) * 8; + let signs = get_byte(qs_vals[ib + 8], l); + let dl = db[l/2]; + for (var j: u32 = 0; j < 8; j++) { + let g = get_byte(iq2s_grid[(ig + j) / 4], (ig + j) % 4); + let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4], j % 4) & signs) != 0); + sum += dl * f32(g) * m * src1[src1_i]; + src1_i++; + } + } + } + return sum; +} + + +#enddecl(IQ2_S) + +#decl(IQ3_XSS) +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + var src1_i = src1_idx_base + offset * 256; + var sum = 0.0; + for (var ib: u32 = 0; ib < 16; ib += 2) { + let sc_sign = bitcast<u32>(vec2(block.qs[ib + 32], block.qs[ib + 33])); + let db = d * (0.5 + f32(sc_sign >> 28)) * 0.5; + for (var l: u32 = 0; l < 4; l++) { + let is = (sc_sign >> (7 * l)) & 127; + let signs = get_byte(ksigns_iq2xs[is / 4], is % 4); + let ig_val = bitcast<u32>(vec2(block.qs[ib * 2 + l], 0.0)); + let ig1 = get_byte(ig_val, 0); + let ig2 = get_byte(ig_val, 1); + for (var j: u32 = 0; j < 4; j++) { + let g1 = get_byte(iq3xxs_grid[ig1], j); + let g2 = get_byte(iq3xxs_grid[ig2], j); + let m1 = select(1.0, -1.0, (get_byte(kmask_iq2xs[0], j) & signs) != 0); + let m2 = select(1.0, -1.0, (get_byte(kmask_iq2xs[1], j) & signs) != 0); + sum += db * f32(g1) * m1 * src1[src1_i]; + sum += db * f32(g2) * m2 * src1[src1_i + 4]; + src1_i++; + } + src1_i += 4; + } + } + return sum; +} + +#enddecl(IQ3_XSS) + +#decl(IQ3_S) +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + var src1_i = src1_idx_base + offset * 256; + var qh_vals = array<u32, 2>( + bitcast<u32>(vec2(block.qh[0], block.qh[1])), + bitcast<u32>(vec2(block.qh[2], block.qh[3])) + ); + var sign_vals: array<u32, 8>; + for (var i: u32 = 0; i < 8; i++) { + sign_vals[i] = bitcast<u32>(vec2(block.signs[i * 2], block.signs[i * 2 + 1])); + } + var scale_vals = bitcast<u32>(vec2(block.scales[0], block.scales[1])); + var sum = 0.0; + for (var ib: u32 = 0; ib < 4; ib++) { + let s = get_byte(scale_vals, ib); + let db = array<f32, 2>( + d * (1.0 + 2.0 * f32(s & 0xF)), + d * (1.0 + 2.0 * f32(s >> 4)) + ); + for (var k: u32 = 0; k < 2; k++) { + let dl = db[k]; + let qh_byte = get_byte(qh_vals[ib / 2], (ib % 2) * 2 + k); + let sign_w = sign_vals[ib * 2 + k]; + for (var l: u32 = 0; l < 4; l++) { + let signs = get_byte(sign_w, l); + let ig_val = bitcast<u32>(vec2(block.qs[ib * 8 + k * 4 + l], 0.0)); + let ig1 = get_byte(ig_val, 0) | ((qh_byte << ((8 - (2 * l)))) & 256); + let ig2 = get_byte(ig_val, 1) | ((qh_byte << ((7 - (2 * l)))) & 256); + for (var j: u32 = 0; j < 4; j++) { + let g1 = get_byte(iq3s_grid[ig1], j); + let g2 = get_byte(iq3s_grid[ig2], j); + let m1 = select(1.0, -1.0, (get_byte(kmask_iq2xs[0], j) & signs) != 0); + let m2 = select(1.0, -1.0, (get_byte(kmask_iq2xs[1], j) & signs) != 0); + sum += dl * f32(g1) * m1 * src1[src1_i]; + sum += dl * f32(g2) * m2 * src1[src1_i + 4]; + src1_i++; + } + src1_i += 4; + } + } + } + return sum; +} +#enddecl(IQ3_S) + +#decl(IQ1_S) +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + var src1_i = src1_idx_base + offset * 256; + var sum = 0.0; + for (var ib: u32 = 0; ib < 8; ib++) { + let qh = bitcast<u32>(vec2(block.qh[ib], 0.0)); + let dl = d * (2 * f32((qh >> 12) & 7) + 1); + let delta = select(IQ1_DELTA, -IQ1_DELTA, (qh & 0x8000) != 0); + let qs_w = bitcast<u32>(vec2(block.qs[ib * 2], block.qs[ib * 2 + 1])); + for (var l: u32 = 0; l < 4; l++) { + let ig = (get_byte(qs_w, l) | (((qh >> (3 * l)) & 7) << 8)) * 8; + for (var j: u32 = 0; j < 8; j++) { + let gw = iq1_grid[(ig + j) / 16]; + let g = (gw >> (((ig + j) % 16) * 2)) & 3; + let gs = bitcast<i32>(g << 30) >> 30; + sum += dl * (f32(gs) + delta) * src1[src1_i]; + src1_i++; + } + } + } + return sum; +} + +#enddecl(IQ1_S) + +#decl(IQ1_M) +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + + let scale = ((block.scales[0] >> 12) & 0xF) | ((block.scales[0] >> 24) & 0x00F0) | ((block.scales[1] >> 4) & 0x0F00) | ((block.scales[1] >> 16) & 0xF000); + let d = f32(bitcast<vec2<f16>>(scale).x); + var src1_i = src1_idx_base + offset * 256; + var sum = 0.0; + for (var ib: u32 = 0; ib < 8; ib++) { + let sw = (block.scales[ib / 4] >> (16 * ((ib / 2) % 2))) & 0xFFFF; + let s1 : u32 = (sw >> (6 * (ib % 2))) & 0x7; + let s2 : u32 = (sw >> (6 * (ib % 2) + 3)) & 0x7; + var dl = array<f32, 2>( + d * f32(2 * s1 + 1), + d * f32(2 * s2 + 1) + ); + + let qh = block.qh[ib / 2] >> (16 * (ib % 2)); + var idx = array<u32, 4>( + get_byte(block.qs[ib], 0) | ((qh << 8) & 0x700), + get_byte(block.qs[ib], 1) | ((qh << 4) & 0x700), + get_byte(block.qs[ib], 2) | ((qh) & 0x700), + get_byte(block.qs[ib], 3) | ((qh >> 4) & 0x700) + ); + var delta = array<f32, 4>( + select(IQ1_DELTA, -IQ1_DELTA, (qh & 0x08) != 0), + select(IQ1_DELTA, -IQ1_DELTA, (qh & 0x80) != 0), + select(IQ1_DELTA, -IQ1_DELTA, ((qh >> 8) & 0x08) != 0), + select(IQ1_DELTA, -IQ1_DELTA, ((qh >> 8) & 0x80) != 0) + ); + for (var l: u32 = 0; l < 4; l++) { + let ig = idx[l] * 8; + for (var j: u32 = 0; j < 8; j++) { + let gw = iq1_grid[(ig + j) / 16]; + let g = (gw >> (((ig + j) % 16) * 2)) & 3; + let gs = bitcast<i32>(g << 30) >> 30; + sum += dl[l/2] * (f32(gs) + delta[l]) * src1[src1_i]; + src1_i++; + } + } + } + return sum; +} + +#enddecl(IQ1_M) + +#decl(IQ4_NL) +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + var src1_i = src1_idx_base + offset * 32; + var sum = 0.0; + var qs: array<u32, 4>; + for (var i: u32 = 0; i < 4; i++) { + qs[i] = bitcast<u32>(vec2(block.qs[i * 2], block.qs[i * 2 + 1])); + } + for (var j: u32 = 0; j < 16; j++) { + let qsb = get_byte(qs[j / 4], j % 4); + sum += d * f32(kvalues_iq4nl[qsb & 0xF]) * src1[src1_i]; + sum += d * f32(kvalues_iq4nl[qsb >> 4]) * src1[src1_i + 16]; + src1_i++; + } + return sum; +} + +#enddecl(IQ4_NL) + +#decl(IQ4_XS) +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + let scales_h = bitcast<u32>(vec2(block.scales_h, 0.0)); + var src1_i = src1_idx_base + offset * 256; + var sum = 0.0; + for (var ib: u32 = 0; ib < 8; ib++) { + let ls = ((get_byte(block.scales_l, ib / 2) >> (4 * (ib % 2))) & 0xF) | (((scales_h >> (2 * ib)) & 3) << 4); + let dl = d * (f32(ls) - 32.0); + for (var j: u32 = 0; j < 16; j++) { + let iqs = ib * 16 + j; + let qsb = get_byte(block.qs[iqs / 4], iqs % 4); + sum += dl * f32(kvalues_iq4nl[qsb & 0xF]) * src1[src1_i]; + sum += dl * f32(kvalues_iq4nl[qsb >> 4]) * src1[src1_i + 16]; + src1_i++; + } + src1_i += 16; + } + return sum; +} + +#enddecl(IQ4_XS) + +#end(DECLS) + +#define(SHADER) + +enable f16; + +DECLS + +struct MulMatParams { + offset_src0: u32, // in elements/blocks + offset_src1: u32, // in elements/blocks + offset_dst: u32, // in elements/blocks + m: u32, + n: u32, + k: u32, + // all strides are in elements/blocks + stride_01: u32, + stride_11: u32, + stride_02: u32, + stride_12: u32, + stride_03: u32, + stride_13: u32, + + bs02: u32, + bs03: u32, + broadcast2: u32, + broadcast3: u32 +}; + +@group(0) @binding(0) var<storage, read_write> src0: array<{{SRC0_TYPE}}>; // M rows, K columns +@group(0) @binding(1) var<storage, read_write> src1: array<{{SRC1_TYPE}}>; // K rows, N columns (transposed) +@group(0) @binding(2) var<storage, read_write> dst: array<f32>; // M rows, N columns + +@group(0) @binding(3) var<uniform> params: MulMatParams; + +@compute @workgroup_size(256) +fn main(@builtin(global_invocation_id) global_id: vec3<u32>) { + let total = params.m * params.n * params.bs02 * params.broadcast2 * params.bs03 * params.broadcast3; + if (global_id.x >= total) { + return; + } + + let dst2_stride = params.m * params.n; + let dst3_stride = dst2_stride * params.bs02 * params.broadcast2; + + let dst3_idx = global_id.x / dst3_stride; + let src03_idx = dst3_idx / params.broadcast3; // src0 may be broadcast along the third dimension + let src13_idx = dst3_idx; // src1 is not broadcast + let dst3_rem = global_id.x % dst3_stride; + + let dst2_idx = dst3_rem / dst2_stride; + let src02_idx = dst2_idx / params.broadcast2; // src0 may also be broadcast along the second dimension + let src12_idx = dst2_idx; // src1 is not broadcast + + let dst2_rem = dst3_rem % dst2_stride; + + let row = dst2_rem / params.m; // output row + let col = dst2_rem % params.m; // output column + + let src0_idx_base = params.offset_src0 + src03_idx * params.stride_03 + src02_idx * params.stride_02 + col * params.stride_01; + let src1_idx_base = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12 + row * params.stride_11; + + var sum = 0.0; + for (var i: u32 = 0u; i < params.k/{{BLOCK_SIZE}}; i = i + 1u) { + sum += multiply_add(src0_idx_base, src1_idx_base, i); + } + dst[params.offset_dst + dst3_idx * dst3_stride + dst2_idx * dst2_stride + row * params.m + col] = sum; +} + +#end(SHADER) diff --git a/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl new file mode 100644 index 0000000..109ff8d --- /dev/null +++ b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl @@ -0,0 +1,97 @@ +#decl(SHMEM_VEC) +fn store_shmem(val: vec4<f16>, idx: u32) { + shmem[idx] = val.x; + shmem[idx + 1] = val.y; + shmem[idx + 2] = val.z; + shmem[idx + 3] = val.w; +} +#enddecl(SHMEM_VEC) + +#decl(SHMEM_SCALAR) +fn store_shmem(val: f16, idx: u32) { + shmem[idx] = val; +} +#enddecl(SHMEM_SCALAR) + +#decl(INIT_SRC0_SHMEM_FLOAT) + +fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) { + for (var elem_idx = thread_id * {{VEC_SIZE}}; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE * {{VEC_SIZE}}) { + let tile_m = elem_idx / TILE_K; + let tile_k = elem_idx % TILE_K; + let global_m = offset_m + tile_m; + let global_k = k_outer + tile_k; + let src0_idx = batch_offset + global_m * params.stride_01 + global_k; + let src0_val = select( // taking a slight performance hit to avoid oob + {{SRC0_TYPE}}(0.0), + src0[src0_idx/{{VEC_SIZE}}], + global_m < params.m && global_k < params.k); + store_shmem({{SHMEM_TYPE}}(src0_val), elem_idx); + } +} + +#enddecl(INIT_SRC0_SHMEM_FLOAT) + +#decl(INIT_SRC1_SHMEM) + +fn init_shmem_src1(thread_id: u32, batch_offset: u32, offset_n: u32, k_outer: u32) { + for (var elem_idx = thread_id * {{VEC_SIZE}}; elem_idx < TILE_SRC1_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE * {{VEC_SIZE}}) { + let tile_n = elem_idx / TILE_K; + let tile_k = elem_idx % TILE_K; + let global_n = offset_n + tile_n; + let global_k = k_outer + tile_k; + let src1_idx = batch_offset + global_n * params.stride_11 + global_k; + let src1_val = select( + {{SRC1_TYPE}}(0.0), + src1[src1_idx/{{VEC_SIZE}}], + global_n < params.n && global_k < params.k); + store_shmem({{SHMEM_TYPE}}(src1_val), TILE_SRC0_SHMEM + elem_idx); + } +} + +#enddecl(INIT_SRC1_SHMEM) + +#decl(INIT_SRC0_SHMEM_Q4_0) + +const BLOCK_SIZE = 32u; +// the number of blocks per k-tile. Note that this currently only works if TILE_K is a multiple of BLOCK_SIZE, which may need to be rethought for larger quantized types. +override BLOCKS_K = TILE_K/BLOCK_SIZE; +const NQ = 16u; +const F16_PER_BLOCK = 9u; // 1 scale + 8x4 packed weights +const WEIGHTS_PER_F16 = 4u; // 4 weights per f16 +const F16_PER_THREAD = NQ / WEIGHTS_PER_F16; + +fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) { + for (var i = thread_id * NQ; i < TILE_SRC0_SHMEM; i += TOTAL_WORKGROUP_SIZE * NQ) { + let blck_idx = i / BLOCK_SIZE; + let block_offset = (i % BLOCK_SIZE) / WEIGHTS_PER_F16; + let shmem_idx = blck_idx * BLOCK_SIZE + block_offset * 2u; + + let tile_m = blck_idx / BLOCKS_K; + let global_m = offset_m + tile_m; + let block_k = blck_idx % BLOCKS_K; + let global_k = k_outer / BLOCK_SIZE + block_k; + + if (global_m < params.m && global_k < params.k / BLOCK_SIZE) { + let src0_idx = batch_offset + global_m * params.stride_01 + global_k; + let scale_idx = src0_idx * F16_PER_BLOCK; + let d = src0[scale_idx]; + + for (var j = 0u; j < F16_PER_THREAD; j += 2) { + let q_0 = src0[scale_idx + 1u + block_offset + j]; + let q_1 = src0[scale_idx + 1u + block_offset + j + 1]; + + let q_packed = bitcast<u32>(vec2(q_0, q_1)); + for (var k = 0u; k < 4u; k++) { + let q_byte = get_byte(q_packed, k); + let q_hi = (f16((q_byte >> 4) & 0xF) - 8.0) * d; + let q_lo = (f16(q_byte & 0xF) - 8.0) * d; + shmem[shmem_idx + j * 2 + k] = q_lo; + shmem[shmem_idx + j * 2 + k + 16u] = q_hi; + } + } + } + } +} + +#enddecl(INIT_SRC0_SHMEM_Q4_0) diff --git a/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl new file mode 100644 index 0000000..6b1dd26 --- /dev/null +++ b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl @@ -0,0 +1,247 @@ +#define(VARIANTS) +[ + { + "SHADER_SUFFIX": "f32_f32_vec", + "REPLS": { + "SRC0_TYPE" : "vec4<f32>", + "SRC1_TYPE" : "vec4<f32>", + "DST_TYPE" : "vec4<f32>", + "SHMEM_TYPE" : "vec4<f16>", + "VEC_SIZE" : 4, + }, + "DECLS": ["VEC", "SHMEM_VEC", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"] + }, + { + "SHADER_SUFFIX": "f32_f32", + "REPLS": { + "SRC0_TYPE" : "f32", + "SRC1_TYPE" : "f32", + "DST_TYPE" : "f32", + "SHMEM_TYPE" : "f16", + "VEC_SIZE" : 1, + }, + "DECLS": ["SCALAR", "SHMEM_SCALAR", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"] + }, + { + "SHADER_SUFFIX": "f16_f32_vec", + "REPLS": { + "SRC0_TYPE" : "vec4<f16>", + "SRC1_TYPE" : "vec4<f32>", + "DST_TYPE" : "vec4<f32>", + "SHMEM_TYPE" : "vec4<f16>", + "VEC_SIZE" : 4, + }, + "DECLS": ["VEC", "SHMEM_VEC", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"] + }, + { + "SHADER_SUFFIX": "f16_f32", + "REPLS": { + "SRC0_TYPE" : "f16", + "SRC1_TYPE" : "f32", + "DST_TYPE" : "f32", + "SHMEM_TYPE" : "f16", + "VEC_SIZE" : 1, + }, + "DECLS": ["SCALAR", "SHMEM_SCALAR", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"] + }, + { + "SHADER_SUFFIX": "f16_f16_vec", + "REPLS": { + "SRC0_TYPE" : "vec4<f16>", + "SRC1_TYPE" : "vec4<f16>", + "DST_TYPE" : "vec4<f32>", + "SHMEM_TYPE" : "vec4<f16>", + "VEC_SIZE" : 4, + }, + "DECLS": ["VEC", "SHMEM_VEC", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"] + }, + { + "SHADER_SUFFIX": "f16_f16", + "REPLS": { + "SRC0_TYPE" : "f16", + "SRC1_TYPE" : "f16", + "DST_TYPE" : "f32", + "SHMEM_TYPE" : "f16", + "VEC_SIZE" : 1, + }, + "DECLS": ["SCALAR", "SHMEM_SCALAR", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"] + }, + { + "SHADER_SUFFIX": "q4_0_f32_vec", + "REPLS": { + "SRC0_TYPE" : "f16", + "SRC1_TYPE" : "vec4<f32>", + "DST_TYPE" : "vec4<f32>", + "SHMEM_TYPE" : "vec4<f16>", + "VEC_SIZE" : 4, + }, + "DECLS": ["BYTE_HELPERS", "VEC", "SHMEM_VEC", "INIT_SRC0_SHMEM_Q4_0", "INIT_SRC1_SHMEM"] + }, + { + "SHADER_SUFFIX": "q4_0_f32", + "REPLS": { + "SRC0_TYPE" : "f16", + "SRC1_TYPE" : "f32", + "DST_TYPE" : "f32", + "SHMEM_TYPE" : "f16", + "VEC_SIZE" : 1, + }, + "DECLS": ["BYTE_HELPERS", "SCALAR", "SHMEM_SCALAR", "INIT_SRC0_SHMEM_Q4_0", "INIT_SRC1_SHMEM"] + } +] + +#end(VARIANTS) + +#define(DECLS) + +#decl(VEC) +fn store_val(acc: array<array<f16, TILE_N>, TILE_M>, tn: u32, tm: u32) -> vec4<f32> { + return vec4<f32>(f32(acc[tm][tn]), f32(acc[tm + 1][tn]), f32(acc[tm + 2][tn]), f32(acc[tm + 3][tn])); +} +#enddecl(VEC) + +#decl(SCALAR) +fn store_val(acc: array<array<f16, TILE_N>, TILE_M>, tn: u32, tm: u32) -> f32 { + return f32(acc[tm][tn]); +} +#enddecl(SCALAR) + +#end(DECLS) + +#define(SHADER) +enable f16; + +struct MulMatParams { + offset_src0: u32, + offset_src1: u32, + offset_dst: u32, + m: u32, + n: u32, + k: u32, + stride_01: u32, + stride_11: u32, + stride_02: u32, + stride_12: u32, + stride_03: u32, + stride_13: u32, + bs02: u32, + bs03: u32, + broadcast2: u32, + broadcast3: u32 +}; + +@group(0) @binding(0) var<storage, read_write> src0: array<{{SRC0_TYPE}}>; // M rows, K columns +@group(0) @binding(1) var<storage, read_write> src1: array<{{SRC1_TYPE}}>; // K rows, N columns (transposed) +@group(0) @binding(2) var<storage, read_write> dst: array<{{DST_TYPE}}>; // M rows, N columns (transposed) + +@group(0) @binding(3) var<uniform> params: MulMatParams; + +DECLS + +fn get_local_n(thread_id: u32) -> u32 { + return thread_id / WORKGROUP_SIZE_M; +} +fn get_local_m(thread_id: u32) -> u32 { + return thread_id % WORKGROUP_SIZE_M; +} + +// TILE_M must be multiple of 4 for vec4 loads +const TILE_M = {{WEBGPU_TILE_M}}u; +const TILE_N = {{WEBGPU_TILE_N}}u; + +override WORKGROUP_SIZE_M: u32; +override WORKGROUP_SIZE_N: u32; +override TILE_K: u32; + +override TOTAL_WORKGROUP_SIZE = WORKGROUP_SIZE_M * WORKGROUP_SIZE_N; +override TILE_SRC0_SHMEM = TILE_K * WORKGROUP_SIZE_M * TILE_M; +override TILE_SRC1_SHMEM = TILE_K * WORKGROUP_SIZE_N * TILE_N; + +var<workgroup> shmem: array<f16, TILE_SRC0_SHMEM + TILE_SRC1_SHMEM>; + +@compute @workgroup_size(TOTAL_WORKGROUP_SIZE) +fn main(@builtin(workgroup_id) wg_id: vec3<u32>, + @builtin(local_invocation_id) local_id: vec3<u32>) { + + let thread_id = local_id.x; + let local_m = get_local_m(thread_id); + let local_n = get_local_n(thread_id); + + let wg_n_count = (params.n + WORKGROUP_SIZE_N * TILE_N - 1u) / (WORKGROUP_SIZE_N * TILE_N); + let wg_m_count = (params.m + WORKGROUP_SIZE_M * TILE_M - 1u) / (WORKGROUP_SIZE_M * TILE_M); + let wg_per_matrix = wg_m_count * wg_n_count; + + let batch_idx = wg_id.x / wg_per_matrix; + + let wg_in_batch = wg_id.x % wg_per_matrix; + let wg_m = wg_in_batch % wg_m_count; + let wg_n = wg_in_batch / wg_m_count; + + let output_row_base = wg_m * WORKGROUP_SIZE_M * TILE_M + local_m * TILE_M; + let output_col_base = wg_n * WORKGROUP_SIZE_N * TILE_N + local_n * TILE_N; + + let dst2_stride = params.m * params.n; + let dst3_stride = dst2_stride * params.bs02 * params.broadcast2; + + let dst3_idx = batch_idx / (params.bs02 * params.broadcast2); + let src03_idx = dst3_idx / params.broadcast3; + let src13_idx = dst3_idx; + let dst2_idx = batch_idx % (params.bs02 * params.broadcast2); + let src02_idx = dst2_idx / params.broadcast2; + let src12_idx = dst2_idx; + + let src0_batch_offset = params.offset_src0 + src03_idx * params.stride_03 + src02_idx * params.stride_02; + let src1_batch_offset = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12; + + let offset_m = wg_m * WORKGROUP_SIZE_M * TILE_M; + let offset_n = wg_n * WORKGROUP_SIZE_N * TILE_N; + + var acc: array<array<f16, TILE_N>, TILE_M>; + + for (var k_outer = 0u; k_outer < params.k; k_outer += TILE_K) { + + // see mul_mat_decls.tmpl + init_shmem_src0(thread_id, src0_batch_offset, offset_m, k_outer); + init_shmem_src1(thread_id, src1_batch_offset, offset_n, k_outer); + + workgroupBarrier(); + + let k_end = min(TILE_K, params.k - k_outer); + + for (var k_inner = 0u; k_inner < k_end; k_inner++) { + var src0_tile: array<f16, TILE_M>; + for (var tm = 0u; tm < TILE_M; tm++) { + let src0_m = local_m * TILE_M + tm; + let src0_idx = k_inner + src0_m * TILE_K; + src0_tile[tm] = shmem[src0_idx]; + } + for (var tn = 0u; tn < TILE_N; tn++) { + let src1_n = local_n * TILE_N + tn; + let src1_idx = src1_n * TILE_K + k_inner; + let src1_val = shmem[TILE_SRC0_SHMEM + src1_idx]; + for (var tm = 0u; tm < TILE_M; tm++) { + acc[tm][tn] += src0_tile[tm] * src1_val; + } + } + } + + workgroupBarrier(); + } + + let dst_batch_offset = params.offset_dst + dst3_idx * dst3_stride + dst2_idx * dst2_stride; + + for (var tn = 0u; tn < TILE_N; tn++) { + let global_col = output_col_base + tn; + if (global_col < params.n) { + for (var tm = 0u; tm < TILE_M; tm += {{VEC_SIZE}}) { + let global_row = output_row_base + tm; + if (global_row < params.m) { + let dst_idx = dst_batch_offset + global_col * params.m + global_row; + dst[dst_idx/{{VEC_SIZE}}] = store_val(acc, tn, tm); + } + } + } + } +} + +#end(SHADER) diff --git a/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl new file mode 100644 index 0000000..47c8ce3 --- /dev/null +++ b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl @@ -0,0 +1,302 @@ +#define(VARIANTS) +[ + { + "SHADER_SUFFIX": "f32_f32_vec", + "REPLS": { + "SRC0_TYPE" : "vec4<f32>", + "SRC1_TYPE" : "vec4<f32>", + "DST_TYPE" : "vec4<f32>", + "SHMEM_TYPE" : "vec4<f16>", + "VEC_SIZE" : 4, + }, + "DECLS": ["VEC", "SHMEM_VEC", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"] + }, + { + "SHADER_SUFFIX": "f32_f32", + "REPLS": { + "SRC0_TYPE" : "f32", + "SRC1_TYPE" : "f32", + "DST_TYPE" : "f32", + "SHMEM_TYPE" : "f16", + "VEC_SIZE" : 1, + }, + "DECLS": ["SCALAR", "SHMEM_SCALAR", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"] + }, + { + "SHADER_SUFFIX": "f16_f32_vec", + "REPLS": { + "SRC0_TYPE" : "vec4<f16>", + "SRC1_TYPE" : "vec4<f32>", + "DST_TYPE" : "vec4<f32>", + "SHMEM_TYPE" : "vec4<f16>", + "VEC_SIZE" : 4, + }, + "DECLS": ["VEC", "SHMEM_VEC", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"] + }, + { + "SHADER_SUFFIX": "f16_f32", + "REPLS": { + "SRC0_TYPE" : "f16", + "SRC1_TYPE" : "f32", + "DST_TYPE" : "f32", + "SHMEM_TYPE" : "f16", + "VEC_SIZE" : 1, + }, + "DECLS": ["SCALAR", "SHMEM_SCALAR", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"] + }, + { + "SHADER_SUFFIX": "f16_f16_vec", + "REPLS": { + "SRC0_TYPE" : "vec4<f16>", + "SRC1_TYPE" : "vec4<f16>", + "DST_TYPE" : "vec4<f32>", + "SHMEM_TYPE" : "vec4<f16>", + "VEC_SIZE" : 4, + }, + "DECLS": ["VEC", "SHMEM_VEC", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"] + }, + { + "SHADER_SUFFIX": "f16_f16", + "REPLS": { + "SRC0_TYPE" : "f16", + "SRC1_TYPE" : "f16", + "DST_TYPE" : "f32", + "SHMEM_TYPE" : "f16", + "VEC_SIZE" : 1, + }, + "DECLS": ["SCALAR", "SHMEM_SCALAR", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"] + }, + { + "SHADER_SUFFIX": "q4_0_f32_vec", + "REPLS": { + "SRC0_TYPE" : "f16", + "SRC1_TYPE" : "vec4<f32>", + "DST_TYPE" : "vec4<f32>", + "SHMEM_TYPE" : "vec4<f16>", + "VEC_SIZE" : 4, + }, + "DECLS": ["BYTE_HELPERS", "VEC", "SHMEM_VEC", "INIT_SRC0_SHMEM_Q4_0", "INIT_SRC1_SHMEM"] + }, + { + "SHADER_SUFFIX": "q4_0_f32", + "REPLS": { + "SRC0_TYPE" : "f16", + "SRC1_TYPE" : "f32", + "DST_TYPE" : "f32", + "SHMEM_TYPE" : "f16", + "VEC_SIZE" : 1, + }, + "DECLS": ["BYTE_HELPERS", "SCALAR", "SHMEM_SCALAR", "INIT_SRC0_SHMEM_Q4_0", "INIT_SRC1_SHMEM"] + } +] + +#end(VARIANTS) + +#define(DECLS) + +#decl(VEC) +fn store_dst(shmem_idx: u32, dst_idx: u32) { + dst[dst_idx] = vec4<f32>( + f32(shmem[shmem_idx]), + f32(shmem[shmem_idx + 1]), + f32(shmem[shmem_idx + 2]), + f32(shmem[shmem_idx + 3]) + ); +} +#enddecl(VEC) + +#decl(SCALAR) +fn store_dst(shmem_idx: u32, dst_idx: u32) { + dst[dst_idx] = f32(shmem[shmem_idx]); +} +#enddecl(SCALAR) + +#end(DECLS) + +#define(SHADER) +diagnostic(off, chromium.subgroup_matrix_uniformity); +enable f16; +enable subgroups; +enable chromium_experimental_subgroup_matrix; + +struct MulMatParams { + offset_src0: u32, + offset_src1: u32, + offset_dst: u32, + m: u32, + n: u32, + k: u32, + stride_01: u32, + stride_11: u32, + stride_02: u32, + stride_12: u32, + stride_03: u32, + stride_13: u32, + bs02: u32, + bs03: u32, + broadcast2: u32, + broadcast3: u32 +}; + +@group(0) @binding(0) var<storage, read_write> src0: array<{{SRC0_TYPE}}>; // M rows, K columns +@group(0) @binding(1) var<storage, read_write> src1: array<{{SRC1_TYPE}}>; // K rows, N columns (transposed) +@group(0) @binding(2) var<storage, read_write> dst: array<{{DST_TYPE}}>; // M rows, N columns (transposed) + +@group(0) @binding(3) var<uniform> params: MulMatParams; + +DECLS + +// Note: These are string interpolated at build time, cannot use override constants due to limitations in +// current Dawn version type definitions/matrix load requirements for constant memory sizes. +const SUBGROUP_M = {{WEBGPU_SUBGROUP_M}}u; +const SUBGROUP_N = {{WEBGPU_SUBGROUP_N}}u; +// For portability we assume the max subgroup size, meaning some subgroups will be masked out if the +// runtime subgroup size is smaller. +const MAX_SUBGROUP_SIZE = {{WEBGPU_MAX_SUBGROUP_SIZE}}u; + +const EXPECTED_SUBGROUPS = SUBGROUP_M * SUBGROUP_N; + +const SUBGROUP_MATRIX_M_SIZE = {{WEBGPU_SG_MAT_M_SIZE}}u; +const SUBGROUP_MATRIX_N_SIZE = {{WEBGPU_SG_MAT_N_SIZE}}u; +const SUBGROUP_MATRIX_K_SIZE = {{WEBGPU_SG_MAT_K_SIZE}}u; + +const SUBGROUP_MATRIX_M = {{WEBGPU_SUBGROUP_MATRIX_M}}u; +const SUBGROUP_MATRIX_N = {{WEBGPU_SUBGROUP_MATRIX_N}}u; + +const TILE_K = {{WEBGPU_TILE_K}}u; + +const WG_M_SG_TILE_SIZE = SUBGROUP_M * SUBGROUP_MATRIX_M * SUBGROUP_MATRIX_M_SIZE; +const WG_N_SG_TILE_SIZE = SUBGROUP_N * SUBGROUP_MATRIX_N * SUBGROUP_MATRIX_N_SIZE; + +const TOTAL_WORKGROUP_SIZE = SUBGROUP_M * SUBGROUP_N * MAX_SUBGROUP_SIZE; +const TILE_SRC0_SHMEM = TILE_K * SUBGROUP_M * SUBGROUP_MATRIX_M * SUBGROUP_MATRIX_M_SIZE; +const TILE_SRC1_SHMEM = TILE_K * SUBGROUP_N * SUBGROUP_MATRIX_N * SUBGROUP_MATRIX_N_SIZE; + +const SG_MAT_ACCUM_SHMEM = SUBGROUP_M * SUBGROUP_MATRIX_M * SUBGROUP_N * SUBGROUP_MATRIX_N * SUBGROUP_MATRIX_M_SIZE * SUBGROUP_MATRIX_N_SIZE; + +// We reuse shmem for accumulation matrices +const SHMEM_SIZE = max(TILE_SRC0_SHMEM + TILE_SRC1_SHMEM, SG_MAT_ACCUM_SHMEM); + +var<workgroup> shmem: array<f16, SHMEM_SIZE>; + +@compute @workgroup_size(TOTAL_WORKGROUP_SIZE) +fn main(@builtin(workgroup_id) wg_id: vec3<u32>, + @builtin(local_invocation_id) local_id: vec3<u32>, + @builtin(subgroup_id) subgroup_id: u32) { + + let thread_id = local_id.x; + let subgroup_m = subgroup_id % SUBGROUP_M; + let subgroup_n = subgroup_id / SUBGROUP_M; + + let wg_m_count = (params.m + WG_M_SG_TILE_SIZE - 1) / WG_M_SG_TILE_SIZE; + let wg_n_count = (params.n + WG_N_SG_TILE_SIZE - 1) / WG_N_SG_TILE_SIZE; + let wg_per_matrix = wg_m_count * wg_n_count; + + let batch_idx = wg_id.x / wg_per_matrix; + + let wg_in_batch = wg_id.x % wg_per_matrix; + let wg_m = wg_in_batch % wg_m_count; + let wg_n = wg_in_batch / wg_m_count; + + let dst2_stride = params.m * params.n; + let dst3_stride = dst2_stride * params.bs02 * params.broadcast2; + + let dst3_idx = batch_idx / (params.bs02 * params.broadcast2); + let src03_idx = dst3_idx / params.broadcast3; + let src13_idx = dst3_idx; + let dst2_idx = batch_idx % (params.bs02 * params.broadcast2); + let src02_idx = dst2_idx / params.broadcast2; + let src12_idx = dst2_idx; + + let src0_batch_offset = params.offset_src0 + src03_idx * params.stride_03 + src02_idx * params.stride_02; + let src1_batch_offset = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12; + + let offset_m = wg_m * SUBGROUP_M * SUBGROUP_MATRIX_M * SUBGROUP_MATRIX_M_SIZE; + let offset_n = wg_n * SUBGROUP_N * SUBGROUP_MATRIX_N * SUBGROUP_MATRIX_N_SIZE; + + var acc_sg_mat : array<array<subgroup_matrix_result<f16, SUBGROUP_MATRIX_N_SIZE, SUBGROUP_MATRIX_M_SIZE>, SUBGROUP_MATRIX_N>, SUBGROUP_MATRIX_M>; + + for (var k_outer = 0u; k_outer < params.k; k_outer += TILE_K) { + + // see mul_mat_decls.tmpl + init_shmem_src0(thread_id, src0_batch_offset, offset_m, k_outer); + init_shmem_src1(thread_id, src1_batch_offset, offset_n, k_outer); + + workgroupBarrier(); + + if (subgroup_id < EXPECTED_SUBGROUPS) { + + for (var k_inner = 0u; k_inner < TILE_K; k_inner += SUBGROUP_MATRIX_K_SIZE) { + + let src0_shmem_idx_base = subgroup_m * SUBGROUP_MATRIX_M * SUBGROUP_MATRIX_M_SIZE * TILE_K + k_inner; + var src0_sg_mats: array<subgroup_matrix_left<f16, SUBGROUP_MATRIX_K_SIZE, SUBGROUP_MATRIX_M_SIZE>, SUBGROUP_MATRIX_M>; + for (var m = 0u; m < SUBGROUP_MATRIX_M; m++) { + src0_sg_mats[m] = subgroupMatrixLoad<subgroup_matrix_left<f16, SUBGROUP_MATRIX_K_SIZE, SUBGROUP_MATRIX_M_SIZE>>( + &shmem, + src0_shmem_idx_base + m * SUBGROUP_MATRIX_M_SIZE * TILE_K, + false, + TILE_K + ); + } + + let src1_shmem_idx_base = TILE_SRC0_SHMEM + subgroup_n * SUBGROUP_MATRIX_N * SUBGROUP_MATRIX_N_SIZE * TILE_K + k_inner; + for (var n = 0u; n < SUBGROUP_MATRIX_N; n++) { + let src1_sg_mat = subgroupMatrixLoad<subgroup_matrix_right<f16, SUBGROUP_MATRIX_N_SIZE, SUBGROUP_MATRIX_K_SIZE>>( + &shmem, + src1_shmem_idx_base + n * SUBGROUP_MATRIX_N_SIZE * TILE_K, + true, + TILE_K + ); + for (var m = 0u; m < SUBGROUP_MATRIX_M; m++) { + acc_sg_mat[m][n] = subgroupMatrixMultiplyAccumulate(src0_sg_mats[m], src1_sg_mat, acc_sg_mat[m][n]); + } + } + } + } + + workgroupBarrier(); + } + + let dst_batch_offset = params.offset_dst + dst3_idx * dst3_stride + dst2_idx * dst2_stride; + + // Stage the subgroup matrix tiles into shared memory + // This uses WG_M_SG_TILE_SIZE as the stride (number of columns in the workgroup tile). + let WG_TILE_STRIDE = WG_M_SG_TILE_SIZE; + let tile_row_base_local = subgroup_n * SUBGROUP_MATRIX_N * SUBGROUP_MATRIX_N_SIZE; + let tile_col_base_local = subgroup_m * SUBGROUP_MATRIX_M * SUBGROUP_MATRIX_M_SIZE; + + if (subgroup_id < EXPECTED_SUBGROUPS) { // 2-5% performance hit :( + for (var n = 0u; n < SUBGROUP_MATRIX_N; n++) { + for (var m = 0u; m < SUBGROUP_MATRIX_M; m++) { + let local_row = tile_row_base_local + n * SUBGROUP_MATRIX_N_SIZE; + let local_col = tile_col_base_local + m * SUBGROUP_MATRIX_M_SIZE; + let out_base = local_row * WG_TILE_STRIDE + local_col; + subgroupMatrixStore(&shmem, out_base, acc_sg_mat[m][n], true, WG_TILE_STRIDE); + } + } + } + + workgroupBarrier(); + + // Cooperative write: iterate over the entire workgroup tile + let tile_rows = WG_N_SG_TILE_SIZE; + let tile_cols = WG_M_SG_TILE_SIZE; + let total_tile_elems = tile_rows * tile_cols; + let tile_dst_row_base = wg_m * SUBGROUP_M * SUBGROUP_MATRIX_M * SUBGROUP_MATRIX_M_SIZE; + let tile_dst_col_base = wg_n * SUBGROUP_N * SUBGROUP_MATRIX_N * SUBGROUP_MATRIX_N_SIZE; + + for (var idx = thread_id * {{VEC_SIZE}}; idx < total_tile_elems; idx += TOTAL_WORKGROUP_SIZE * {{VEC_SIZE}}) { + let local_row = idx % WG_TILE_STRIDE; + let local_col = idx / WG_TILE_STRIDE; + + let global_row = tile_dst_row_base + local_row; + let global_col = tile_dst_col_base + local_col; + + if (global_col < params.n && global_row < params.m) { + let dst_idx = dst_batch_offset + global_col * params.m + global_row; + store_dst(idx, dst_idx/{{VEC_SIZE}}); + } + } +} + +#end(SHADER) diff --git a/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl new file mode 100644 index 0000000..ffbb640 --- /dev/null +++ b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl @@ -0,0 +1,267 @@ +#define(VARIANTS) +[ + { + "SHADER_SUFFIX": "f32_f32_vec", + "REPLS": { + "SRC0_TYPE" : "vec4<f32>", + "SRC1_TYPE" : "vec4<f32>", + "DST_TYPE": "vec4<f32>", + "VEC_SIZE" : 4, + }, + "DECLS": ["VEC", "MUL_ACC_FLOAT"] + }, + { + "SHADER_SUFFIX": "f32_f32", + "REPLS": { + "SRC0_TYPE" : "f32", + "SRC1_TYPE" : "f32", + "DST_TYPE": "f32", + "VEC_SIZE" : 1, + }, + "DECLS": ["SCALAR", "MUL_ACC_FLOAT"] + }, + { + "SHADER_SUFFIX": "f16_f32_vec", + "REPLS": { + "SRC0_TYPE" : "vec4<f16>", + "SRC1_TYPE" : "vec4<f32>", + "DST_TYPE": "vec4<f32>", + "VEC_SIZE" : 4, + }, + "DECLS": ["VEC", "MUL_ACC_FLOAT"] + }, + { + "SHADER_SUFFIX": "f16_f32", + "REPLS": { + "SRC0_TYPE" : "f16", + "SRC1_TYPE" : "f32", + "DST_TYPE": "f32", + "VEC_SIZE" : 1, + }, + "DECLS": ["SCALAR", "MUL_ACC_FLOAT"] + }, + { + "SHADER_SUFFIX": "f16_f16_vec", + "REPLS": { + "SRC0_TYPE" : "vec4<f16>", + "SRC1_TYPE" : "vec4<f16>", + "DST_TYPE": "vec4<f32>", + "VEC_SIZE" : 4, + }, + "DECLS": ["VEC", "MUL_ACC_FLOAT"] + }, + { + "SHADER_SUFFIX": "f16_f16", + "REPLS": { + "SRC0_TYPE" : "f16", + "SRC1_TYPE" : "f16", + "DST_TYPE": "f32", + "VEC_SIZE" : 1, + }, + "DECLS": ["SCALAR", "MUL_ACC_FLOAT"] + }, + { + "SHADER_SUFFIX": "q4_0_f32", + "REPLS": { + "SRC0_TYPE" : "f16", + "SRC1_TYPE" : "f32", + "DST_TYPE": "f32", + "VEC_SIZE" : 1, + }, + "DECLS": ["BYTE_HELPERS", "SCALAR", "MUL_ACC_Q4_0"] + } +] + +#end(VARIANTS) + +#define(DECLS) + +#decl(VEC) +fn inner_dot(src0_val: {{SRC0_TYPE}}, src1_val: {{SRC1_TYPE}}) -> f32 { + return f32(dot({{SRC1_TYPE}}(src0_val), src1_val)); +} + +fn store_val(group_base: u32) -> vec4<f32> { + return vec4<f32>(partial_sums[group_base], + partial_sums[group_base + THREADS_PER_OUTPUT], + partial_sums[group_base + THREADS_PER_OUTPUT * 2], + partial_sums[group_base + THREADS_PER_OUTPUT * 3]); +} +#enddecl(VEC) + +#decl(SCALAR) +fn inner_dot(src0_val: {{SRC0_TYPE}}, src1_val: {{SRC1_TYPE}}) -> f32 { + return f32(src0_val) * f32(src1_val); +} + +fn store_val(group_base: u32) -> f32 { + return partial_sums[group_base]; +} +#enddecl(SCALAR) + +#decl(MUL_ACC_FLOAT) + +fn mul_acc(tig:u32, tile_size: u32, idx_base: u32, k_outer: u32) -> f32 { + var local_sum = 0.0; + for (var i = tig * {{VEC_SIZE}}; i < tile_size; i += THREADS_PER_OUTPUT * {{VEC_SIZE}}) { + let a = src0[(idx_base + k_outer + i) / {{VEC_SIZE}}]; + let b = shared_vector[i / {{VEC_SIZE}}]; + local_sum += inner_dot(a, b); + } + return local_sum; +} + +#enddecl(MUL_ACC_FLOAT) + +#decl(MUL_ACC_Q4_0) + +const BLOCK_SIZE = 32; +const NQ = 16u; // number of weights per thread +const F16_PER_BLOCK = 9u; // 1 scale + 8x4 packed weights +const WEIGHTS_PER_F16 = 4u; // 4 weights per f16 +const F16_PER_THREAD = NQ / WEIGHTS_PER_F16; + +fn mul_acc(tig:u32, tile_size: u32, idx_base: u32, k_outer: u32) -> f32 { + var local_sum = 0.0; + for (var i = tig * NQ; i < tile_size; i += THREADS_PER_OUTPUT * NQ) { + let blck_idx = i / BLOCK_SIZE; + let block_offset = (i % BLOCK_SIZE) / WEIGHTS_PER_F16; + let scale_idx = (idx_base + k_outer / BLOCK_SIZE + blck_idx) * F16_PER_BLOCK; + // each f16 contains offsets [block_offset, block_offset + 1] and [block_offset + 16, block_offset + 17] + let shmem_idx = blck_idx * BLOCK_SIZE + block_offset * 2u; + let d = f32(src0[scale_idx]); + for (var j = 0u; j < F16_PER_THREAD; j += 2) { + let q_0 = src0[scale_idx + 1 + block_offset + j]; + let q_1 = src0[scale_idx + 1 + block_offset + j + 1]; + let q_packed = bitcast<u32>(vec2(q_0, q_1)); + for (var k: u32 = 0; k < 4; k++) { + let q_byte = get_byte(q_packed, k); + let q_hi = (f32((q_byte >> 4) & 0xF) - 8.0) * d; + let q_lo = (f32(q_byte & 0xF) - 8.0) * d; + local_sum += q_lo * shared_vector[shmem_idx + j * 2 + k]; + local_sum += q_hi * shared_vector[shmem_idx + j * 2 + k + 16]; + } + } + } + return local_sum; +} + +#enddecl(MUL_ACC_Q4_0) + +#end(DECLS) + +#define(SHADER) +enable f16; + +DECLS + +struct MulMatParams { + offset_src0: u32, + offset_src1: u32, + offset_dst: u32, + m: u32, + n: u32, + k: u32, + stride_01: u32, + stride_11: u32, + stride_02: u32, + stride_12: u32, + stride_03: u32, + stride_13: u32, + bs02: u32, + bs03: u32, + broadcast2: u32, + broadcast3: u32 +}; + +@group(0) @binding(0) var<storage, read_write> src0: array<{{SRC0_TYPE}}>; // Matrix (M x K) +@group(0) @binding(1) var<storage, read_write> src1: array<{{SRC1_TYPE}}>; // Vector (K x 1, transposed) +@group(0) @binding(2) var<storage, read_write> dst: array<{{DST_TYPE}}>; // Result vector (transposed) + +@group(0) @binding(3) var<uniform> params: MulMatParams; + +override WORKGROUP_SIZE: u32; +override TILE_K: u32; +override OUTPUTS_PER_WG: u32; +override THREADS_PER_OUTPUT = WORKGROUP_SIZE / OUTPUTS_PER_WG; + +// Shared memory for collaborative loading and reduction +var<workgroup> shared_vector: array<{{SRC1_TYPE}}, TILE_K/{{VEC_SIZE}}>; // Cache vector tile +var<workgroup> partial_sums: array<f32, WORKGROUP_SIZE>; // For reduction + +@compute @workgroup_size(WORKGROUP_SIZE) +fn main( + @builtin(local_invocation_id) local_id: vec3<u32>, + @builtin(workgroup_id) wg_id: vec3<u32>, + @builtin(num_workgroups) num_wg: vec3<u32>) { + let thread_id = local_id.x; + + // Handle batch dimensions + let total_batches = params.bs02 * params.broadcast2 * params.bs03 * params.broadcast3; + let wg_linear = wg_id.y * num_wg.x + wg_id.x; + let output_groups = (params.m + OUTPUTS_PER_WG - 1u) / OUTPUTS_PER_WG; + let batch_idx = wg_linear / output_groups; + if (batch_idx >= total_batches) { + return; + } + + // Which of the outputs does this thread belong to? + let thread_group = thread_id / THREADS_PER_OUTPUT; + let thread_in_group = thread_id % THREADS_PER_OUTPUT; + + // Each workgroup computes OUTPUTS_PER_WG consecutive outputs + let output_row = (wg_linear % output_groups) * OUTPUTS_PER_WG + thread_group; + + let dst2_stride = params.m * params.n; + let dst2_idx = batch_idx % (params.bs02 * params.broadcast2); + let dst3_stride = dst2_stride * params.bs02 * params.broadcast2; + let dst3_idx = batch_idx / (params.bs02 * params.broadcast2); + let src03_idx = dst3_idx / params.broadcast3; + let src13_idx = dst3_idx; + let src02_idx = dst2_idx / params.broadcast2; + let src12_idx = dst2_idx; + + let src0_idx_base = params.offset_src0 + src03_idx * params.stride_03 + src02_idx * params.stride_02 + output_row * params.stride_01; + let src1_idx_base = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12; + let dst_idx = params.offset_dst + dst3_idx * dst3_stride + dst2_idx * dst2_stride + output_row; + + var local_sum = 0.0; + + // Each thread processes multiple K elements and accumulates + for (var k_tile = 0u; k_tile < params.k; k_tile += TILE_K) { + let tile_size = min(TILE_K, params.k - k_tile); + + // Cooperatively load vector tile into shared memory (all threads) + for (var i = thread_id * {{VEC_SIZE}}; i < tile_size; i += WORKGROUP_SIZE * {{VEC_SIZE}}) { + shared_vector[i / {{VEC_SIZE}}] = src1[(src1_idx_base + k_tile + i) / {{VEC_SIZE}}]; + } + + workgroupBarrier(); + + if (output_row < params.m) { + local_sum += mul_acc(thread_in_group, tile_size, src0_idx_base, k_tile); + } + + workgroupBarrier(); + } + + // Store partial sums and reduce within each partition + partial_sums[thread_id] = local_sum; + workgroupBarrier(); + let group_base = thread_group * THREADS_PER_OUTPUT; + let thread_base = group_base + thread_in_group; + var offset = THREADS_PER_OUTPUT / 2; + while (offset > 0) { + if (thread_in_group < offset) { + partial_sums[thread_base] += partial_sums[thread_base + offset]; + } + offset = offset / 2; + workgroupBarrier(); + } + + // Store back to global memory + if (output_row < params.m && thread_group % {{VEC_SIZE}} == 0 && thread_in_group == 0) { + dst[dst_idx / {{VEC_SIZE}}] = store_val(group_base); + } +} +#end(SHADER) diff --git a/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl new file mode 100644 index 0000000..ea63b9a --- /dev/null +++ b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl @@ -0,0 +1,86 @@ +@group(0) @binding(0) +var<storage, read_write> src: array<f32>; + +@group(0) @binding(1) +var<storage, read_write> dst: array<f32>; + +struct Params { + ne: u32, // total number of elements + offset_src: u32, // in elements + offset_dst: u32, // in elements + + // Strides (in elements) + stride_src0: u32, + stride_src1: u32, + stride_src2: u32, + stride_src3: u32, + + // Logical shapes + src_ne0: u32, + src_ne1: u32, + src_ne2: u32, + src_ne3: u32, + + dst_ne0: u32, + dst_ne1: u32, + dst_ne2: u32, + dst_ne3: u32, + + // Pad sizes (in elements) + lp0: u32, + rp0: u32, + lp1: u32, + rp1: u32, + lp2: u32, + rp2: u32, + lp3: u32, + rp3: u32, +}; + +@group(0) @binding(2) +var<uniform> params: Params; + +fn wrap_around(idx: i32, n: u32) -> u32 { + return u32(idx + i32(n)) % n; +} + +@compute @workgroup_size(WG_SIZE) +fn main(@builtin(global_invocation_id) gid: vec3<u32>) { + if (gid.x >= params.ne) { + return; + } + + var i = gid.x; + let dst_plane = params.dst_ne2 * params.dst_ne1 * params.dst_ne0; + let i3 = i / dst_plane; + i = i % dst_plane; + let i2 = i / (params.dst_ne1 * params.dst_ne0); + i = i % (params.dst_ne1 * params.dst_ne0); + let i1 = i / params.dst_ne0; + let i0 = i % params.dst_ne0; + + var value: f32 = 0.0; + +#ifdef CIRCULAR + let ci0 = wrap_around(i32(i0) - i32(params.lp0), params.src_ne0); + let ci1 = wrap_around(i32(i1) - i32(params.lp1), params.src_ne1); + let ci2 = wrap_around(i32(i2) - i32(params.lp2), params.src_ne2); + let ci3 = wrap_around(i32(i3) - i32(params.lp3), params.src_ne3); + let circular_src_idx = ci0 * params.stride_src0 + ci1 * params.stride_src1 + + ci2 * params.stride_src2 + ci3 * params.stride_src3; + value = src[params.offset_src + circular_src_idx]; +#else + let is_src = + (i0 >= params.lp0 && i0 < params.dst_ne0 - params.rp0) && + (i1 >= params.lp1 && i1 < params.dst_ne1 - params.rp1) && + (i2 >= params.lp2 && i2 < params.dst_ne2 - params.rp2) && + (i3 >= params.lp3 && i3 < params.dst_ne3 - params.rp3); + if (is_src) { + let src_idx = (i0 - params.lp0) * params.stride_src0 + (i1 - params.lp1) * params.stride_src1 + + (i2 - params.lp2) * params.stride_src2 + (i3 - params.lp3) * params.stride_src3; + value = src[params.offset_src + src_idx]; + } +#endif + + dst[params.offset_dst + gid.x] = value; +} diff --git a/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl new file mode 100644 index 0000000..712b921 --- /dev/null +++ b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl @@ -0,0 +1,123 @@ +#define(VARIANTS) + +[ + { + "DECLS": ["NOT_INPLACE"] + }, + { + "SHADER_SUFFIX": "inplace", + "DECLS": ["INPLACE"] + }, +] + +#end(VARIANTS) + +#define(DECLS) + +#decl(NOT_INPLACE) + +fn update(src_offset: u32, dst_offset: u32, scale: f32) { + dst[dst_offset] = scale * src[src_offset]; +} + +@group(0) @binding(1) +var<storage, read_write> dst: array<f32>; + +@group(0) @binding(2) +var<uniform> params: Params; + +#enddecl(NOT_INPLACE) + +#decl(INPLACE) + +fn update(src_offset: u32, dst_offset: u32, scale: f32) { + src[dst_offset] = scale * src[src_offset]; +} + +@group(0) @binding(1) +var<uniform> params: Params; + +#enddecl(INPLACE) + +#end(DECLS) + +#define(SHADER) + +struct Params { + offset_src: u32, // in elements + offset_dst: u32, // in elements + + // Strides (in elements) + stride_src1: u32, + stride_src2: u32, + stride_src3: u32, + + stride_dst1: u32, + stride_dst2: u32, + stride_dst3: u32, + + // Shape of src/dst + ne0: u32, + ne1: u32, + ne2: u32, + ne3: u32, + + eps: f32 +}; + +@group(0) @binding(0) +var<storage, read_write> src: array<f32>; + +DECLS + +override wg_size: u32; +var<workgroup> scratch: array<f32, wg_size>; + +@compute @workgroup_size(wg_size) +fn main(@builtin(workgroup_id) wid: vec3<u32>, + @builtin(local_invocation_id) lid: vec3<u32>) { + + // one thread per row + var i = wid.x; + let i3 = i / (params.ne2 * params.ne1); + i = i % (params.ne2 * params.ne1); + let i2 = i / params.ne1; + let i1 = i % params.ne1; + let i_src_row = params.offset_src + i3 * params.stride_src3 + i2 * params.stride_src2 + i1 * params.stride_src1; + let i_dst_row = params.offset_dst + i3 * params.stride_dst3 + i2 * params.stride_dst2 + i1 * params.stride_dst1; + + let elems = (params.ne0 + wg_size - 1) / wg_size; + + var sum = 0.0f; + var col = lid.x; + for (var j: u32 = 0; j < elems; j++) { + if (col >= params.ne0) { + break; + } + sum += pow(src[i_src_row + col], 2.0); + col += wg_size; + } + + scratch[lid.x] = sum; + workgroupBarrier(); + var offset = wg_size / 2; + while (offset > 0) { + if (lid.x < offset) { + scratch[lid.x] += scratch[lid.x + offset]; + } + offset = offset / 2; + workgroupBarrier(); + } + sum = scratch[0]; + + let scale = 1.0/sqrt(sum/f32(params.ne0) + params.eps); + col = lid.x; + for (var j: u32 = 0; j < elems; j++) { + if (col >= params.ne0) { + break; + } + update(i_src_row + col, i_dst_row + col, scale); + col += wg_size; + } +} +#end(SHADER) diff --git a/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl new file mode 100644 index 0000000..84dc8db --- /dev/null +++ b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl @@ -0,0 +1,295 @@ +#define(VARIANTS) + +[ + { + "REPLS": { + "TYPE" : "f32", + }, + "DECLS": ["NO_FF_BINDINGS", "NO_FF_FUNC", "ROTATE"] + }, + { + "SHADER_SUFFIX": "f32_inplace", + "REPLS": { + "TYPE" : "f32", + }, + "DECLS": ["NO_FF_BINDINGS_INPLACE", "NO_FF_FUNC", "ROTATE_INPLACE"] + }, + { + "REPLS": { + "TYPE" : "f16", + }, + "DECLS": ["NO_FF_BINDINGS", "NO_FF_FUNC", "ROTATE"] + }, + { + "SHADER_SUFFIX": "f16_inplace", + "REPLS": { + "TYPE" : "f16", + }, + "DECLS": ["NO_FF_BINDINGS_INPLACE", "NO_FF_FUNC", "ROTATE_INPLACE"] + }, + { + "SHADER_SUFFIX": "f32_ff", + "REPLS": { + "TYPE" : "f32", + }, + "DECLS": ["FF_BINDINGS", "FF_FUNC", "ROTATE"] + }, + { + "SHADER_SUFFIX": "f32_ff_inplace", + "REPLS": { + "TYPE" : "f32", + }, + "DECLS": ["FF_BINDINGS_INPLACE", "FF_FUNC", "ROTATE_INPLACE"] + }, + { + "SHADER_SUFFIX": "f16_ff", + "REPLS": { + "TYPE" : "f16", + }, + "DECLS": ["FF_BINDINGS", "FF_FUNC", "ROTATE"] + }, + { + "SHADER_SUFFIX": "f16_ff_inplace", + "REPLS": { + "TYPE" : "f16", + }, + "DECLS": ["FF_BINDINGS_INPLACE", "FF_FUNC", "ROTATE_INPLACE"] + } +] + +#end(VARIANTS) + +#define(DECLS) + +#decl(ROTATE) +fn rotate(i_dst0: u32, i_dst1: u32, out0: f32, out1: f32) { + dst[i_dst0] = {{TYPE}}(out0); + dst[i_dst1] = {{TYPE}}(out1); +} +#enddecl(ROTATE) + +#decl(ROTATE_INPLACE) +fn rotate(i_dst0: u32, i_dst1: u32, out0: f32, out1: f32) { + src0[i_dst0] = {{TYPE}}(out0); + src0[i_dst1] = {{TYPE}}(out1); +} +#enddecl(ROTATE_INPLACE) + +#decl(NO_FF_FUNC) +fn freq_factor(i: u32) -> f32 { + return 1.0f; +} +#enddecl(NO_FF_FUNC) + +#decl(FF_FUNC) +fn freq_factor(i: u32) -> f32 { + return src2[params.offset_src2 + i/2]; +} +#enddecl(FF_FUNC) + +#decl(NO_FF_BINDINGS) + +@group(0) @binding(2) +var<storage, read_write> dst: array<{{TYPE}}>; + +@group(0) @binding(3) +var<uniform> params: Params; + +#enddecl(NO_FF_BINDINGS) + +#decl(NO_FF_BINDINGS_INPLACE) + +@group(0) @binding(2) +var<uniform> params: Params; + +#enddecl(NO_FF_BINDINGS_INPLACE) + +#decl(FF_BINDINGS) + +@group(0) @binding(2) +var<storage, read_write> src2: array<f32>; + +@group(0) @binding(3) +var<storage, read_write> dst: array<{{TYPE}}>; + +@group(0) @binding(4) +var<uniform> params: Params; + +#enddecl(FF_BINDINGS) + +#decl(FF_BINDINGS_INPLACE) + +@group(0) @binding(2) +var<storage, read_write> src2: array<f32>; + +@group(0) @binding(3) +var<uniform> params: Params; + +#enddecl(FF_BINDINGS_INPLACE) + +#end(DECLS) + +#define(SHADER) + +enable f16; + +struct Params { + offset_src0: u32, + offset_src1: u32, + offset_src2: u32, + offset_dst: u32, + + // Strides (in elements) + stride_src01: u32, + stride_src02: u32, + stride_src03: u32, + + stride_dst1: u32, + stride_dst2: u32, + stride_dst3: u32, + + n_threads: u32, + ne0: u32, + ne1: u32, + ne2: u32, + + n_dims: u32, + mode: u32, + theta_scale: f32, + attn_factor: f32, + freq_scale: f32, + ext_factor: f32, + corr_dim0: f32, + corr_dim1: f32, + sections0: u32, + sections1: u32, + sections2: u32, + sections3: u32 +}; + +@group(0) @binding(0) +var<storage, read_write> src0: array<{{TYPE}}>; + +@group(0) @binding(1) +var<storage, read_write> src1: array<i32>; + +DECLS + +fn rope_yarn_ramp(low: f32, high: f32, i: u32) -> f32 { + let y = (f32(i / 2) - low) / max(0.001f, high - low); + return 1.0f - min(1.0f, max(0.0f, y)); +} + +// returns vector of (cos_theta, sin_theta) +// TODO: check performance of instantiating once on the CPU and passed as buffer, since it's repeated per-row +fn rope_yarn(theta_extrap: f32, i: u32) -> vec2<f32> { + var mscale = params.attn_factor; + var theta = params.freq_scale * theta_extrap; + if (params.ext_factor != 0.0f) { + let ramp_mix = rope_yarn_ramp(params.corr_dim0, params.corr_dim1, i) * params.ext_factor; + theta = theta * (1 - ramp_mix) + theta_extrap * ramp_mix; + mscale *= 1.0f + 0.1f * log(1.0f / params.freq_scale); + } + return vec2<f32>(cos(theta) * mscale, sin(theta) * mscale); +} + +fn pair_base(i0: u32, div_2: bool) -> u32 { + if (div_2) { + return i0 / 2; + } else { + return i0; + } +} + +fn pair_offset(is_neox: bool, is_mrope: bool, is_vision: bool) -> u32 { + if (is_vision) { + return params.n_dims; + } else if (is_neox || is_mrope) { + return params.n_dims / 2; + } else { + return 1; + } +} + +override wg_size: u32; +@compute @workgroup_size(wg_size) +fn main(@builtin(global_invocation_id) gid: vec3<u32>) { + // two elements per thread + if (gid.x >= params.n_threads) { + return; + } + + let is_neox = bool(params.mode & 2); + let is_mrope = bool(params.mode & 8); + let is_imrope = params.mode == 40; + let is_vision = params.mode == 24; + + var i = gid.x * 2; // start index for this thread + let i3 = i / (params.ne2 * params.ne1 * params.ne0); + i = i % (params.ne2 * params.ne1 * params.ne0); + let i2 = i / (params.ne1 * params.ne0); + i = i % (params.ne1 * params.ne0); + let i1 = i / params.ne0; + let i0 = i % params.ne0; + + let i_src_row = params.offset_src0 + i3 * params.stride_src03 + i2 * params.stride_src02 + i1 * params.stride_src01; + let i_dst_row = params.offset_dst + i3 * params.stride_dst3 + i2 * params.stride_dst2 + i1 * params.stride_dst1; + + if (i0 >= params.n_dims && !is_vision) { + let i_src = i_src_row + i0; + let i_dst = i_dst_row + i0; + rotate(i_dst, i_dst + 1, f32(src0[i_src]), f32(src0[i_src + 1])); + return; + } + + var theta_base_mult: u32 = 0; + var theta_scale_pwr: u32 = i0 / 2; + if (is_mrope) { + let sect_dims = params.sections0 + params.sections1 + params.sections2 + params.sections3; + let sec_w = params.sections1 + params.sections0; + let sec_e = params.sections2 + sec_w; + let sector = (i0 / 2) % sect_dims; + if (is_imrope) { + if (sector % 3 == 1 && sector < 3 * params.sections1) { + theta_base_mult = 1; + } else if (sector % 3 == 2 && sector < 3 * params.sections2) { + theta_base_mult = 2; + } else if (sector % 3 == 0 && sector < 3 * params.sections0) { + theta_base_mult = 0; + } else { + theta_base_mult = 3; + } + } else { + if (sector >= params.sections0 && sector < sec_w) { + theta_base_mult = 1; + if (is_vision) { + theta_scale_pwr = sector - params.sections0; + } + } else if (sector >= sec_w && sector < sec_e) { + theta_base_mult = 2; + if (is_vision) { + theta_scale_pwr = sector - sec_w; + } + } else if (sector >= sec_e) { + if (is_vision) { + theta_scale_pwr = sector - sec_e; + theta_scale_pwr = (i0 / 2) % sec_e; + } + theta_base_mult = 3; + } else if (is_vision) { + theta_scale_pwr = sector; + } + } + } + let theta_base = f32(src1[params.offset_src1 + i2 + params.ne2 * theta_base_mult]) * pow(params.theta_scale, f32(theta_scale_pwr)); + let thetas = rope_yarn(theta_base/freq_factor(i0), i0); + + let i_src = i_src_row + pair_base(i0, is_neox || is_mrope || is_vision); + let i_dst = i_dst_row + pair_base(i0, is_neox || is_mrope || is_vision); + + let x0 = f32(src0[i_src]); + let x1 = f32(src0[i_src + pair_offset(is_neox, is_mrope, is_vision)]); + rotate(i_dst, i_dst + pair_offset(is_neox, is_mrope, is_vision), x0 * thetas.x - x1 * thetas.y, x0 * thetas.y + x1 * thetas.x); +} + +#end(SHADER) diff --git a/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl new file mode 100644 index 0000000..040e80d --- /dev/null +++ b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl @@ -0,0 +1,90 @@ +#define(VARIANTS) + +[ + { + "SHADER_NAME": "scale_f32", + "DECLS": ["NOT_INPLACE"] + }, + { + "SHADER_NAME": "scale_f32_inplace", + "DECLS": ["INPLACE"] + } +] + +#end(VARIANTS) + +#define(DECLS) + +#decl(NOT_INPLACE) +@group(0) @binding(1) +var<storage, read_write> dst: array<f32>; + +@group(0) @binding(2) +var<uniform> params: Params; + +fn store_scale(val: f32, offset: u32) { + dst[offset] = val; +} +#enddecl(NOT_INPLACE) + +#decl(INPLACE) +@group(0) @binding(1) +var<uniform> params: Params; + +fn store_scale(val: f32, offset: u32) { + src[offset] = val; +} +#enddecl(INPLACE) + +#end(DECLS) + +#define(SHADER) + +struct Params { + offset_src: u32, + offset_dst: u32, + + // Strides (in elements) + stride_src1: u32, + stride_src2: u32, + stride_src3: u32, + + stride_dst1: u32, + stride_dst2: u32, + stride_dst3: u32, + + ne: u32, + ne0: u32, + ne1: u32, + ne2: u32, + + scale: f32, + bias: f32 +}; + +@group(0) @binding(0) +var<storage, read_write> src: array<f32>; + +DECLS + +override wg_size: u32; +@compute @workgroup_size(wg_size) +fn main(@builtin(global_invocation_id) gid: vec3<u32>) { + if (gid.x >= params.ne) { + return; + } + + var i = gid.x; + let i3 = i / (params.ne2 * params.ne1 * params.ne0); + i = i % (params.ne2 * params.ne1 * params.ne0); + let i2 = i / (params.ne1 * params.ne0); + i = i % (params.ne1 * params.ne0); + let i1 = i / params.ne0; + let i0 = i % params.ne0; + + let i_src = params.offset_src + i3 * params.stride_src3 + i2 * params.stride_src2 + i1 * params.stride_src1 + i0; + let i_dst = params.offset_dst + i3 * params.stride_dst3 + i2 * params.stride_dst2 + i1 * params.stride_dst1 + i0; + + store_scale(src[i_src] * params.scale + params.bias, i_dst); +} +#end(SHADER) diff --git a/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl new file mode 100644 index 0000000..99e9192 --- /dev/null +++ b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl @@ -0,0 +1,109 @@ +enable f16; + +#ifdef DST_F32 +#define DST_INNER_TYPE f32 +#else +#define DST_INNER_TYPE f16 +#endif + +#ifdef VEC4 +#define SRC_TYPE vec4<f32> +#define DST_TYPE vec4<DST_INNER_TYPE> +#define VEC_SIZE 4 +#else +#define SRC_TYPE f32 +#define DST_TYPE DST_INNER_TYPE +#define VEC_SIZE 1 +#endif + +@group(0) @binding(0) +var<storage, read_write> src: array<SRC_TYPE>; + +@group(0) @binding(1) +var<storage, read_write> idx: array<u32>; + +@group(0) @binding(2) +var<storage, read_write> dst: array<DST_TYPE>; + +#ifdef I64_IDX +@group(0) @binding(3) +var<storage, read_write> error: atomic<u32>; +#define PARAMS_BINDING 4 +#else +#define PARAMS_BINDING 3 +#endif + +struct Params { + offset_src: u32, // in elements + offset_idx: u32, // in elements + offset_dst: u32, // in elements + + // Strides (in elements) + stride_src1: u32, + stride_src2: u32, + stride_src3: u32, + + stride_idx0: u32, + stride_idx1: u32, + stride_idx2: u32, + + stride_dst1: u32, + stride_dst2: u32, + stride_dst3: u32, + + // Shape of src + ne0: u32, + n_rows: u32, + ne2: u32, + ne3: u32, + + // Shape of idx + idx1: u32, + idx2: u32, +}; + +@group(0) @binding(PARAMS_BINDING) +var<uniform> params: Params; + +@compute @workgroup_size(WG_SIZE) +fn main(@builtin(global_invocation_id) gid: vec3<u32>) { + if (gid.x >= (params.ne3 * params.ne2 * params.n_rows * params.ne0) / VEC_SIZE) { + return; + } + + // getting the row from gid + let elems_per_row = params.ne0 / VEC_SIZE; + var i = gid.x / elems_per_row; + + let i_src3 = i / (params.ne2 * params.n_rows); + + i = i % (params.ne2 * params.n_rows); + let i_src2 = i / params.n_rows; + let i_src1 = i % params.n_rows; + + let i_idx2 = i_src3 % params.idx2; + let i_idx1 = i_src2 % params.idx1; + let i_idx0 = i_src1; + +#ifdef I64_IDX + let idx_high = (params.offset_idx + i_idx0 * params.stride_idx0 + i_idx1 * params.stride_idx1 + i_idx2 * params.stride_idx2) * 2; + + let idx_val = idx[idx_high]; + let idx_low_val = idx[idx_high + 1]; + + if (idx_low_val != 0) { + // Upper bits of index are not zero, output will be incorrect + atomicStore(&error, 1); + return; + } +#else + let idx_i = params.offset_idx + i_idx0 * params.stride_idx0 + i_idx1 * params.stride_idx1 + i_idx2 * params.stride_idx2; + let idx_val = idx[idx_i]; +#endif + + let i_dst_row = params.offset_dst + idx_val * params.stride_dst1 + i_src2 * params.stride_dst2 + i_src3 * params.stride_dst3; + let i_src_row = params.offset_src + i_src1 * params.stride_src1 + i_src2 * params.stride_src2 + i_src3 * params.stride_src3; + + let col_idx = (gid.x % elems_per_row); + dst[i_dst_row/VEC_SIZE + col_idx] = DST_TYPE(src[i_src_row/VEC_SIZE + col_idx]); +} diff --git a/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl new file mode 100644 index 0000000..c74dc4c --- /dev/null +++ b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl @@ -0,0 +1,345 @@ +#define(VARIANTS) +[ + { + "SHADER_NAME": "soft_max_f32", + "DECLS": ["BASE_BINDINGS", "NOT_INPLACE", "NO_MASK", "NO_SINK"] + }, + { + "SHADER_NAME": "soft_max_f32_inplace", + "DECLS": ["BASE_BINDINGS_INPLACE", "INPLACE", "NO_MASK", "NO_SINK"] + }, + { + "SHADER_NAME": "soft_max_f32_sink", + "DECLS": ["SINK_BINDINGS", "NOT_INPLACE", "NO_MASK", "SINK"] + }, + { + "SHADER_NAME": "soft_max_f32_sink_inplace", + "DECLS": ["SINK_BINDINGS_INPLACE", "INPLACE", "NO_MASK", "SINK"] + }, + { + "SHADER_NAME": "soft_max_f32_mask_f32", + "REPLS": { + "MASK_TYPE" : "f32", + }, + "DECLS": ["MASK_BINDINGS", "NOT_INPLACE", "MASK", "NO_SINK"] + }, + { + "SHADER_NAME": "soft_max_f32_mask_f32_inplace", + "REPLS": { + "MASK_TYPE" : "f32", + }, + "DECLS": ["MASK_BINDINGS_INPLACE", "INPLACE", "MASK", "NO_SINK"] + }, + { + "SHADER_NAME": "soft_max_f32_mask_f16", + "REPLS": { + "MASK_TYPE" : "f16", + }, + "DECLS": ["MASK_BINDINGS", "NOT_INPLACE", "MASK", "NO_SINK"] + }, + { + "SHADER_NAME": "soft_max_f32_mask_f16_inplace", + "REPLS": { + "MASK_TYPE" : "f16", + }, + "DECLS": ["MASK_BINDINGS_INPLACE", "INPLACE", "MASK", "NO_SINK"] + }, + { + "SHADER_NAME": "soft_max_f32_mask_f32_sink", + "REPLS": { + "MASK_TYPE" : "f32", + }, + "DECLS": ["MASK_SINK_BINDINGS", "NOT_INPLACE", "MASK", "SINK"] + }, + { + "SHADER_NAME": "soft_max_f32_mask_f32_sink_inplace", + "REPLS": { + "MASK_TYPE" : "f32", + }, + "DECLS": ["MASK_SINK_BINDINGS_INPLACE", "INPLACE", "MASK", "SINK"] + }, + { + "SHADER_NAME": "soft_max_f32_mask_f16_sink", + "REPLS": { + "MASK_TYPE" : "f16", + }, + "DECLS": ["MASK_SINK_BINDINGS", "NOT_INPLACE", "MASK", "SINK"] + }, + { + "SHADER_NAME": "soft_max_f32_mask_f16_sink_inplace", + "REPLS": { + "MASK_TYPE" : "f16", + }, + "DECLS": ["MASK_SINK_BINDINGS_INPLACE", "INPLACE", "MASK", "SINK"] + } +] +#end(VARIANTS) + +#define(DECLS) + +#decl(BASE_BINDINGS) +@group(0) @binding(1) +var<storage, read_write> dst: array<f32>; + +@group(0) @binding(2) +var<uniform> params: Params; +#enddecl(BASE_BINDINGS) + +#decl(BASE_BINDINGS_INPLACE) +@group(0) @binding(1) +var<uniform> params: Params; +#enddecl(BASE_BINDINGS_INPLACE) + +#decl(SINK_BINDINGS) +@group(0) @binding(1) +var<storage, read_write> sinks: array<f32>; + +@group(0) @binding(2) +var<storage, read_write> dst: array<f32>; + +@group(0) @binding(3) +var<uniform> params: Params; +#enddecl(SINK_BINDINGS) + +#decl(SINK_BINDINGS_INPLACE) +@group(0) @binding(1) +var<storage, read_write> sinks: array<f32>; + +@group(0) @binding(2) +var<uniform> params: Params; +#enddecl(SINK_BINDINGS_INPLACE) + +#decl(MASK_BINDINGS) +@group(0) @binding(1) +var<storage, read_write> mask: array<{{MASK_TYPE}}>; + +@group(0) @binding(2) +var<storage, read_write> dst: array<f32>; + +@group(0) @binding(3) +var<uniform> params: Params; +#enddecl(MASK_BINDINGS) + +#decl(MASK_BINDINGS_INPLACE) +@group(0) @binding(1) +var<storage, read_write> mask: array<{{MASK_TYPE}}>; + +@group(0) @binding(2) +var<uniform> params: Params; +#enddecl(MASK_BINDINGS_INPLACE) + +#decl(MASK_SINK_BINDINGS) +@group(0) @binding(1) +var<storage, read_write> mask: array<{{MASK_TYPE}}>; + +@group(0) @binding(2) +var<storage, read_write> sinks: array<f32>; + +@group(0) @binding(3) +var<storage, read_write> dst: array<f32>; + +@group(0) @binding(4) +var<uniform> params: Params; +#enddecl(MASK_SINK_BINDINGS) + +#decl(MASK_SINK_BINDINGS_INPLACE) +@group(0) @binding(1) +var<storage, read_write> mask: array<{{MASK_TYPE}}>; + +@group(0) @binding(2) +var<storage, read_write> sinks: array<f32>; + +@group(0) @binding(3) +var<uniform> params: Params; +#enddecl(MASK_SINK_BINDINGS_INPLACE) + +#decl(NOT_INPLACE) +fn inter_value(i: u32) -> f32 { + return dst[i]; +} + +fn update(i: u32, val: f32) { + dst[i] = val; +} +#enddecl(NOT_INPLACE) + +#decl(INPLACE) +fn inter_value(i: u32) -> f32 { + return src[i]; +} + +fn update(i: u32, val: f32) { + src[i] = val; +} +#enddecl(INPLACE) + +#decl(NO_MASK) +fn mask_val(i: u32) -> f32 { + return 0.0; +} +#enddecl(NO_MASK) + +#decl(MASK) +fn mask_val(i: u32) -> f32 { + return f32(mask[i]); +} +#enddecl(MASK) + +#decl(NO_SINK) +fn lower_max_bound(i2: u32) -> f32 { + return -1e30; +} + +fn add_sinks(val: f32, i2: u32, max_val: f32) -> f32 { + return val; +} +#enddecl(NO_SINK) + +#decl(SINK) +fn lower_max_bound(i2: u32) -> f32 { + return sinks[params.offset_sinks + i2]; +} + +fn add_sinks(val: f32, i2: u32, max_val: f32) -> f32 { + return val + exp(sinks[params.offset_sinks + i2] - max_val); +} +#enddecl(SINK) + +#end(DECLS) + +#define(SHADER) +enable f16; + +struct Params { + offset_src0: u32, + offset_src1: u32, + offset_sinks: u32, + offset_dst: u32, + + // Strides (in elements) + stride_src01: u32, + stride_src02: u32, + stride_src03: u32, + + stride_src11: u32, + stride_src12: u32, + stride_src13: u32, + + stride_dst1: u32, + stride_dst2: u32, + stride_dst3: u32, + + // shape of src0/dst + ne: u32, + ne0: u32, + ne1: u32, + ne2: u32, + + // shape of src1 + ne12: u32, + ne13: u32, + + scale: f32, + max_bias: f32, + n_head_log2: f32, + m0: f32, + m1: f32, +}; + +@group(0) @binding(0) +var<storage, read_write> src: array<f32>; + +DECLS + +const CACHE_SIZE: u32 = 16; + +override wg_size: u32; +var<workgroup> scratch: array<f32, wg_size>; + +@compute @workgroup_size(wg_size) +fn main(@builtin(workgroup_id) wid: vec3<u32>, + @builtin(local_invocation_id) lid: vec3<u32>) { + + var i = wid.x; + let i3 = i / (params.ne2 * params.ne1); + i = i % (params.ne2 * params.ne1); + let i2 = i / params.ne1; + let i1 = i % params.ne1; + let i_src0_row = params.offset_src0 + i3 * params.stride_src03 + i2 * params.stride_src02 + i1 * params.stride_src01; + let i_src1_row = params.offset_src1 + (i3 % params.ne13) * params.stride_src13 + (i2 % params.ne12) * params.stride_src12 + i1 * params.stride_src11; + let i_dst_row = params.offset_dst + i3 * params.stride_dst3 + i2 * params.stride_dst2 + i1 * params.stride_dst1; + let elems = (params.ne0 + wg_size - 1) / wg_size; + + let head = f32(i2); + let slope = select(1, select(pow(params.m1, 2 * (head - params.n_head_log2) + 1), pow(params.m0, head + 1), head < params.n_head_log2), params.max_bias > 0); + + var cache: array<f32, CACHE_SIZE>; + + var max_val = lower_max_bound(i2); + var col = lid.x; + for (var j: u32 = 0; j < elems; j++) { + if (col >= params.ne0) { + break; + } + let val = src[i_src0_row + col] * params.scale + slope * mask_val(i_src1_row + col); + max_val = max(max_val, val); + if (col < CACHE_SIZE) { + cache[col] = val; + } + col += wg_size; + } + + scratch[lid.x] = max_val; + workgroupBarrier(); + var offset = wg_size / 2; + while (offset > 0) { + if (lid.x < offset) { + scratch[lid.x] = max(scratch[lid.x], scratch[lid.x + offset]); + } + offset = offset / 2; + workgroupBarrier(); + } + let row_max = scratch[0]; + workgroupBarrier(); + + var sum = 0.0f; + col = lid.x; + for (var j: u32 = 0; j < elems; j++) { + if (col >= params.ne0) { + break; + } + let val = select(src[i_src0_row + col] * params.scale + slope * mask_val(i_src1_row + col), + cache[col], col < CACHE_SIZE); + let ex = exp(val - row_max); + sum += ex; + if (col < CACHE_SIZE) { + cache[col] = ex; + } else { + update(i_dst_row + col, ex); + } + col += wg_size; + } + + scratch[lid.x] = sum; + workgroupBarrier(); + offset = wg_size / 2; + while (offset > 0) { + if (lid.x < offset) { + scratch[lid.x] += scratch[lid.x + offset]; + } + offset = offset / 2; + workgroupBarrier(); + } + let row_sum = add_sinks(scratch[0], i2, row_max); + + let sum_recip = 1.0 / row_sum; + col = lid.x; + for (var j: u32 = 0; j < elems; j++) { + if (col >= params.ne0) { + break; + } + update(i_dst_row + col, select(inter_value(i_dst_row + col), cache[col], col < CACHE_SIZE) * sum_recip); + col += wg_size; + } +} +#end(SHADER) diff --git a/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl new file mode 100644 index 0000000..6ea2de9 --- /dev/null +++ b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl @@ -0,0 +1,55 @@ +@group(0) @binding(0) +var<storage, read_write> src: array<f32>; + +@group(0) @binding(1) +var<storage, read_write> dst: array<f32>; + +struct Params { + offset_src: u32, // in elements + offset_dst: u32, // in elements + + // Strides (in elements) + stride_src1: u32, + stride_src2: u32, + stride_src3: u32, + + ne0: u32, + ne1: u32, + ne2: u32 +}; + +@group(0) @binding(2) +var<uniform> params: Params; + +var<workgroup> shared_sum: array<f32, WG_SIZE>; + +@compute @workgroup_size(WG_SIZE) +fn main(@builtin(workgroup_id) wid: vec3<u32>, + @builtin(local_invocation_id) lid: vec3<u32>) { + + var i = wid.x; + let i3 = i / (params.ne2 * params.ne1); + i = i % (params.ne2 * params.ne1); + let i2 = i / params.ne1; + let i1 = i % params.ne1; + let i_src_row = params.offset_src + i3 * params.stride_src3 + i2 * params.stride_src2 + i1 * params.stride_src1; + var local_sum: f32 = 0.0; + for (var col = lid.x; col < params.ne0; col += WG_SIZE) { + local_sum += src[i_src_row + col]; + } + shared_sum[lid.x] = local_sum; + workgroupBarrier(); + // reduce within workgroup + var offset: u32 = WG_SIZE >> 1; + while (offset > 0) { + if (lid.x < offset) { + shared_sum[lid.x] = shared_sum[lid.x] + shared_sum[lid.x + offset]; + } + workgroupBarrier(); + offset >>= 1; + } + + if (lid.x == 0) { + dst[params.offset_dst + wid.x] = shared_sum[0]; + } +} diff --git a/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl new file mode 100644 index 0000000..d639d98 --- /dev/null +++ b/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl @@ -0,0 +1,179 @@ +#ifdef TYPE_F16 +enable f16; +#define TYPE f16 +#else +#define TYPE f32 +#endif + + +@group(0) @binding(0) +var<storage, read_write> src: array<TYPE>; + +#ifndef INPLACE +@group(0) @binding(1) +var<storage, read_write> dst: array<TYPE>; +#define PARAMS_BINDING 2 +#else +#define PARAMS_BINDING 1 +#endif + +struct Params { + ne: u32, // total number of elements + offset_src: u32, // in elements + offset_dst: u32, // in elements + + // Strides (in elements) + stride_src0: u32, + stride_src1: u32, + stride_src2: u32, + stride_src3: u32, + + // Logical shapes + ne0: u32, + ne1: u32, + ne2: u32, +#ifdef CLAMP + clamp_min: f32, + clamp_max: f32, +#endif +#ifdef FILL + fill_val: f32, +#endif +#ifdef XIELU + alpha_n: f32, + alpha_p: f32, + beta: f32, + eps: f32, +#endif + +}; + +@group(0) @binding(PARAMS_BINDING) +var<uniform> params: Params; + +@compute @workgroup_size(WG_SIZE) +fn main(@builtin(global_invocation_id) gid: vec3<u32>) { + if (gid.x >= params.ne) { + return; + } + var i = gid.x; + let i3 = i / (params.ne2 * params.ne1 * params.ne0); + i = i % (params.ne2 * params.ne1 * params.ne0); + let i2 = i / (params.ne1 * params.ne0); + i = i % (params.ne1 * params.ne0); + let i1 = i / params.ne0; + let i0 = i % params.ne0; + + let src_idx = i0 * params.stride_src0 + i1 * params.stride_src1 + + i2 * params.stride_src2 + i3 * params.stride_src3; + +#ifdef ABS + let res = abs(src[params.offset_src + src_idx]); +#endif +#ifdef SGN + let res = select(TYPE(select(0.0, -1.0, src[params.offset_src + src_idx] < 0.0)), TYPE(1.0), + src[params.offset_src + src_idx] > 0.0); +#endif +#ifdef NEG + let res = -src[params.offset_src + src_idx]; +#endif +#ifdef STEP + let res = TYPE(select(0.0, 1.0, src[params.offset_src + src_idx] > 0.0)); +#endif +#ifdef TANH + let res = tanh(clamp(src[params.offset_src + src_idx], -9.010913, 9.010913)); +#endif +#ifdef RELU + let res = select(0.0, src[params.offset_src + src_idx], src[params.offset_src + src_idx] > 0.0); +#endif +#ifdef ELU + let res = select(exp(src[params.offset_src + src_idx]) - 1.0, src[params.offset_src + src_idx], + src[params.offset_src + src_idx] > 0.0); +#endif +#ifdef HARDSIGMOID + let res = min(1.0, max(0.0, (src[params.offset_src + src_idx] + 3.0) / 6.0)); +#endif +#ifdef SIGMOID + let res = 1.0 / (1.0 + exp(-src[params.offset_src + src_idx])); +#endif +#ifdef SILU + let res = src[params.offset_src + src_idx] / (1.0 + exp(-src[params.offset_src + src_idx])); +#endif +#ifdef EXP + let res = exp(src[params.offset_src + src_idx]); +#endif +#ifdef LOG + let res = TYPE(log(f32(src[params.offset_src + src_idx]))); +#endif +#ifdef CLAMP + let res = clamp(src[params.offset_src + src_idx], TYPE(params.clamp_min), TYPE(params.clamp_max)); +#endif +#ifdef FILL + let res = TYPE(params.fill_val); +#endif +#ifdef HARDSWISH + let res = src[params.offset_src + src_idx] * + min(1.0, max(0.0, (src[params.offset_src + src_idx] + 3.0) / 6.0)); +#endif +#ifdef GELU + let res = 0.5 * src[params.offset_src + src_idx] * + (1.0 + tanh(clamp(sqrt(2.0 / 3.14159265) * + (src[params.offset_src + src_idx] + + 0.044715 * pow(src[params.offset_src + src_idx], 3.0)), + -9.010913, 9.010913))); +#endif +#ifdef GELU_QUICK + let res = src[params.offset_src + src_idx] * 0.5 * + (1.0 + tanh(clamp(0.79788456 * + (src[params.offset_src + src_idx] + + 0.044715 * src[params.offset_src + src_idx] * + src[params.offset_src + src_idx] * src[params.offset_src + src_idx]), + -9.010913, 9.010913))); +#endif +#ifdef GELU_ERF + let res = 0.5 * src[params.offset_src + src_idx] * + (1.0 + tanh(clamp(0.79788456 * + (src[params.offset_src + src_idx] + + 0.044715 * src[params.offset_src + src_idx] * + src[params.offset_src + src_idx] * src[params.offset_src + src_idx]), + -9.010913, 9.010913))); +#endif +#ifdef XIELU + let res = + select(((exp(min(src[params.offset_src + src_idx], TYPE(params.eps))) - 1.0) - + src[params.offset_src + src_idx]) * + TYPE(params.alpha_n) + + TYPE(params.beta) * src[params.offset_src + src_idx], + TYPE(params.alpha_p) * src[params.offset_src + src_idx] * + src[params.offset_src + src_idx] + + TYPE(params.beta) * src[params.offset_src + src_idx], + src[params.offset_src + src_idx] > 0.0); +#endif +#ifdef SOFTPLUS + let src_f32 = f32(src[params.offset_src + src_idx]); + let res = TYPE(select(log(1.0 + exp(src_f32)), src_f32, src_f32 > 20.0)); +#endif +#ifdef EXPM1 + let res = exp(src[params.offset_src + src_idx]) - 1.0; +#endif +#ifdef FLOOR + let res = floor(src[params.offset_src + src_idx]); +#endif +#ifdef CEIL + let res = ceil(src[params.offset_src + src_idx]); +#endif +#ifdef ROUND + let src_f32 = f32(src[params.offset_src + src_idx]); + let result = select(ceil(src_f32 - 0.5), floor(src_f32 + 0.5), src_f32 >= 0.0); + let res = TYPE(result); +#endif +#ifdef TRUNC + let res = trunc(src[params.offset_src + src_idx]); +#endif + +#ifdef INPLACE + src[params.offset_src + src_idx] = res; +#else + dst[params.offset_dst + gid.x] = res; +#endif +} |
