1#extension GL_EXT_shader_16bit_storage : require
2#extension GL_EXT_control_flow_attributes : require
3
// Push-constant block read by the index helpers in this file (instance name: p).
// Member order is significant: it must stay in sync with whatever the host pushes.
layout (push_constant) uniform parameter
{
    // Not referenced in the visible part of this file;
    // presumably the total number of elements to process -- TODO confirm against host code.
    uint ne;

    // Source tensor: per-dimension extents (ne0x) and strides (nb0x), consumed by src0_idx*().
    uint ne00;
    uint ne01;
    uint ne02;
    uint ne03;
    uint nb00;
    uint nb01;
    uint nb02;
    uint nb03;

    // Destination tensor: per-dimension extents (ne1x) and strides (nb1x), consumed by dst_idx*().
    uint ne10;
    uint ne11;
    uint ne12;
    uint ne13;
    uint nb10;
    uint nb11;
    uint nb12;
    uint nb13;

    // Two 16-bit fields: the upper half is returned by get_aoffset(),
    // the lower half by get_doffset().
    uint misalign_offsets;

    // Operation-specific scalars; not referenced in the visible part of this file.
    float param1;
    float param2;

    // fastdiv() multiplier (mp) / shift (L) pairs used by src0_idx*() to split a flat
    // index into i03, i02 and i01 (presumably dividing by ne02*ne01*ne00, ne01*ne00
    // and ne00 respectively -- see init_fastdiv_values in ggml-vulkan.cpp).
    uint ne0_012mp;
    uint ne0_012L;
    uint ne0_01mp;
    uint ne0_01L;
    uint ne0_0mp;
    uint ne0_0L;

    // Same as above for the destination, used by dst_idx*() to obtain i13, i12 and i11.
    uint ne1_012mp;
    uint ne1_012L;
    uint ne1_01mp;
    uint ne1_01L;
    uint ne1_0mp;
    uint ne1_0L;
} p;
19
// Read-only input buffer at binding 0, viewed as an array of A_TYPE.
layout (binding = 0) readonly buffer A
{
    A_TYPE data_a[];
};

// Optional alternative views of the same binding; each one is only declared
// when the corresponding type macro has been defined by the including shader/build.
#ifdef A_TYPE_PACKED16
layout (binding = 0) readonly buffer A_PACKED16
{
    A_TYPE_PACKED16 data_a_packed16[];
};
#endif
#ifdef A_TYPE_PACKED32
layout (binding = 0) readonly buffer A_PACKED32
{
    A_TYPE_PACKED32 data_a_packed32[];
};
#endif
27
// Write-only output buffer at binding 1, viewed as an array of D_TYPE.
layout (binding = 1) writeonly buffer D
{
    D_TYPE data_d[];
};
29
// Collapses the 3D global invocation ID into one flat index:
//   idx = z * 262144 + y * 512 + x      (262144 == 512 * 512)
// written here in nested (Horner) form, which yields the same value in
// wrapping 32-bit unsigned arithmetic.
uint get_idx() {
    const uvec3 gid = gl_GlobalInvocationID;
    return (gid.z * 512 + gid.y) * 512 + gid.x;
}
33
// Returns the upper 16 bits of p.misalign_offsets.
// NOTE(review): by its name this is presumably the misalignment offset for buffer A -- confirm with callers.
uint get_aoffset() {
    const uint packed_offsets = p.misalign_offsets;
    return packed_offsets >> 16;
}
// Returns the lower 16 bits of p.misalign_offsets.
// NOTE(review): by its name this is presumably the misalignment offset for buffer D -- confirm with callers.
uint get_doffset() {
    const uint packed_offsets = p.misalign_offsets;
    return packed_offsets & 0xFFFFu;
}
36
// Multiply-high-and-shift replacement for an integer division.
// `mp` (multiplier) and `L` (shift) are precomputed for the divisor on the host;
// see init_fastdiv_values in ggml-vulkan.cpp for how they are derived.
// Returns (mulhi(n, mp) + n) >> L.
uint fastdiv(uint n, uint mp, uint L) {
    uint product_hi;  // upper 32 bits of n * mp, i.e. mulhi(n, mp)
    uint product_lo;  // lower 32 bits; umulExtended requires it but it is not used
    umulExtended(n, mp, product_hi, product_lo);

    const uint biased = product_hi + n;
    return biased >> L;
}
44
// Maps a flat element index to an offset into the source buffer.
// The index is split into four coordinates (i03 outermost .. i00 innermost) using the
// precomputed fastdiv constants in p, then each coordinate is scaled by its stride nb0x.
uint src0_idx(uint idx) {
    // Part of idx not yet attributed to an outer coordinate.
    uint rest = idx;

    const uint i03 = fastdiv(rest, p.ne0_012mp, p.ne0_012L);
    rest -= i03 * p.ne02*p.ne01*p.ne00;

    const uint i02 = fastdiv(rest, p.ne0_01mp, p.ne0_01L);
    rest -= i02*p.ne01*p.ne00;

    const uint i01 = fastdiv(rest, p.ne0_0mp, p.ne0_0L);
    const uint i00 = rest - i01*p.ne00;

    return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i00*p.nb00;
}
54
// Maps a flat element index to an offset into the destination buffer.
// The index is split into four coordinates (i13 outermost .. i10 innermost) using the
// precomputed fastdiv constants in p, then each coordinate is scaled by its stride nb1x.
uint dst_idx(uint idx) {
    // Part of idx not yet attributed to an outer coordinate.
    uint rest = idx;

    const uint i13 = fastdiv(rest, p.ne1_012mp, p.ne1_012L);
    rest -= i13 * p.ne12*p.ne11*p.ne10;

    const uint i12 = fastdiv(rest, p.ne1_01mp, p.ne1_01L);
    rest -= i12*p.ne11*p.ne10;

    const uint i11 = fastdiv(rest, p.ne1_0mp, p.ne1_0L);
    const uint i10 = rest - i11*p.ne10;

    return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10;
}
64
// Variant of src0_idx() in which the innermost coordinate is divided by `qk`
// before being scaled by nb00, so that qk consecutive innermost elements map
// to the same offset (presumably one quantization block -- confirm with callers).
uint src0_idx_quant(uint idx, uint qk) {
    // Part of idx not yet attributed to an outer coordinate.
    uint rest = idx;

    const uint i03 = fastdiv(rest, p.ne0_012mp, p.ne0_012L);
    rest -= i03 * p.ne02*p.ne01*p.ne00;

    const uint i02 = fastdiv(rest, p.ne0_01mp, p.ne0_01L);
    rest -= i02*p.ne01*p.ne00;

    const uint i01 = fastdiv(rest, p.ne0_0mp, p.ne0_0L);
    const uint i00 = rest - i01*p.ne00;

    const uint block = i00 / qk;
    return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + block*p.nb00;
}
74
// Variant of dst_idx() in which the innermost coordinate is divided by `qk`
// before being scaled by nb10, so that qk consecutive innermost elements map
// to the same offset (presumably one quantization block -- confirm with callers).
uint dst_idx_quant(uint idx, uint qk) {
    // Part of idx not yet attributed to an outer coordinate.
    uint rest = idx;

    const uint i13 = fastdiv(rest, p.ne1_012mp, p.ne1_012L);
    rest -= i13 * p.ne12*p.ne11*p.ne10;

    const uint i12 = fastdiv(rest, p.ne1_01mp, p.ne1_01L);
    rest -= i12*p.ne11*p.ne10;

    const uint i11 = fastdiv(rest, p.ne1_0mp, p.ne1_0L);
    const uint i10 = rest - i11*p.ne10;

    const uint block = i10 / qk;
    return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + block*p.nb10;
}