#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif

#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif

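// Block sizes and packing ratios for the ggml quantization formats.
// QK_* is the number of weights per quantization block; QR_* appears to be the
// number of weights packed into each byte of quantized data (two nibbles per
// byte for the 4/5-bit formats, one byte per weight for Q8_0).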
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define QR4_1 2
#define QK5_0 32
#define QR5_0 2
#define QK5_1 32
#define QR5_1 2
#define QK8_0 32
#define QR8_0 1
#define QK_K 256
#define K_QUANTS_PER_ITERATION 2

typedef char int8_t;
typedef uchar uint8_t;
typedef short int16_t;
typedef ushort uint16_t;
typedef int int32_t;
typedef uint uint32_t;

//------------------------------------------------------------------------------
// block_q4_0
//------------------------------------------------------------------------------
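// One block packs QK4_0 = 32 weights: a half-precision scale d followed by
// 16 bytes of 4-bit quants. qs[i] stores weight i in its low nibble and
// weight i + 16 in its high nibble; a weight dequantizes as d * (q - 8).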
struct block_q4_0
{
    half d;
    uint8_t qs[QK4_0 / 2];
};

//
// This variant unrolls the loops and uses vector types instead of pointers.
// It improves performance on Adreno but not so much on Intel.
//
inline float block_q_4_0_dot_y_v(
    global struct block_q4_0 * qb_curr,
    float sumy,
    float16 yl,
    int il
) {
    float d = qb_curr->d;
    float acc = 0.f;
    global ushort * qs = ((global ushort *)qb_curr + 1 + il/2);

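    // Each ushort in qs holds four 4-bit quants. The masks below pick one
    // nibble at a time without shifting: the caller pre-scales yl by 1/256,
    // 1/16 or 1/4096 so that multiplying by the still-shifted nibble yields
    // the correctly scaled product. The -8 offset of Q4_0 is folded into the
    // single term sumy * -8.f in the return statement.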
    acc += yl.s0 * (qs[0] & 0x000F);
    acc += yl.s1 * (qs[0] & 0x0F00);
    acc += yl.s8 * (qs[0] & 0x00F0);
    acc += yl.s9 * (qs[0] & 0xF000);

    acc += yl.s2 * (qs[1] & 0x000F);
    acc += yl.s3 * (qs[1] & 0x0F00);
    acc += yl.sa * (qs[1] & 0x00F0);
    acc += yl.sb * (qs[1] & 0xF000);

    acc += yl.s4 * (qs[2] & 0x000F);
    acc += yl.s5 * (qs[2] & 0x0F00);
    acc += yl.sc * (qs[2] & 0x00F0);
    acc += yl.sd * (qs[2] & 0xF000);

    acc += yl.s6 * (qs[3] & 0x000F);
    acc += yl.s7 * (qs[3] & 0x0F00);
    acc += yl.se * (qs[3] & 0x00F0);
    acc += yl.sf * (qs[3] & 0xF000);

    return d * (sumy * -8.f + acc);
}
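
//
// For clarity: a minimal scalar sketch of the same half-block dot product,
// assuming the Q4_0 layout described above. block_q_4_0_dot_y_ref is a
// hypothetical helper added here for illustration only; it is not called by
// the kernel below. yb must point at the activations laid out the same way
// the vectorized variant expects (offsets 0..7 and 16..23 are used).
//
inline float block_q_4_0_dot_y_ref(
    global struct block_q4_0 * qb_curr,
    global float * yb,
    int il // 0 or 8: which half of the block
) {
    float d = qb_curr->d;
    float acc = 0.f;
    for (int i = 0; i < 8; i++) {
        uint8_t q = qb_curr->qs[il + i];
        acc += yb[i]      * (float)((q & 0x0F) - 8); // low nibble  -> weight il + i
        acc += yb[i + 16] * (float)((q >> 4)   - 8); // high nibble -> weight il + i + 16
    }
    return d * acc;
}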

#undef N_DST
#undef N_SIMDGROUP
#undef N_SIMDWIDTH

#ifdef INTEL_GPU
#define N_DST 4 // each SIMD group works on 4 rows
#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
#define N_SIMDWIDTH 16 // assuming SIMD group size is 16
#elif defined (ADRENO_GPU)
#define N_DST 4
#define N_SIMDGROUP 1
#define N_SIMDWIDTH 64
#endif
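
// Each SIMD group (subgroup) computes N_DST consecutive output rows; within a
// group, every lane handles half of a quantization block per loop iteration,
// so the N_SIMDWIDTH lanes cover N_SIMDWIDTH/2 blocks at a time.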

inline void mul_vec_q_n_f32_v(
    global void * src0,
    global float * src1,
    global float * dst,
    int ne00,
    int ne01,
    int ne02,
    int ne10,
    int ne12,
    int ne0,
    int ne1,
    int r2,
    int r3
) {
    const ulong nb = ne00/QK4_0;

    int r0 = get_group_id(0);
    int r1 = get_group_id(1);
    int im = get_group_id(2);

    // (r0 * N_SIMDGROUP + get_sub_group_id()) is essentially the linear global
    // id of a SIMD group in the grid.
    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;

    int i12 = im%ne12;
    int i13 = im/ne12;

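    // Offset (in blocks) of the first src0 row this SIMD group works on.
    // The i12/r2 and i13/r3 terms broadcast src0 across the batch dimensions
    // when src1 has r2 (resp. r3) times more batches than src0.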
    ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);

    global struct block_q4_0 * x = (global struct block_q4_0 *) src0 + offset0;
    global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;

    float16 yl; // src1 vector cache
    float4 sumf = (float4)(0.f, 0.f, 0.f, 0.f);

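    // ix: which block this lane starts on; il: which half of the block (0 or 8),
    // i.e. even lanes take the first half and odd lanes take the second half.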
    int ix = get_sub_group_local_id()/2;
    int il = 8*(get_sub_group_local_id()%2);

    global float * yb = y + ix * QK4_0 + il;

    // each thread in a SIMD group deals with half a block.
    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
        float sumy = 0;

        sumy += yb[0];
        sumy += yb[1];
        sumy += yb[2];
        sumy += yb[3];
        sumy += yb[4];
        sumy += yb[5];
        sumy += yb[6];
        sumy += yb[7];

        sumy += yb[16];
        sumy += yb[17];
        sumy += yb[18];
        sumy += yb[19];
        sumy += yb[20];
        sumy += yb[21];
        sumy += yb[22];
        sumy += yb[23];

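        // Cache the activations for this half block. In block_q_4_0_dot_y_v the
        // second operand of each product is a nibble still shifted left by 0, 4,
        // 8 or 12 bits, so the corresponding y values are pre-divided by 1, 16,
        // 256 or 4096 here to cancel that shift.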
        yl.s0 = yb[0];
        yl.s1 = yb[1]/256.f;

        yl.s2 = yb[2];
        yl.s3 = yb[3]/256.f;

        yl.s4 = yb[4];
        yl.s5 = yb[5]/256.f;

        yl.s6 = yb[6];
        yl.s7 = yb[7]/256.f;

        yl.s8 = yb[16]/16.f;
        yl.s9 = yb[17]/4096.f;

        yl.sa = yb[18]/16.f;
        yl.sb = yb[19]/4096.f;

        yl.sc = yb[20]/16.f;
        yl.sd = yb[21]/4096.f;

        yl.se = yb[22]/16.f;
        yl.sf = yb[23]/4096.f;

        sumf.s0 += block_q_4_0_dot_y_v(x+ib+0*nb, sumy, yl, il);
        sumf.s1 += block_q_4_0_dot_y_v(x+ib+1*nb, sumy, yl, il);
        sumf.s2 += block_q_4_0_dot_y_v(x+ib+2*nb, sumy, yl, il);
        sumf.s3 += block_q_4_0_dot_y_v(x+ib+3*nb, sumy, yl, il);

        // One thread in a SIMD group (i.e., subgroup) handles half a block,
        // hence the entire SIMD group handles N_SIMDWIDTH/2 blocks per iteration.
        // y points to the activation matrix (of type float), so each thread must
        // advance yb by N_SIMDWIDTH/2 blocks per iteration - in terms of floats,
        // that is QK4_0 * (N_SIMDWIDTH/2), where QK4_0 is the block size.
        yb += QK4_0 * (N_SIMDWIDTH/2);
    }

    // The above does not work for Adreno - it produces incorrect results for
    // rows 1, 2 and 3; only row 0 gives the correct result.
    // If N_DST is changed, the vector below must be initialized accordingly.
    // This also seems to perform better on Intel.
    float4 tot = (float4)(
        sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
        sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3)
    );

    if (get_sub_group_local_id() == 0) {
        if (first_row + 0 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
        }
        if (first_row + 1 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
        }
        if (first_row + 2 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
        }
        if (first_row + 3 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
        }
    }
}

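// Expected launch geometry (inferred from the indexing above): one work-group
// per group of N_DST output rows in dimension 0, one per src1 column in
// dimension 1, and one per batch in dimension 2, with the subgroup size fixed
// to N_SIMDWIDTH below.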
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mat_q4_0_f32_v(
    global void * src0,
    ulong offset0,
    global float * src1,
    ulong offset1,
    global float * dst,
    ulong offsetd,
    int ne00,
    int ne01,
    int ne02,
    int ne10,
    int ne12,
    int ne0,
    int ne1,
    int r2,
    int r3
) {
    src0 = (global void*)((global char*)src0 + offset0);
    src1 = (global float*)((global char*)src1 + offset1);
    dst = (global float*)((global char*)dst + offsetd);

    mul_vec_q_n_f32_v(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
}