#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif

#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif

#define QK4_0                   32
#define QR4_0                   2
#define QK4_1                   32
#define QR4_1                   2
#define QK5_0                   32
#define QR5_0                   2
#define QK5_1                   32
#define QR5_1                   2
#define QK8_0                   32
#define QR8_0                   1
#define QK_K                    256
#define K_QUANTS_PER_ITERATION  2

typedef char int8_t;
typedef uchar uint8_t;
typedef short int16_t;
typedef ushort uint16_t;
typedef int int32_t;
typedef uint uint32_t;

//------------------------------------------------------------------------------
// block_q4_0
//------------------------------------------------------------------------------
struct block_q4_0
{
    half d;
    uint8_t qs[QK4_0 / 2];
};

//
// This variant unrolls the loops and uses vector types instead of pointers.
// It improves performance on Adreno but not so much on Intel.
//
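//
// The qs bytes are read as ushorts and the nibble masks are applied without
// shifting; the y values that pair with the high bytes and high nibbles are
// pre-scaled by 1/256, 1/16 and 1/4096 in the caller, so a single mask and
// multiply replaces the usual shift-and-mask per weight.
//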
inline float block_q_4_0_dot_y_v(
        global struct block_q4_0 * qb_curr,
        float sumy,
        float16 yl,
        int il
) {
    float d = qb_curr->d;
    float acc = 0.f;
    global ushort * qs = ((global ushort *)qb_curr + 1 + il/2);

    acc += yl.s0 * (qs[0] & 0x000F);
    acc += yl.s1 * (qs[0] & 0x0F00);
    acc += yl.s8 * (qs[0] & 0x00F0);
    acc += yl.s9 * (qs[0] & 0xF000);

    acc += yl.s2 * (qs[1] & 0x000F);
    acc += yl.s3 * (qs[1] & 0x0F00);
    acc += yl.sa * (qs[1] & 0x00F0);
    acc += yl.sb * (qs[1] & 0xF000);

    acc += yl.s4 * (qs[2] & 0x000F);
    acc += yl.s5 * (qs[2] & 0x0F00);
    acc += yl.sc * (qs[2] & 0x00F0);
    acc += yl.sd * (qs[2] & 0xF000);

    acc += yl.s6 * (qs[3] & 0x000F);
    acc += yl.s7 * (qs[3] & 0x0F00);
    acc += yl.se * (qs[3] & 0x00F0);
    acc += yl.sf * (qs[3] & 0xF000);

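    // q4_0 dequantizes as d * (q - 8), so the dot product over this half block
    // is d * (acc - 8 * sumy).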
    return d * (sumy * -8.f + acc);
}

#undef N_DST
#undef N_SIMDGROUP
#undef N_SIMDWIDTH

#ifdef INTEL_GPU
#define N_DST 4 // each SIMD group works on 4 rows
#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
#define N_SIMDWIDTH 16 // assuming SIMD group size is 16
#elif defined (ADRENO_GPU)
#define N_DST 4
#define N_SIMDGROUP 1
#define N_SIMDWIDTH 64
#endif

inline void mul_vec_q_n_f32_v(
        global void * src0,
        global float * src1,
        global float * dst,
        int ne00,
        int ne01,
        int ne02,
        int ne10,
        int ne12,
        int ne0,
        int ne1,
        int r2,
        int r3
) {
    const ulong nb = ne00/QK4_0;

    int r0 = get_group_id(0);
    int r1 = get_group_id(1);
    int im = get_group_id(2);

    // (r0 * N_SIMDGROUP + get_sub_group_id()) is essentially the linear global
    // id of a SIMD group in the grid.
    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;

    int i12 = im%ne12;
    int i13 = im/ne12;

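    // Block offset of the first row handled by this SIMD group; i12/r2 and
    // i13/r3 account for src0 being broadcast across the batch dimensions.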
    ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);

    global struct block_q4_0 * x = (global struct block_q4_0 *) src0 + offset0;
    global float             * y = (global float             *) src1 + r1*ne10 + im*ne00*ne1;

    float16 yl;       // src1 vector cache
    float4 sumf = (float4)(0.f, 0.f, 0.f, 0.f);

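    // Two adjacent lanes share one block: ix selects the block, and il (0 or 8)
    // selects which 8-element slice of each 16-element half the lane handles.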
    int ix = get_sub_group_local_id()/2;
    int il = 8*(get_sub_group_local_id()%2);

    global float * yb = y + ix * QK4_0 + il;

    // each thread in a SIMD group deals with half a block.
    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
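        // sumy is the sum of the 16 y values this lane touches; it feeds the
        // -8 quant offset inside block_q_4_0_dot_y_v.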
        float sumy = 0;

        sumy += yb[0];
        sumy += yb[1];
        sumy += yb[2];
        sumy += yb[3];
        sumy += yb[4];
        sumy += yb[5];
        sumy += yb[6];
        sumy += yb[7];

        sumy += yb[16];
        sumy += yb[17];
        sumy += yb[18];
        sumy += yb[19];
        sumy += yb[20];
        sumy += yb[21];
        sumy += yb[22];
        sumy += yb[23];

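        // Pre-scale the y values that pair with the high bytes / high nibbles
        // of qs so the unshifted masked values in block_q_4_0_dot_y_v line up.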
        yl.s0 = yb[0];
        yl.s1 = yb[1]/256.f;

        yl.s2 = yb[2];
        yl.s3 = yb[3]/256.f;

        yl.s4 = yb[4];
        yl.s5 = yb[5]/256.f;

        yl.s6 = yb[6];
        yl.s7 = yb[7]/256.f;

        yl.s8 = yb[16]/16.f;
        yl.s9 = yb[17]/4096.f;

        yl.sa = yb[18]/16.f;
        yl.sb = yb[19]/4096.f;

        yl.sc = yb[20]/16.f;
        yl.sd = yb[21]/4096.f;

        yl.se = yb[22]/16.f;
        yl.sf = yb[23]/4096.f;

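        // Accumulate the partial dot product for N_DST consecutive rows;
        // row k of this group starts k*nb blocks further into src0.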
        sumf.s0 += block_q_4_0_dot_y_v(x+ib+0*nb, sumy, yl, il);
        sumf.s1 += block_q_4_0_dot_y_v(x+ib+1*nb, sumy, yl, il);
        sumf.s2 += block_q_4_0_dot_y_v(x+ib+2*nb, sumy, yl, il);
        sumf.s3 += block_q_4_0_dot_y_v(x+ib+3*nb, sumy, yl, il);

        // One thread in a SIMD group (i.e., subgroup) handles half a block,
        // hence the entire SIMD group handles SIMDWIDTH/2 blocks.
        // y points to the activation matrix (of type float). Therefore, each
        // thread must advance y by SIMDWIDTH/2 blocks per iteration (the number
        // of blocks processed by the SIMD group), which is QK4_0 * (SIMDWIDTH/2)
        // floats, where QK4_0 is the block size.
        yb += QK4_0 * (N_SIMDWIDTH/2);
    }

    // The above does not work for Adreno - it produces incorrect results for
    // row = 1, 2, 3 and only row = 0 gives the correct result.
    // If N_DST is changed, the below array must be initialized accordingly.
    // This also seems to perform better on Intel.
    float4 tot = (float4)(
        sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
        sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3)
    );

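    // Lane 0 of the SIMD group writes the reduced results, skipping any rows
    // that fall past ne01 for the last group.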
    if (get_sub_group_local_id() == 0) {
        if (first_row + 0 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
        }
        if (first_row + 1 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
        }
        if (first_row + 2 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
        }
        if (first_row + 3 < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
        }
    }
}

#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mat_q4_0_f32_v(
        global void * src0,
        ulong offset0,
        global float * src1,
        ulong offset1,
        global float * dst,
        ulong offsetd,
        int ne00,
        int ne01,
        int ne02,
        int ne10,
        int ne12,
        int ne0,
        int ne1,
        int r2,
        int r3
) {
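    // Apply the byte offsets to obtain the actual tensor base pointers.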
    src0 = (global void*)((global char*)src0 + offset0);
    src1 = (global float*)((global char*)src1 + offset1);
    dst = (global float*)((global char*)dst + offsetd);

    mul_vec_q_n_f32_v(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
}