#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif

#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
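
// The attributes above pin a kernel's subgroup (SIMD) width at compile time:
// 16 or 32 lanes on Intel GPUs, and a "half" or "full" wave on Qualcomm
// Adreno (64 and 128 lanes respectively, as the macro names record). The
// kernels below rely on the chosen width via N_SIMDWIDTH.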

// Quantization block geometry (ggml conventions): QKn is the number of
// weights per quantization block and QRn is the number of quantized values
// packed into each byte of quant data (e.g. two 4-bit quants per byte for
// q4_0). Only the q4_0 types are used in this file.
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define QR4_1 2
#define QK5_0 32
#define QR5_0 2
#define QK5_1 32
#define QR5_1 2
#define QK8_0 32
#define QR8_0 1
#define QK_K 256
#define K_QUANTS_PER_ITERATION 2

// OpenCL C does not provide <stdint.h>, so alias the fixed-width names onto
// the built-in scalar types.
typedef char int8_t;
typedef uchar uint8_t;
typedef short int16_t;
typedef ushort uint16_t;
typedef int int32_t;
typedef uint uint32_t;

//------------------------------------------------------------------------------
// block_q4_0
//------------------------------------------------------------------------------
// 32 4-bit weights per block: d is the fp16 scale, and each byte of qs packs
// two quants (low nibble = element i, high nibble = element i + QK4_0/2).
// The effective weight is d * (q - 8).
struct block_q4_0
{
    half d;
    uint8_t qs[QK4_0 / 2];
};
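
// For reference only: a scalar dequantization sketch for one q4_0 block,
// following the layout described above. This helper is illustrative (the
// name dequantize_q4_0_ref is ours) and is not used by the kernel below,
// which instead folds the -8 offset and the nibble shifts into the
// activations.
inline void dequantize_q4_0_ref(global const struct block_q4_0 * b, private float * out) {
    const float d = b->d;
    for (int i = 0; i < QK4_0/2; ++i) {
        out[i]           = d * ((b->qs[i] & 0x0F) - 8); // low nibble: elements 0..15
        out[i + QK4_0/2] = d * ((b->qs[i] >>   4) - 8); // high nibble: elements 16..31
    }
}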

//------------------------------------------------------------------------------
// mul_vec_q_n_f32
//------------------------------------------------------------------------------
// Computes the inner product between half a q4_0 block and 16 floats (yl);
// sumy is SUM(yl[i]).
// il indicates where the q4 quants begin (0 or QK4_0/4).
// We assume the yl's have been multiplied with the appropriate scale factor
// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096).
inline float block_q_4_0_dot_y(
    global struct block_q4_0 * qb_curr,
    float sumy,
    private float * yl,
    int il
) {
    float d = qb_curr->d;
    float2 acc = 0.f;
    // Skip the leading half (d) and, when il != 0, the first 4 ushorts of
    // quant data.
    global ushort * qs = ((global ushort *)qb_curr + 1 + il/2);
    for (int i = 0; i < 8; i += 2) {
        acc.s0 += yl[i + 0] * (qs[i / 2] & 0x000F)
                + yl[i + 1] * (qs[i / 2] & 0x0F00);
        acc.s1 += yl[i + 8] * (qs[i / 2] & 0x00F0)
                + yl[i + 9] * (qs[i / 2] & 0xF000);
    }
    // Each quant carries an implicit -8 offset; sumy is the sum of the
    // unscaled activations, so the offset of all 16 products folds into a
    // single sumy * -8.f term.
    return d * (sumy * -8.f + acc.s0 + acc.s1);
}
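
// Worked example of the pre-scaling trick used by this helper and the loader
// below: for a quant q held in bits 8..11 of a ushort,
//
//   (qs[i/2] & 0x0F00) == q << 8 == q * 256
//   yl[i+1]            == y / 256
//   => yl[i+1] * (qs[i/2] & 0x0F00) == q * y
//
// so the nibble never needs an explicit shift. The 1/16 and 1/4096 factors
// play the same role for the nibbles at bits 4..7 and 12..15.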

#ifdef INTEL_GPU
#define N_DST 4 // each SIMD group works on 4 rows
#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
#define N_SIMDWIDTH 16 // assuming SIMD group size is 16
#elif defined (ADRENO_GPU)
#define N_DST 4
#define N_SIMDGROUP 1
#define N_SIMDWIDTH 64
#endif
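
// With N_SIMDGROUP == 1 the work-group is a single subgroup, so each
// work-group (equivalently, each SIMD group) produces N_DST consecutive rows
// of the output.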

inline void mul_vec_q_n_f32(
    global void * src0,
    global float * src1,
    global float * dst,
    int ne00,
    int ne01,
    int ne02,
    int ne10,
    int ne12,
    int ne0,
    int ne1,
    int r2,
    int r3
) {

    const ulong nb = ne00/QK4_0;

    int r0 = get_group_id(0);
    int r1 = get_group_id(1);
    int im = get_group_id(2);

    // (r0 * N_SIMDGROUP + get_sub_group_id()) is essentially the linear global
    // id of a SIMD group in the grid.
    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;

    // Batch indices; r2 and r3 are the broadcast ratios along those dims.
    int i12 = im%ne12;
    int i13 = im/ne12;

    ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);

    global struct block_q4_0 * x = (global struct block_q4_0 *) src0 + offset0;
    global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;

    float yl[16]; // src1 vector cache
    float sumf[N_DST]={0.f};

    int ix = get_sub_group_local_id()/2;
    int il = 8*(get_sub_group_local_id()%2);

    global float * yb = y + ix * QK4_0 + il;

    // each thread in a SIMD group deals with half a block.
    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
        float sumy = 0;
        // Cache 16 activations, pre-scaled to match the unshifted nibble
        // masks in block_q_4_0_dot_y (see the worked example above); sumy
        // accumulates the unscaled values.
        for (int i = 0; i < 8; i += 2) {
            sumy += yb[i] + yb[i+1];
            yl[i+0] = yb[i+ 0];
            yl[i+1] = yb[i+ 1]/256.f;
            sumy += yb[i+16] + yb[i+17];
            yl[i+8] = yb[i+16]/16.f;
            yl[i+9] = yb[i+17]/4096.f;
        }

        for (int row = 0; row < N_DST; row++) {
            sumf[row] += block_q_4_0_dot_y(x+ib+row*nb, sumy, yl, il);
        }

        // One thread in a SIMD group (i.e., subgroup) handles half a block,
        // hence the entire SIMD group handles N_SIMDWIDTH/2 blocks per
        // iteration. y points to the activation matrix (of type float), so
        // each thread advances by N_SIMDWIDTH/2 blocks - in floats, that is
        // QK4_0 * (N_SIMDWIDTH/2), where QK4_0 is the block size.
        yb += QK4_0 * (N_SIMDWIDTH/2);
    }

    // Reducing sumf[row] inside a per-row loop does not work for Adreno - it
    // produces incorrect results for row = 1, 2, 3 and only row = 0 gives
    // the correct result, so the reductions are written out explicitly.
    // If N_DST is changed, the array below must be initialized accordingly.
    // This also seems to perform better on Intel.
    float tot[N_DST] = {
        sub_group_reduce_add(sumf[0]), sub_group_reduce_add(sumf[1]),
        sub_group_reduce_add(sumf[2]), sub_group_reduce_add(sumf[3])};
    for (int row = 0; row < N_DST; ++row) {
        if (get_sub_group_local_id() == 0 && first_row + row < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot[row];
        }
    }
}

#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mat_q4_0_f32(
    global void * src0,
    ulong offset0,
    global float * src1,
    ulong offset1,
    global float * dst,
    ulong offsetd,
    int ne00,
    int ne01,
    int ne02,
    int ne10,
    int ne12,
    int ne0,
    int ne1,
    int r2,
    int r3
) {
    src0 = (global void*)((global char*)src0 + offset0);
    src1 = (global float*)((global char*)src1 + offset1);
    dst = (global float*)((global char*)dst + offsetd);

    mul_vec_q_n_f32(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
}
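
// Host-side launch sketch (an assumption for illustration; the host code is
// not part of this file). With N_SIMDGROUP == 1, dimension 0 covers the ne01
// output rows in groups of N_DST, dimension 1 the src1 columns, and
// dimension 2 the flattened batch:
//
//   size_t local[3]  = { N_SIMDWIDTH * N_SIMDGROUP, 1, 1 };
//   size_t global[3] = { ((ne01 + N_DST - 1) / N_DST) * N_SIMDWIDTH,
//                        ne1, num_batches }; // num_batches: e.g. ne12*ne13
//   clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global, local,
//                          0, NULL, NULL);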