#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif

#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif

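// q8_0 block layout: 32 signed 8-bit quants sharing one fp16 scale;
// the dequantized value of qs[i] is d * qs[i]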
#define QK8_0 32
typedef struct {
    half d;         // delta (per-block fp16 scale)
    char qs[QK8_0]; // quants
} block_q8_0;

#define NB_Q8_0 8 // number of quants each lane handles per block

#ifdef INTEL_GPU
#define N_R0_Q8_0 4 // number of rows each subgroup works on
#define N_SG_Q8_0 2 // number of subgroups in a work group
#define N_SIMDWIDTH 16 // subgroup size
#elif defined (ADRENO_GPU)
#define N_R0_Q8_0 4
#define N_SG_Q8_0 2
#define N_SIMDWIDTH 64
#endif

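// mul_mat_vec for q8_0 src0 against f32 src1: each work-group holds
// N_SG_Q8_0 subgroups, and each subgroup accumulates dot products for
// N_R0_Q8_0 rows of src0 against one row of src1 (selected by
// get_group_id(1)); get_group_id(2) selects the batch.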
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mv_q8_0_f32(
        global char * src0,
        ulong offset0,
        global char * src1,
        ulong offset1,
        global char * dst,
        ulong offsetd,
        int ne00,
        int ne01,
        ulong nb01,
        ulong nb02,
        ulong nb03,
        int ne12,
        ulong nb11,
        ulong nb12,
        ulong nb13,
        int ne0,
        int ne1,
        int r2,
        int r3
) {
    src0 = (global char*)((global char*)src0 + offset0);
    src1 = (global char*)((global char*)src1 + offset1);
    dst = (global char*)((global char*)dst + offsetd);

    int nb = ne00/QK8_0; // number of q8_0 blocks per row of src0

    int r0 = get_group_id(0);
    int r1 = get_group_id(1);
    int im = get_group_id(2);

    int first_row = (r0*N_SG_Q8_0 + get_sub_group_id()) * N_R0_Q8_0;

    // src1 batch indices; r2/r3 are the broadcast ratios between src1 and src0
    uint i12 = im%ne12;
    uint i13 = im/ne12;

    ulong offset_src1 = r1*nb11 + i12*nb12 + i13*nb13;
    global float * y = (global float *) (src1 + offset_src1);

    // pointers to src0 rows
    global block_q8_0 * ax[N_R0_Q8_0];
    for (int row = 0; row < N_R0_Q8_0; ++row) {
        ulong offset_src0 = (first_row + row)*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
        ax[row] = (global block_q8_0 *) ((global char *) src0 + offset_src0);
    }

    float yl[NB_Q8_0];
    float sumf[N_R0_Q8_0] = { 0.f };

    // within the subgroup, ix selects the block and il selects the
    // 8-quant slice inside that block
    const short ix = get_sub_group_local_id()/4;
    const short il = get_sub_group_local_id()%4;

    global float * yb = y + ix*QK8_0 + il*NB_Q8_0;

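    // each pass of the loop below covers N_SIMDWIDTH/4 consecutive blocks,
    // since a block's 32 quants are split across 4 lanes (8 quants per lane)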
    // each thread handles NB_Q8_0 quants at a time
    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/4) {
        for (short i = 0; i < NB_Q8_0; ++i) {
            yl[i] = yb[i];
        }

        for (short row = 0; row < N_R0_Q8_0; row++) {
            global char * qs = ax[row][ib].qs + il*NB_Q8_0;
            float sumq = 0.f;
            for (short iq = 0; iq < NB_Q8_0; ++iq) {
                sumq += qs[iq] * yl[iq];
            }
            sumf[row] += sumq*ax[row][ib].d;
        }

        yb += N_SIMDWIDTH*NB_Q8_0;
    }

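    // reduce the per-lane partial sums across the subgroup; lane 0 writes one
    // output element per row, skipping rows past the end of src0 (ne01)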
    global float * dst_f32 = (global float *) dst + (ulong)im*ne0*ne1 + (ulong)r1*ne0;

    for (int row = 0; row < N_R0_Q8_0; ++row) {
        float tot = sub_group_reduce_add(sumf[row]);

        if (get_sub_group_local_id() == 0 && first_row + row < ne01) {
            dst_f32[first_row + row] = tot;
        }
    }
}