#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif

#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif

#define QK4_0                   32
#define QR4_0                   2
#define QK4_1                   32
#define QR4_1                   2
#define QK5_0                   32
#define QR5_0                   2
#define QK5_1                   32
#define QR5_1                   2
#define QK8_0                   32
#define QR8_0                   1
#define QK_K                    256
#define K_QUANTS_PER_ITERATION  2

typedef char int8_t;
typedef uchar uint8_t;
typedef short int16_t;
typedef ushort uint16_t;
typedef int int32_t;
typedef uint uint32_t;

//------------------------------------------------------------------------------
// block_q4_0
//------------------------------------------------------------------------------
struct block_q4_0
{
    half d;
    uint8_t qs[QK4_0 / 2];
};
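
// A q4_0 block stores 32 4-bit quants behind a single fp16 scale: the low
// nibble of qs[j] holds quant j and the high nibble holds quant j + QK4_0/2,
// each biased by +8. The helper below is an illustrative sketch of that
// layout only; it is not called by the kernels in this file, which instead
// fold the bias and the nibble shifts into the dot product (see
// block_q_4_0_dot_y).
inline float dequantize_q4_0_weight(global struct block_q4_0 * b, int i) {
    uint8_t packed = b->qs[i % (QK4_0/2)];       // two quants per byte
    int q = (i < QK4_0/2) ? (packed & 0x0F)      // quants 0..15: low nibble
                          : (packed >> 4);       // quants 16..31: high nibble
    return (float) b->d * (q - 8);               // undo the +8 bias, apply scale
}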

//------------------------------------------------------------------------------
// mul_vec_q_n_f32
//------------------------------------------------------------------------------
// Computes the inner product between half a q4_0 block and 16 floats (yl);
// sumy is SUM(yl[i]).
// il indicates where the q4 quants begin (0 or QK4_0/4).
// The yl values are assumed to have been pre-multiplied by the scale factors
// (1, 1/16, 1/256, 1/4096) that stand in for the missing bit shifts.
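//
// Worked example of the trick: each ushort read below packs four nibbles at
// bit offsets 0, 4, 8 and 12. Masking with 0x0F00, say, leaves nibble q at
// the value q << 8 == q * 256; since the matching y entry was pre-divided by
// 256, yl * (qs & 0x0F00) == y * q with no shift needed. The +8 bias on the
// quants is handled once at the end: SUM y*(q-8) = SUM y*q - 8 * SUM y,
// which is why the result is d * (sumy * -8.f + acc.s0 + acc.s1).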
inline float block_q_4_0_dot_y(
        global struct block_q4_0 * qb_curr,
        float sumy,
        private float * yl,
        int il
) {
    float d = qb_curr->d;
    float2 acc = 0.f;
    global ushort * qs = ((global ushort *)qb_curr + 1 + il/2);
    for (int i = 0; i < 8; i+=2) {
        acc.s0 += yl[i + 0] * (qs[i / 2] & 0x000F)
                + yl[i + 1] * (qs[i / 2] & 0x0F00);
        acc.s1 += yl[i + 8] * (qs[i / 2] & 0x00F0)
                + yl[i + 9] * (qs[i / 2] & 0xF000);
    }
    return d * (sumy * -8.f + acc.s0 + acc.s1);
}
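
// An equivalent scalar formulation of the same half-block inner product,
// written against the raw y values instead of the pre-scaled yl cache. This
// is a sketch for documentation and validation only; the kernels below never
// call it.
inline float block_q_4_0_dot_y_ref(
        global struct block_q4_0 * qb_curr,
        global float * y,   // the 32 activations paired with this block
        int il              // 0 or QK4_0/4: which half-block this thread owns
) {
    float acc = 0.f;
    global uint8_t * qs = qb_curr->qs;
    for (int i = 0; i < 8; i++) {
        acc += y[il + i]      * ((qs[il + i] & 0x0F) - 8);  // low nibbles
        acc += y[il + i + 16] * ((qs[il + i] >> 4)   - 8);  // high nibbles
    }
    return (float) qb_curr->d * acc;
}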

#ifdef INTEL_GPU
#define N_DST 4 // each SIMD group works on 4 rows
#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
#define N_SIMDWIDTH 16 // assuming SIMD group size is 16
#elif defined (ADRENO_GPU)
#define N_DST 4
#define N_SIMDGROUP 1
#define N_SIMDWIDTH 64
#endif
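
// With either configuration a work-group holds N_SIMDGROUP subgroups of
// N_SIMDWIDTH threads, a subgroup produces N_DST consecutive output rows,
// and each thread consumes half a q4_0 block per iteration, so a subgroup
// strides through a row N_SIMDWIDTH/2 blocks at a time.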

inline void mul_vec_q_n_f32(
        global void * src0,
        global float * src1,
        global float * dst,
        int ne00,
        int ne01,
        int ne02,
        int ne10,
        int ne12,
        int ne0,
        int ne1,
        int r2,
        int r3
) {

    const ulong nb = ne00/QK4_0;

    int r0 = get_group_id(0);
    int r1 = get_group_id(1);
    int im = get_group_id(2);

    // (r0 * N_SIMDGROUP + get_sub_group_id()) is essentially the linear
    // global id of a SIMD group in the grid.
    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;

    int i12 = im%ne12;
    int i13 = im/ne12;

    ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);

    global struct block_q4_0 * x = (global struct block_q4_0 *) src0 + offset0;
    global float             * y = (global float             *) src1 + r1*ne10 + im*ne00*ne1;

    float yl[16];       // src1 vector cache
    float sumf[N_DST]={0.f};

    int ix = get_sub_group_local_id()/2;
    int il = 8*(get_sub_group_local_id()%2);

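    // Lanes pair up on a block: ix is the index of the first block this
    // thread touches (lane/2), and il selects which half of the block the
    // thread owns (even lanes start at quant 0, odd lanes at quant 8).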
    global float * yb = y + ix * QK4_0 + il;

    // each thread in a SIMD group deals with half a block.
    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
        float sumy = 0;
        for (int i = 0; i < 8; i += 2) {
            sumy += yb[i] + yb[i+1];
            yl[i+0] = yb[i+ 0];
            yl[i+1] = yb[i+ 1]/256.f;
            sumy += yb[i+16] + yb[i+17];
            yl[i+8] = yb[i+16]/16.f;
            yl[i+9] = yb[i+17]/4096.f;
        }

        for (int row = 0; row < N_DST; row++) {
            sumf[row] += block_q_4_0_dot_y(x+ib+row*nb, sumy, yl, il);
        }

        // One thread in a SIMD group (i.e., subgroup) handles half a block,
        // hence the entire SIMD group handles N_SIMDWIDTH/2 blocks per
        // iteration. Since y points into the activation matrix (of type
        // float), each thread must advance yb by N_SIMDWIDTH/2 blocks, i.e.
        // by QK4_0 * (N_SIMDWIDTH/2) floats, where QK4_0 is the block size.
        yb += QK4_0 * (N_SIMDWIDTH/2);
    }

    // Reducing sumf with a per-row loop over sub_group_reduce_add does not
    // work on Adreno - it produces incorrect results for row = 1, 2, 3 and
    // only row = 0 comes out right. Hence the explicit array initializer
    // below; if N_DST is changed, it must be updated accordingly.
    // This form also seems to perform better on Intel.
    float tot[N_DST] = {
        sub_group_reduce_add(sumf[0]), sub_group_reduce_add(sumf[1]),
        sub_group_reduce_add(sumf[2]), sub_group_reduce_add(sumf[3])};
    for (int row = 0; row < N_DST; ++row) {
        if (get_sub_group_local_id() == 0 && first_row + row < ne01) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot[row];
        }
    }
}

#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mat_q4_0_f32(
        global void * src0,
        ulong offset0,
        global float * src1,
        ulong offset1,
        global float * dst,
        ulong offsetd,
        int ne00,
        int ne01,
        int ne02,
        int ne10,
        int ne12,
        int ne0,
        int ne1,
        int r2,
        int r3
) {
    src0 = (global void*)((global char*)src0 + offset0);
    src1 = (global float*)((global char*)src1 + offset1);
    dst = (global float*)((global char*)dst + offsetd);

    mul_vec_q_n_f32(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
}
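
// Launch geometry implied by the indexing above (a host-side assumption,
// not enforced in this file): local size N_SIMDWIDTH * N_SIMDGROUP in
// dimension 0, with one work-group per N_DST rows of src0 in dimension 0,
// one per src1 row (r1) in dimension 1, and one per batch slice (im) in
// dimension 2.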