1#pragma OPENCL EXTENSION cl_khr_fp16 : enable
  2
  3#ifdef cl_intel_subgroups
  4#pragma OPENCL EXTENSION cl_intel_subgroups : enable
  5#else
  6#pragma OPENCL EXTENSION cl_khr_subgroups : enable
  7#endif
  8
  9#ifdef cl_intel_required_subgroup_size
 10#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
 11#define INTEL_GPU 1
 12#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
 13#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
 14#elif defined(cl_qcom_reqd_sub_group_size)
 15#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
 16#define ADRENO_GPU 1
 17#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
 18#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
 19#endif
 20
 21// Workgroup must be a subgroup
 22#ifdef INTEL_GPU
 23REQD_SUBGROUP_SIZE_32
 24#elif defined (ADRENO_GPU)
 25REQD_SUBGROUP_SIZE_64
 26#endif
 27kernel void kernel_group_norm(
 28        global float * src0,
 29        ulong offset0,
 30        global float * dst,
 31        ulong offsetd,
 32        int ne,
 33        int group_size,
 34        float eps
 35) {
 36    src0 = (global float  *)((global char *)src0 + offset0);
 37    dst  = (global float *)((global char *)dst  + offsetd);
 38
 39    int start = get_group_id(0) * group_size;
 40    int end   = start + group_size;
 41
 42    start += get_local_id(0);
 43
 44    if (end >= ne) {
 45        end = ne;
 46    }
 47
 48    float tmp = 0.0f;
 49
 50    for (int j = start; j < end; j += get_local_size(0)) {
 51        tmp += src0[j];
 52    }
 53
 54    tmp = sub_group_reduce_add(tmp);
 55
 56    const float mean = tmp / group_size;
 57    tmp = 0.0f;
 58
 59    for (int j = start; j < end; j += get_local_size(0)) {
 60        float xi = src0[j] - mean;
 61        dst[j] = xi;
 62        tmp += xi * xi;
 63    }
 64
 65    tmp = sub_group_reduce_add(tmp);
 66
 67    const float variance = tmp / group_size;
 68    const float scale = 1.0f/sqrt(variance + eps);
 69    for (int j = start; j < end; j += get_local_size(0)) {
 70        dst[j] *= scale;
 71    }
 72}
 73
 74//------------------------------------------------------------------------------
 75// group_norm_mul_add
 76//------------------------------------------------------------------------------
 77#ifdef INTEL_GPU
 78REQD_SUBGROUP_SIZE_32
 79#elif defined (ADRENO_GPU)
 80REQD_SUBGROUP_SIZE_64
 81#endif
 82kernel void kernel_group_norm_mul_add(
 83        global float * src0, ulong offset0,
 84        global float * src1, ulong offset1,
 85        global float * src2, ulong offset2,
 86        global float * dst, ulong offsetd,
 87        int ne,
 88        int group_size,
 89        float eps
 90) {
 91    src0 = (global float *)((global char *)src0 + offset0);
 92    src1 = (global float *)((global char *)src1 + offset1);
 93    src2 = (global float *)((global char *)src2 + offset2);
 94    dst  = (global float *)((global char *)dst  + offsetd);
 95
 96    int start = get_group_id(0) * group_size;
 97    int end = start + group_size;
 98    if (end > ne) {
 99        end = ne;
100    }
101
102    float sum = 0.0f;
103    float sum_sq = 0.0f;
104
105    for (int j = start + get_local_id(0); j < end; j += get_local_size(0)) {
106        float val = src0[j];
107        sum += val;
108        sum_sq += val*val;
109    }
110
111    sum = sub_group_reduce_add(sum);
112    sum_sq = sub_group_reduce_add(sum_sq);
113
114    const float mean = sum / group_size;
115    const float var = sum_sq / group_size - mean * mean;
116    const float scale = rsqrt(var + eps);
117
118    for (int j = start + get_local_id(0); j < end; j += get_local_size(0)) {
119        dst[j] = ((src0[j] - mean) * scale) * src1[j] + src2[j];
120    }
121}