1#pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
3#ifdef cl_intel_subgroups
4#pragma OPENCL EXTENSION cl_intel_subgroups : enable
5#else
6#pragma OPENCL EXTENSION cl_khr_subgroups : enable
7#endif
8
9#ifdef cl_intel_required_subgroup_size
10#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
11#define INTEL_GPU 1
12#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
13#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
14#elif defined(cl_qcom_reqd_sub_group_size)
15#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
16#define ADRENO_GPU 1
17#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
18#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
19#endif
20
21// Workgroup must be a subgroup
22#ifdef INTEL_GPU
23REQD_SUBGROUP_SIZE_32
24#elif defined (ADRENO_GPU)
25REQD_SUBGROUP_SIZE_64
26#endif
// Group normalization over one group of `group_size` contiguous elements of
// src0, writing normalized values to dst.
//
// Each workgroup (which must be a single subgroup — see REQD_SUBGROUP_SIZE
// above) handles one group, in three passes:
//   1. accumulate the mean,
//   2. center the values (stored to dst) and accumulate the variance,
//   3. scale dst in place by 1/sqrt(variance + eps).
//
// NOTE(review): mean and variance divide by `group_size` even when the last
// group is clamped to `ne`, so a truncated trailing group would be normalized
// with biased statistics — presumably ne is a multiple of group_size; confirm
// against callers.
kernel void kernel_group_norm(
        global float * src0,
        ulong offset0,
        global float * dst,
        ulong offsetd,
        int ne,          // total number of elements in src0/dst
        int group_size,  // number of elements per normalization group
        float eps        // numerical-stability term added to the variance
) {
    // Apply the byte offsets to obtain the effective base pointers.
    src0 = (global float *)((global char *)src0 + offset0);
    dst = (global float *)((global char *)dst + offsetd);

    int start = get_group_id(0) * group_size;
    int end = start + group_size;

    // Each work-item strides through the group starting at its local id.
    start += get_local_id(0);

    // Clamp the last group so we never read past the end of the tensor.
    // (`>` for consistency with kernel_group_norm_mul_add; the original
    // `>=` was behaviorally identical since end == ne needs no clamping.)
    if (end > ne) {
        end = ne;
    }

    // Pass 1: per-item partial sum, reduced across the subgroup -> mean.
    float tmp = 0.0f;

    for (int j = start; j < end; j += get_local_size(0)) {
        tmp += src0[j];
    }

    tmp = sub_group_reduce_add(tmp);

    const float mean = tmp / group_size;
    tmp = 0.0f;

    // Pass 2: center the values (written to dst) and accumulate squared
    // deviations for the variance.
    for (int j = start; j < end; j += get_local_size(0)) {
        float xi = src0[j] - mean;
        dst[j] = xi;
        tmp += xi * xi;
    }

    tmp = sub_group_reduce_add(tmp);

    const float variance = tmp / group_size;
    // rsqrt matches the sibling kernel and avoids the explicit division.
    const float scale = rsqrt(variance + eps);
    // Pass 3: scale the centered values in place.
    for (int j = start; j < end; j += get_local_size(0)) {
        dst[j] *= scale;
    }
}
73
74//------------------------------------------------------------------------------
75// group_norm_mul_add
76//------------------------------------------------------------------------------
77#ifdef INTEL_GPU
78REQD_SUBGROUP_SIZE_32
79#elif defined (ADRENO_GPU)
80REQD_SUBGROUP_SIZE_64
81#endif
// Fused group-norm + elementwise multiply + add:
//   dst = group_norm(src0) * src1 + src2
//
// One workgroup (a single subgroup — see REQD_SUBGROUP_SIZE above) processes
// one group of `group_size` contiguous elements, computing the variance with
// the single-pass E[x^2] - E[x]^2 formulation.
kernel void kernel_group_norm_mul_add(
        global float * src0, ulong offset0,
        global float * src1, ulong offset1,
        global float * src2, ulong offset2,
        global float * dst, ulong offsetd,
        int ne,
        int group_size,
        float eps
) {
    // Apply byte offsets to obtain the effective base pointers.
    src0 = (global float *)((global char *)src0 + offset0);
    src1 = (global float *)((global char *)src1 + offset1);
    src2 = (global float *)((global char *)src2 + offset2);
    dst = (global float *)((global char *)dst + offsetd);

    const int first = get_group_id(0) * group_size;
    int last = first + group_size;
    if (last > ne) {
        last = ne; // clamp the trailing group to the tensor size
    }

    // Per-item partial sums of x and x^2 over a strided slice of the group.
    float acc = 0.0f;
    float acc_sq = 0.0f;
    for (int idx = first + get_local_id(0); idx < last; idx += get_local_size(0)) {
        const float v = src0[idx];
        acc += v;
        acc_sq += v * v;
    }

    // Combine the partial sums across the subgroup.
    acc = sub_group_reduce_add(acc);
    acc_sq = sub_group_reduce_add(acc_sq);

    const float mean = acc / group_size;
    const float var = acc_sq / group_size - mean * mean;
    const float scale = rsqrt(var + eps);

    // Normalize, then apply the fused multiply and add.
    for (int idx = first + get_local_id(0); idx < last; idx += get_local_size(0)) {
        dst[idx] = ((src0[idx] - mean) * scale) * src1[idx] + src2[idx];
    }
}