1#pragma OPENCL EXTENSION cl_khr_fp16 : enable
2
// Element-wise hyperbolic tangent on float data.
//
// Each work-item handles exactly one element, selected by get_global_id(0):
//     dst[gid] = tanh(src0[gid])
//
// src0    - input buffer
// offset0 - byte offset added to src0 before indexing
// dst     - output buffer
// offsetd - byte offset added to dst before indexing
//
// NOTE(review): no bounds check is done here; presumably the host sizes the
// global range to the element count - confirm against the caller.
kernel void kernel_tanh_f32(
    global const float * src0,
    ulong offset0,
    global float * dst,
    ulong offsetd
) {
    // Offsets are expressed in bytes, so step through char pointers first.
    global const float * in  = (global const float *)((global const char *)src0 + offset0);
    global float       * out = (global float *)((global char *)dst + offsetd);

    const size_t gid = get_global_id(0);

    out[gid] = tanh(in[gid]);
}
14
// Element-wise hyperbolic tangent on float4 vectors.
//
// Each work-item handles one float4, selected by get_global_id(0):
//     dst[gid] = tanh(src0[gid])    (applied component-wise)
//
// src0    - input buffer, viewed as float4 elements
// offset0 - byte offset added to src0 before indexing
// dst     - output buffer, viewed as float4 elements
// offsetd - byte offset added to dst before indexing
//
// NOTE(review): no bounds check is done here; presumably the host launches
// one work-item per float4 - confirm against the caller.
kernel void kernel_tanh_f32_4(
    global const float4 * src0,
    ulong offset0,
    global float4 * dst,
    ulong offsetd
) {
    // Offsets are expressed in bytes, so step through char pointers first.
    global const float4 * in  = (global const float4 *)((global const char *)src0 + offset0);
    global float4       * out = (global float4 *)((global char *)dst + offsetd);

    const size_t gid = get_global_id(0);

    out[gid] = tanh(in[gid]);
}
26
// Element-wise hyperbolic tangent on half-precision data.
//
// Each work-item handles exactly one element, selected by get_global_id(0):
//     dst[gid] = tanh(src0[gid])
//
// src0    - input buffer of half values
// offset0 - byte offset added to src0 before indexing
// dst     - output buffer of half values
// offsetd - byte offset added to dst before indexing
//
// NOTE(review): no bounds check is done here; presumably the host sizes the
// global range to the element count - confirm against the caller.
kernel void kernel_tanh_f16(
    global const half * src0,
    ulong offset0,
    global half * dst,
    ulong offsetd
) {
    // Offsets are expressed in bytes, so step through char pointers first.
    global const half * in  = (global const half *)((global const char *)src0 + offset0);
    global half       * out = (global half *)((global char *)dst + offsetd);

    const size_t gid = get_global_id(0);

    out[gid] = tanh(in[gid]);
}
38
// Element-wise hyperbolic tangent on half4 vectors.
//
// Each work-item handles one half4, selected by get_global_id(0):
//     dst[gid] = tanh(src0[gid])    (applied component-wise)
//
// src0    - input buffer, viewed as half4 elements
// offset0 - byte offset added to src0 before indexing
// dst     - output buffer, viewed as half4 elements
// offsetd - byte offset added to dst before indexing
//
// NOTE(review): no bounds check is done here; presumably the host launches
// one work-item per half4 - confirm against the caller.
kernel void kernel_tanh_f16_4(
    global const half4 * src0,
    ulong offset0,
    global half4 * dst,
    ulong offsetd
) {
    // Offsets are expressed in bytes, so step through char pointers first.
    global const half4 * in  = (global const half4 *)((global const char *)src0 + offset0);
    global half4       * out = (global half4 *)((global char *)dst + offsetd);

    const size_t gid = get_global_id(0);

    out[gid] = tanh(in[gid]);
}
50
// Hyperbolic tangent on float data addressed through explicit byte strides.
//
// The work-group ids pick the three outer indices (group id 0 -> i1,
// group id 1 -> i2, group id 2 -> i3). Within a work-group, the work-items
// walk the innermost index from get_local_id(0) up to ne00 in steps of
// get_local_size(0).
//
// src0          - input buffer, addressed in bytes
// offset0       - byte offset added to src0
// dst           - output buffer, addressed in bytes
// offsetd       - byte offset added to dst
// ne00          - number of elements along the innermost index
// nb00..nb03    - byte strides of the input for indices 0..3
// nb0..nb3      - byte strides of the output for indices 0..3
kernel void kernel_tanh_f32_nc(
    global const char * src0,
    ulong offset0,
    global char * dst,
    ulong offsetd,
    int ne00,
    ulong nb00,
    ulong nb01,
    ulong nb02,
    ulong nb03,
    ulong nb0,
    ulong nb1,
    ulong nb2,
    ulong nb3
) {
    const int i1 = get_group_id(0);
    const int i2 = get_group_id(1);
    const int i3 = get_group_id(2);

    // Start of the row this work-group handles; invariant across the loop.
    global const char * src_row = src0 + offset0 + i3*nb03 + i2*nb02 + i1*nb01;
    global char       * dst_row = dst  + offsetd + i3*nb3  + i2*nb2  + i1*nb1;

    for (int i0 = get_local_id(0); i0 < ne00; i0 += get_local_size(0)) {
        global const float * in  = (global const float *)(src_row + i0*nb00);
        global float       * out = (global float *)(dst_row + i0*nb0);

        out[0] = tanh(in[0]);
    }
}
80
// Hyperbolic tangent on half-precision data addressed through explicit byte
// strides.
//
// The work-group ids pick the three outer indices (group id 0 -> i1,
// group id 1 -> i2, group id 2 -> i3). Within a work-group, the work-items
// walk the innermost index from get_local_id(0) up to ne00 in steps of
// get_local_size(0).
//
// src0          - input buffer, addressed in bytes
// offset0       - byte offset added to src0
// dst           - output buffer, addressed in bytes
// offsetd       - byte offset added to dst
// ne00          - number of elements along the innermost index
// nb00..nb03    - byte strides of the input for indices 0..3
// nb0..nb3      - byte strides of the output for indices 0..3
kernel void kernel_tanh_f16_nc(
    global const char * src0,
    ulong offset0,
    global char * dst,
    ulong offsetd,
    int ne00,
    ulong nb00,
    ulong nb01,
    ulong nb02,
    ulong nb03,
    ulong nb0,
    ulong nb1,
    ulong nb2,
    ulong nb3
) {
    const int i1 = get_group_id(0);
    const int i2 = get_group_id(1);
    const int i3 = get_group_id(2);

    // Start of the row this work-group handles; invariant across the loop.
    global const char * src_row = src0 + offset0 + i3*nb03 + i2*nb02 + i1*nb01;
    global char       * dst_row = dst  + offsetd + i3*nb3  + i2*nb2  + i1*nb1;

    for (int i0 = get_local_id(0); i0 < ne00; i0 += get_local_size(0)) {
        global const half * in  = (global const half *)(src_row + i0*nb00);
        global half       * out = (global half *)(dst_row + i0*nb0);

        out[0] = tanh(in[0]);
    }
}