1#pragma OPENCL EXTENSION cl_khr_fp16 : enable
  2
  3kernel void kernel_tanh_f32(
  4        global const float * src0,
  5        ulong                offset0,
  6        global       float * dst,
  7        ulong                offsetd
  8) {
  9    src0 = (global float*)((global char*)src0 + offset0);
 10    dst  = (global float*)((global char*)dst + offsetd);
 11
 12    dst[get_global_id(0)] = tanh(src0[get_global_id(0)]);
 13}
 14
 15kernel void kernel_tanh_f32_4(
 16        global const float4 * src0,
 17        ulong                 offset0,
 18        global       float4 * dst,
 19        ulong                 offsetd
 20) {
 21    src0 = (global float4*)((global char*)src0 + offset0);
 22    dst  = (global float4*)((global char*)dst + offsetd);
 23
 24    dst[get_global_id(0)] = tanh(src0[get_global_id(0)]);
 25}
 26
 27kernel void kernel_tanh_f16(
 28        global const half * src0,
 29        ulong               offset0,
 30        global       half * dst,
 31        ulong               offsetd
 32) {
 33    src0 = (global half*)((global char*)src0 + offset0);
 34    dst  = (global half*)((global char*)dst + offsetd);
 35
 36    dst[get_global_id(0)] = tanh(src0[get_global_id(0)]);
 37}
 38
 39kernel void kernel_tanh_f16_4(
 40        global const half4 * src0,
 41        ulong                offset0,
 42        global       half4 * dst,
 43        ulong                offsetd
 44) {
 45    src0 = (global half4*)((global char*)src0 + offset0);
 46    dst  = (global half4*)((global char*)dst + offsetd);
 47
 48    dst[get_global_id(0)] = tanh(src0[get_global_id(0)]);
 49}
 50
 51kernel void kernel_tanh_f32_nc(
 52        global const char * src0,
 53        ulong               offset0,
 54        global       char * dst,
 55        ulong               offsetd,
 56        int   ne00,
 57        ulong nb00,
 58        ulong nb01,
 59        ulong nb02,
 60        ulong nb03,
 61        ulong nb0,
 62        ulong nb1,
 63        ulong nb2,
 64        ulong nb3
 65) {
 66    src0 = src0 + offset0;
 67    dst  = dst + offsetd;
 68
 69    const int i3 = get_group_id(2);
 70    const int i2 = get_group_id(1);
 71    const int i1 = get_group_id(0);
 72
 73    for (int i0 = get_local_id(0); i0 < ne00; i0 += get_local_size(0)) {
 74        global const float * x = (global const float *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
 75        global       float * y = (global       float *)(dst  + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
 76
 77        *y = tanh(*x);
 78    }
 79}
 80
 81kernel void kernel_tanh_f16_nc(
 82        global const char * src0,
 83        ulong               offset0,
 84        global       char * dst,
 85        ulong               offsetd,
 86        int   ne00,
 87        ulong nb00,
 88        ulong nb01,
 89        ulong nb02,
 90        ulong nb03,
 91        ulong nb0,
 92        ulong nb1,
 93        ulong nb2,
 94        ulong nb3
 95) {
 96    src0 = src0 + offset0;
 97    dst  = dst + offsetd;
 98
 99    const int i3 = get_group_id(2);
100    const int i2 = get_group_id(1);
101    const int i1 = get_group_id(0);
102
103    for (int i0 = get_local_id(0); i0 < ne00; i0 += get_local_size(0)) {
104        global const half * x = (global const half *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
105        global       half * y = (global       half *)(dst  + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
106
107        *y = tanh(*x);
108    }
109}