1#pragma OPENCL EXTENSION cl_khr_fp16 : enable
 2
 3kernel void kernel_im2col_f32(
 4        global float * src1,
 5        ulong offset1,
 6        global float * dst,
 7        ulong offsetd,
 8        ulong batch_offset,
 9        ulong delta_offset,
10        long IW,
11        long IH,
12        long IC,
13        long OW,
14        long OH,
15        long KW,
16        long KH,
17        long pelements,
18        long CHW,
19        int  s0,
20        int  s1,
21        int  p0,
22        int  p1,
23        int  d0,
24        int  d1
25) {
26    long i = get_global_id(0);
27    if (i >= pelements) {
28        return;
29    }
30
31    src1 = (global float*)((global char*)src1 + offset1);
32    dst = (global float*)((global char*)dst + offsetd);
33
34    long  ksize = OW * KH;
35    long  kx = i / ksize;
36    long  kd = kx * ksize;
37    long  ky = (i - kd) / OW;
38    long  ix = i % OW;
39
40    long  oh = get_group_id(1);
41    long  batch = get_group_id(2) / IC;
42    long  ic = get_group_id(2) % IC;
43
44    long iiw = ix * s0 + kx * d0 - p0;
45    long iih = oh * s1 + ky * d1 - p1;
46
47    long offset_dst =
48        ((batch * OH + oh) * OW + ix) * CHW +
49        (ic * (KW * KH) + ky * KW + kx);
50
51    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
52        dst[offset_dst] = 0.0f;
53    } else {
54        long offset_src = ic * delta_offset + batch * batch_offset;
55        dst[offset_dst] = src1[offset_src + iih * IW + iiw];
56    }
57}