/*
 * kernel_repeat_f32 - fill dst with f32 elements read from src0, wrapping
 * every source index modulo the source extent so the source pattern repeats.
 *
 * Work distribution:
 *   - work-group ids (0, 1, 2) select the destination indices (i1, i2, i3);
 *   - the work-items of a group share the innermost index i0 in [0, ne0),
 *     each starting at its local id and advancing by the local size.
 *
 * Parameters:
 *   src0, offset0   source base pointer and byte offset added to it
 *   dst,  offsetd   destination base pointer and byte offset added to it
 *   ne00..ne03      source extents used as the modulus for each index
 *   nb00..nb03      source strides in bytes (applied to a char pointer)
 *   ne0             number of destination elements along the innermost index
 *   nb0..nb3        destination strides in bytes (applied to a char pointer)
 *
 * NOTE(review): ne00..ne03 are used as divisors and are not checked here;
 * the caller presumably guarantees they are non-zero - confirm on the host.
 * NOTE(review): i1..i3 are not bounds-checked; the launch grid is presumably
 * sized to the destination extents - confirm on the host.
 * NOTE(review): elements are accessed as float through byte-strided
 * pointers, so offsets and strides are assumed to keep 4-byte alignment.
 */
kernel void kernel_repeat_f32(
        global const char * src0,
        ulong               offset0,
        global       char * dst,
        ulong               offsetd,
        int     ne00,
        int     ne01,
        int     ne02,
        int     ne03,
        ulong   nb00,
        ulong   nb01,
        ulong   nb02,
        ulong   nb03,
        int     ne0,
        ulong   nb0,
        ulong   nb1,
        ulong   nb2,
        ulong   nb3
) {
    // Apply the byte offsets to the base pointers.
    src0 = src0 + offset0;
    dst  = dst  + offsetd;

    // Destination indices come from the work-group ids.
    const int i3 = get_group_id(2);
    const int i2 = get_group_id(1);
    const int i1 = get_group_id(0);

    // Wrap each destination index into the source extents.
    const int i03 = i3%ne03;
    const int i02 = i2%ne02;
    const int i01 = i1%ne01;

    // Row base addresses; all strides are in bytes.
    global const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
    global       char * dst_ptr  = dst  +  i3*nb3  +  i2*nb2  +  i1*nb1;

    // Loop-invariant work-item geometry, queried once. The stride is kept as
    // size_t so the index update uses the same arithmetic as `i0 += get_local_size(0)`.
    const int    first  = get_local_id(0);
    const size_t stride = get_local_size(0);

    for (int i0 = first; i0 < ne0; i0 += stride) {
        // Wrap the innermost index into the source row.
        const int i00 = i0%ne00;

        // Read through a const-qualified pointer: the source is never written.
        const float value = *((global const float *)(src0_ptr + i00*nb00));

        *((global float *)(dst_ptr + i0*nb0)) = value;
    }
}