1#version 450
  2
  3#extension GL_EXT_shader_16bit_storage : require
  4#extension GL_EXT_control_flow_attributes : require
  5#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
  6
  7#include "rte.glsl"
  8#include "types.glsl"
  9
// Push-constant block read by main(). Member order and types define the
// layout the host must match, so do not reorder.
// NOTE(review): the dimension names below (N, IC, ID/IH/IW, KD/KH/KW,
// OD/OH/OW) presumably mean batch, input channels, input/kernel/output
// depth/height/width -- confirm against the host-side dispatch code.
layout (push_constant) uniform parameter
{
    // Base address for the destination; only used when BDA is enabled, where
    // D_SIZE * offset_dst is added to it to form the write address.
    BDA_STORAGE_T dst_addr;
    // Source strides, applied directly to data_a[] indices (i.e. in elements):
    // nb10 scales iiw, nb11 scales iih, nb12 scales iid, nb13 scales (in_*IC + iic).
    uint32_t nb10;
    uint32_t nb11;
    uint32_t nb12;
    uint32_t nb13;
    // Multipliers applied to the output coordinates (iow, ioh, iod).
    uint32_t s0;
    uint32_t s1;
    uint32_t s2;
    // Amounts subtracted from the computed input coordinates (iiw, iih, iid).
    uint32_t p0;
    uint32_t p1;
    uint32_t p2;
    // Multipliers applied to the kernel coordinates (ikw, ikh, ikd).
    uint32_t d0;
    uint32_t d1;
    uint32_t d2;
    // Exclusive upper bounds for iiw, iih and iid; coordinates at or beyond
    // these produce a zero in the destination.
    uint32_t IW;
    uint32_t IH;
    uint32_t ID;
    // Multiplied with in_ when forming the source index.
    uint32_t IC;
    // Divisors / moduli used to split the x invocation index into
    // (iic, ikd, ikh, ikw) and the z index into (in_, iod, ioh).
    uint32_t KW;
    uint32_t OH;
    uint32_t KD_KH_KW;
    uint32_t KH_KW;
    // Number of valid x invocations; also the destination stride for iow.
    uint32_t IC_KD_KH_KW;
    // Exclusive upper bound of the z loop in main().
    uint32_t N_OD_OH;
    uint32_t OD_OH;
    // Destination strides for in_, iod and ioh respectively.
    uint32_t OD_OH_OW_IC_KD_KH_KW;
    uint32_t OH_OW_IC_KD_KH_KW;
    uint32_t OW_IC_KD_KH_KW;
    // Two packed 16-bit offsets: the high half is added to data_a indices
    // (get_aoffset), the low half to data_d indices (get_doffset).
    uint32_t misalign_offsets;
} p;
 42
 43uint get_aoffset() { return p.misalign_offsets >> 16; }
 44uint get_doffset() { return p.misalign_offsets & 0xFFFF; }
 45
// Specialization constant 0 (default 32). The workgroup's x size below is
// bound to the same specialization id.
layout(constant_id = 0) const uint BLOCK_SIZE = 32;

// Workgroup size: x comes from specialization constant 0, y and z are fixed at 1.
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;

// Source buffer (read-only) and destination buffer (write-only).
// A_TYPE and D_TYPE are not defined in this file.
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};

#if BDA
// Buffer-reference view of a single destination element; main() uses it to
// write through an address computed from p.dst_addr instead of via data_d.
layout (buffer_reference) buffer D_ptr {D_TYPE d;};
#endif
 56
 57void main() {
 58    const uint32_t i = gl_GlobalInvocationID.x;
 59
 60    uint32_t nb10 = p.nb10;
 61    uint32_t nb11 = p.nb11;
 62    uint32_t nb12 = p.nb12;
 63    uint32_t nb13 = p.nb13;
 64    uint32_t s0 = p.s0;
 65    uint32_t s1 = p.s1;
 66    uint32_t s2 = p.s2;
 67    uint32_t p0 = p.p0;
 68    uint32_t p1 = p.p1;
 69    uint32_t p2 = p.p2;
 70    uint32_t d0 = p.d0;
 71    uint32_t d1 = p.d1;
 72    uint32_t d2 = p.d2;
 73    uint32_t IW = p.IW;
 74    uint32_t IH = p.IH;
 75    uint32_t ID = p.ID;
 76    uint32_t IC = p.IC;
 77    uint32_t KW = p.KW;
 78    uint32_t OH = p.OH;
 79    uint32_t KD_KH_KW = p.KD_KH_KW;
 80    uint32_t KH_KW = p.KH_KW;
 81    uint32_t IC_KD_KH_KW = p.IC_KD_KH_KW;
 82    uint32_t N_OD_OH = p.N_OD_OH;
 83    uint32_t OD_OH = p.OD_OH;
 84    uint32_t OD_OH_OW_IC_KD_KH_KW = p.OD_OH_OW_IC_KD_KH_KW;
 85    uint32_t OH_OW_IC_KD_KH_KW = p.OH_OW_IC_KD_KH_KW;
 86    uint32_t OW_IC_KD_KH_KW = p.OW_IC_KD_KH_KW;
 87
 88    if (i >= IC_KD_KH_KW) {
 89        return;
 90    }
 91
 92    const uint32_t iic = i / KD_KH_KW;
 93    const uint32_t ikd = (i - iic * KD_KH_KW) / KH_KW;
 94    const uint32_t ikh = (i - iic * KD_KH_KW - ikd * KH_KW) / KW;
 95    const uint32_t ikw = i % KW;
 96
 97    const uint32_t iow = gl_GlobalInvocationID.y;
 98    for (uint32_t iz = gl_GlobalInvocationID.z; iz < N_OD_OH; iz += gl_NumWorkGroups.z) {
 99        const uint32_t in_ = iz / OD_OH;
100        const uint32_t iod = (iz - in_*OD_OH) / OH;
101        const uint32_t ioh = iz % OH;
102
103        const uint32_t iiw = iow * s0 + ikw * d0 - p0;
104        const uint32_t iih = ioh * s1 + ikh * d1 - p1;
105        const uint32_t iid = iod * s2 + ikd * d2 - p2;
106
107        const BDA_OFFSET_T offset_dst = BDA_OFFSET_T(in_)*OD_OH_OW_IC_KD_KH_KW + BDA_OFFSET_T(iod)*OH_OW_IC_KD_KH_KW + BDA_OFFSET_T(ioh)*OW_IC_KD_KH_KW + BDA_OFFSET_T(iow)*IC_KD_KH_KW + iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw;
108
109        const uint32_t offset_src = (in_*IC + iic)*nb13 + iid*nb12 + iih*nb11 + iiw*nb10;
110#if BDA
111        D_ptr dst_addr = D_ptr(p.dst_addr + D_SIZE * offset_dst);
112        if (iih >= IH || iiw >= IW || iid >= ID) {
113            dst_addr.d = D_TYPE(0.0f);
114        } else {
115            dst_addr.d = D_TYPE(data_a[offset_src + get_aoffset()]);
116        }
117#else
118        if (iih >= IH || iiw >= IW || iid >= ID) {
119            data_d[offset_dst + get_doffset()] = D_TYPE(0.0f);
120        } else {
121            data_d[offset_dst + get_doffset()] = D_TYPE(data_a[offset_src + get_aoffset()]);
122        }
123#endif
124    }
125}