#version 450

// Provides the [[unroll]] loop attribute used on the fixed-trip-count loops in main().
#extension GL_EXT_control_flow_attributes : require

// Number of invocations per workgroup along x. main() also uses this value as
// the head size, so each invocation handles one element (and one state row) of a head.
#define BLOCK_SIZE 64
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
 7
 8layout(push_constant) uniform Parameters {
 9    uint B;
10    uint T;
11    uint C;
12    uint H;
13};
14
// A_TYPE is not defined in this file; it is presumably injected by the build
// as a scalar float type (values are packed into vec4 in main()) - confirm.
//
// Per-time-step inputs, indexed by main() with the same element index t.
layout(binding = 0) readonly buffer RBuf { A_TYPE r[]; };
layout(binding = 1) readonly buffer WBuf { A_TYPE w[]; };
layout(binding = 2) readonly buffer KBuf { A_TYPE k[]; };
layout(binding = 3) readonly buffer VBuf { A_TYPE v[]; };
layout(binding = 4) readonly buffer ABuf { A_TYPE a[]; };
layout(binding = 5) readonly buffer BBuf { A_TYPE b[]; };
// Initial recurrent state: per batch, per head, head_size rows of head_size elements.
layout(binding = 6) readonly buffer StateBuf { A_TYPE state_in[]; };
// Output: per-step results at dst[t], followed by the final state starting at
// element offset T * C (same layout as state_in).
layout(binding = 7) buffer DstBuf { A_TYPE dst[]; };
23
// Workgroup-shared copies of the current time step's r/w/k/a/b values for one
// head. Each invocation writes slot [tid]; every invocation then reads all slots.
shared A_TYPE _r[BLOCK_SIZE], _w[BLOCK_SIZE], _k[BLOCK_SIZE], _a[BLOCK_SIZE], _b[BLOCK_SIZE];
25
// One workgroup processes one (batch, head) pair; invocation `tid` owns row
// `tid` of that head's state matrix, kept in a private array.
//
// For every time step of the batch the invocation computes, over the head's
// BLOCK_SIZE elements j:
//     sa       = sum_j state[j] * a[j]
//     state[j] = state[j] * w[j] + k[j] * v_val + sa * b[j]
//     y        = sum_j r[j] * state[j]          (using the updated state)
// where v_val is this invocation's own v value for the step. y is written to
// dst[t]; after the last step the final state row is written behind the
// per-step outputs, at element offset T * C.
void main() {
    const uint head_size = BLOCK_SIZE;
    const uint batch_id = gl_WorkGroupID.x / H;
    const uint head_id = gl_WorkGroupID.x % H;
    const uint tid = gl_LocalInvocationID.x;

    // The condition depends only on the workgroup ID and push constants, so the
    // whole workgroup leaves together and the barrier() calls below stay uniform.
    // Checked before T / B so that the division is never evaluated with B == 0.
    if (batch_id >= B || head_id >= H) {
        return;
    }

    const uint state_size = C * head_size;
    const uint n_seq_tokens = T / B;

    // Offset of this invocation's state row; identical in state_in and in the
    // state region of dst.
    const uint state_base = batch_id * state_size + head_id * head_size * head_size
                          + tid * head_size;

    A_TYPE state[BLOCK_SIZE];
    [[unroll]] for (uint i = 0; i < head_size; i++) {
        state[i] = state_in[state_base + i];
    }

    // Element index of this invocation's value in the first time step of the
    // batch, and the matching index one past the batch's last time step.
    const uint start_t = batch_id * n_seq_tokens * C + head_id * head_size + tid;
    const uint end_t = (batch_id + 1) * n_seq_tokens * C + head_id * head_size + tid;

    // Advance one time step per iteration (stride C elements).
    for (uint t = start_t; t < end_t; t += C) {
        // Wait until every invocation has finished reading the previous step's
        // shared values before they are overwritten.
        barrier();
        _r[tid] = r[t];
        _w[tid] = w[t];
        _k[tid] = k[t];
        _a[tid] = a[t];
        _b[tid] = b[t];
        // Make the freshly written shared values visible to the whole workgroup.
        barrier();

        // sa = dot(state row, a), accumulated four elements at a time.
        A_TYPE sa = 0.0;
        [[unroll]] for (uint j = 0; j < head_size; j += 4) {
            vec4 s_vec = vec4(state[j], state[j+1], state[j+2], state[j+3]);
            vec4 a_vec = vec4(_a[j], _a[j+1], _a[j+2], _a[j+3]);
            sa += dot(s_vec, a_vec);
        }

        const A_TYPE v_val = v[t];
        A_TYPE y = 0.0;

        // Update the state row and accumulate the output against the updated values.
        [[unroll]] for (uint j = 0; j < head_size; j += 4) {
            vec4 r_vec = vec4(_r[j], _r[j+1], _r[j+2], _r[j+3]);
            vec4 w_vec = vec4(_w[j], _w[j+1], _w[j+2], _w[j+3]);
            vec4 k_vec = vec4(_k[j], _k[j+1], _k[j+2], _k[j+3]);
            vec4 b_vec = vec4(_b[j], _b[j+1], _b[j+2], _b[j+3]);
            vec4 s_vec = vec4(state[j], state[j+1], state[j+2], state[j+3]);

            vec4 kv = k_vec * v_val;
            s_vec = s_vec * w_vec + kv + sa * b_vec;
            y += dot(r_vec, s_vec);

            state[j] = s_vec.x;
            state[j+1] = s_vec.y;
            state[j+2] = s_vec.z;
            state[j+3] = s_vec.w;
        }

        dst[t] = y;
    }

    // Write the final state row after the T * C per-step outputs.
    [[unroll]] for (uint i = 0; i < head_size; i++) {
        dst[T * C + state_base + i] = state[i];
    }
}