diff options
| author | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
|---|---|---|
| committer | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
| commit | b333b06772c89d96aacb5490d6a219fba7c09cc6 (patch) | |
| tree | 211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp | |
| download | llmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz | |
Engage!
Diffstat (limited to 'llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp')
| -rw-r--r-- | llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp | 47 |
1 files changed, 47 insertions, 0 deletions
diff --git a/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp b/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp new file mode 100644 index 0000000..13ba2e9 --- /dev/null +++ b/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp | |||
| @@ -0,0 +1,47 @@ | |||
#version 450

#include "types.glsl"
#include "sum_rows.glsl"

#extension GL_EXT_control_flow_attributes : enable

// One-dimensional workgroup. The x extent is taken from specialization id 0,
// the same id BLOCK_SIZE below is bound to, so a value specialized by the
// host applies to both.
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;

// Storage buffers: binding 0 is only read, binding 1 is only written.
// A_TYPE / D_TYPE are not defined in this file; presumably they come from the
// build or from types.glsl - confirm.
layout (binding = 0) readonly buffer A { A_TYPE data_a[]; };
layout (binding = 1) writeonly buffer D { D_TYPE data_d[]; };

// Stride of the per-row accumulation loop and size of the scratch array.
// Falls back to 32 when specialization id 0 is not set.
layout (constant_id = 0) const uint BLOCK_SIZE = 32;

// Workgroup-shared scratch: one partial-sum slot per invocation index.
shared FLOAT_TYPE tmp[BLOCK_SIZE];

// Sums one row of data_a and writes the sum, multiplied by p.weight, to data_d.
// The row is selected purely by the workgroup id; the invocations of the
// workgroup each accumulate a strided slice of the row and then combine their
// partial sums through the shared array `tmp`.
void main() {
    // Flatten the 3D workgroup id into a row number (262144 == 512 * 512).
    // NOTE(review): nothing here rejects a row number past the last row;
    // presumably the host never dispatches surplus workgroups - confirm
    // against the dispatch code.
    const uint flat_row = gl_WorkGroupID.x + gl_WorkGroupID.y * 512 + gl_WorkGroupID.z * 262144;
    const uint lane = gl_LocalInvocationID.x;
    const float weight = p.weight;

    // Split the flat row number into (i01, i02, i03) coordinates.
    // fastdiv is defined elsewhere; presumably it divides by ne01*ne02 and by
    // ne01 using the precomputed (mp, L) pairs - confirm in sum_rows.glsl.
    const uint i03 = fastdiv(flat_row, p.ne0_12mp, p.ne0_12L);
    const uint rows_before_i03 = i03 * p.ne01*p.ne02;
    const uint row_in_i03 = flat_row - rows_before_i03;
    const uint i02 = fastdiv(row_in_i03, p.ne0_1mp, p.ne0_1L);
    const uint i01 = row_in_i03 - i02*p.ne01;

    // Start of the source row and position of the destination element.
    const uint a_base = get_aoffset() + i01 * p.nb01 + i02 * p.nb02 + i03 * p.nb03;
    const uint d_pos = get_doffset() + i01 * p.nb11 + i02 * p.nb12 + i03 * p.nb13;

    // Each invocation adds up elements lane, lane + BLOCK_SIZE, ... of the row.
    FLOAT_TYPE partial = FLOAT_TYPE(0.0);
    for (uint c = lane; c < p.n_cols; c += BLOCK_SIZE) {
        partial += FLOAT_TYPE(data_a[a_base + c]);
    }
    tmp[lane] = partial;

    // Every partial sum must be in `tmp` before any invocation reads a neighbour's slot.
    barrier();

    // Pairwise reduction: each step folds the upper half of the active range
    // into the lower half. Ends with the full sum in tmp[0], assuming
    // BLOCK_SIZE is a power of two.
    [[unroll]] for (uint stride = BLOCK_SIZE >> 1; stride > 0; stride >>= 1) {
        if (lane < stride) {
            tmp[lane] += tmp[lane + stride];
        }
        barrier();
    }

    // A single invocation publishes the scaled result.
    if (lane == 0) {
        data_d[d_pos] = D_TYPE(tmp[0] * weight);
    }
}
