summaryrefslogtreecommitdiff
path: root/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp
diff options
context:
space:
mode:
Diffstat (limited to 'llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp')
-rw-r--r--llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp456
1 files changed, 456 insertions, 0 deletions
diff --git a/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp b/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp
new file mode 100644
index 0000000..775e9a7
--- /dev/null
+++ b/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp
@@ -0,0 +1,456 @@
+#version 450
+
+#extension GL_EXT_control_flow_attributes : enable
+#extension GL_EXT_shader_16bit_storage : require
+
+#ifdef FLOAT16
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+#endif
+#if defined(DATA_A_IQ1_M)
+#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
+#endif
+
+#if defined(DATA_A_BF16) && defined(COOPMAT)
+#extension GL_EXT_bfloat16 : enable
+#endif
+
+#ifdef COOPMAT
+#extension GL_KHR_cooperative_matrix : enable
+#extension GL_KHR_memory_scope_semantics : enable
+#endif
+
+#if defined(COOPMAT) || defined(MUL_MAT_ID_USE_SUBGROUPS)
+#extension GL_KHR_shader_subgroup_basic : enable
+#extension GL_KHR_shader_subgroup_ballot : enable
+#endif
+
+#ifdef MUL_MAT_ID
+#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require
+#endif
+
+#include "types.glsl"
+
+#ifndef LOAD_VEC_A
+#define LOAD_VEC_A 1
+#endif
+#ifndef LOAD_VEC_B
+#define LOAD_VEC_B 1
+#endif
+
+// Load 2 values at once without affecting index calculations through LOAD_VEC
+#if (defined(DATA_A_F32) || defined(DATA_A_F16) || defined(DATA_A_BF16)) && !defined(ALIGNED)
+#define LOAD_VEC_BATCH_A 2
+#else
+#define LOAD_VEC_BATCH_A 1
+#endif
+#if !defined(ALIGNED)
+#define LOAD_VEC_BATCH_B 2
+#else
+#define LOAD_VEC_BATCH_B 1
+#endif
+
+#if !defined(TO_FLOAT_TYPE)
+#define TO_FLOAT_TYPE FLOAT_TYPE
+#endif
+
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
+#if defined(A_TYPE_PACKED16)
+layout (binding = 0) readonly buffer A_PACKED16 {A_TYPE_PACKED16 data_a_packed16[];};
+#endif
+#if defined(A_TYPE_PACKED32)
+layout (binding = 0) readonly buffer A_PACKED32 {A_TYPE_PACKED32 data_a_packed32[];};
+#endif
+
+layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
+layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
+
+#ifdef MUL_MAT_ID
+layout (binding = 3) readonly buffer IDS {int data_ids[];};
+layout (binding = 4) readonly buffer Counts {int data_expert_count[];};
+#endif
+
+layout (push_constant) uniform parameter
+{
+ uint M;
+ uint N;
+ uint K;
+ uint stride_a;
+ uint stride_b;
+ uint stride_d;
+
+ uint batch_stride_a;
+ uint batch_stride_b;
+ uint batch_stride_d;
+
+#ifdef MUL_MAT_ID
+ uint nei0;
+ uint nei1;
+ uint nbi1;
+ uint ne11;
+#else
+ uint k_split;
+ uint ne02;
+ uint ne12;
+ uint broadcast2;
+ uint broadcast3;
+#endif
+} p;
+
+layout (constant_id = 0) const uint BLOCK_SIZE = 64;
+layout (constant_id = 1) const uint BM = 64;
+layout (constant_id = 2) const uint BN = 64;
+layout (constant_id = 4) const uint WM = 32;
+layout (constant_id = 5) const uint WN = 32;
+layout (constant_id = 6) const uint WMITER = 2;
+layout (constant_id = 7) const uint TM = 4;
+layout (constant_id = 8) const uint TN = 2;
+layout (constant_id = 9) const uint TK = 1; // Only needed for coopmat
+layout (constant_id = 10) const uint WARP = 32;
+
+#if defined(DATA_A_F32) || defined(DATA_A_F16)
+#define BK 32
+#define BK_STEP 4
+#else
+layout (constant_id = 3) const uint BK = 16; // Assumed to be 32 if working with a quant
+#define BK_STEP 2
+#endif
+
+#ifdef COOPMAT
+#define SHMEM_STRIDE (BK / 2 + 4)
+#else
+#define SHMEM_STRIDE (BK / 2 + 1)
+#endif
+
+shared FLOAT_TYPE_VEC2 buf_a[BM * SHMEM_STRIDE];
+shared FLOAT_TYPE_VEC2 buf_b[BN * SHMEM_STRIDE];
+
+#define NUM_WARPS (BLOCK_SIZE / WARP)
+
+#ifdef COOPMAT
+shared ACC_TYPE coopmat_stage[TM * TN * NUM_WARPS];
+#endif
+
+#include "mul_mm_id_funcs.glsl"
+#include "mul_mm_funcs.glsl"
+
+void main() {
+ const uint ic = gl_WorkGroupID.y;
+
+#ifdef MUL_MAT_ID
+ const uint expert_idx = gl_GlobalInvocationID.z;
+ if (ic * BN >= data_expert_count[expert_idx]) {
+ return;
+ }
+#endif
+#ifdef NEEDS_INIT_IQ_SHMEM
+ init_iq_shmem(gl_WorkGroupSize);
+#endif
+
+#ifndef MUL_MAT_ID
+ const uint batch_idx = gl_GlobalInvocationID.z;
+
+ const uint i13 = batch_idx / p.ne12;
+ const uint i12 = batch_idx % p.ne12;
+
+ const uint i03 = i13 / p.broadcast3;
+ const uint i02 = i12 / p.broadcast2;
+
+ const uint batch_idx_a = i03 * p.ne02 + i02;
+#endif
+
+ const uint blocks_m = (p.M + BM - 1) / BM;
+ const uint ir = gl_WorkGroupID.x % blocks_m;
+ const uint ik = gl_WorkGroupID.x / blocks_m;
+
+ const uint WNITER = (WM * WN) / (WARP * TM * TN * WMITER);
+ const uint WSUBM = WM / WMITER;
+ const uint WSUBN = WN / WNITER;
+
+#ifdef COOPMAT
+ const uint warp_i = gl_SubgroupID;
+
+ const uint tiw = gl_SubgroupInvocationID;
+
+ const uint cms_per_row = WM / TM;
+ const uint cms_per_col = WN / TN;
+
+ const uint storestride = WARP / TM;
+ const uint store_r = tiw % TM;
+ const uint store_c = tiw / TM;
+#else
+ const uint warp_i = gl_LocalInvocationID.x / WARP;
+
+ const uint tiw = gl_LocalInvocationID.x % WARP;
+
+ const uint tiwr = tiw % (WSUBM / TM);
+ const uint tiwc = tiw / (WSUBM / TM);
+#endif
+
+ const uint warp_r = warp_i % (BM / WM);
+ const uint warp_c = warp_i / (BM / WM);
+
+ const uint loadr_a = gl_LocalInvocationID.x % (BK / LOAD_VEC_A / LOAD_VEC_BATCH_A);
+ const uint loadc_a = gl_LocalInvocationID.x / (BK / LOAD_VEC_A / LOAD_VEC_BATCH_A);
+ const uint loadr_b = gl_LocalInvocationID.x % (BK / LOAD_VEC_B / LOAD_VEC_BATCH_B);
+ const uint loadc_b = gl_LocalInvocationID.x / (BK / LOAD_VEC_B / LOAD_VEC_BATCH_B);
+
+ const uint loadstride_a = gl_WorkGroupSize.x * LOAD_VEC_A * LOAD_VEC_BATCH_A / BK;
+ const uint loadstride_b = gl_WorkGroupSize.x * LOAD_VEC_B * LOAD_VEC_BATCH_B / BK;
+
+#ifdef MUL_MAT_ID
+#ifdef MUL_MAT_ID_USE_SUBGROUPS
+ if (bitCount(p.nei0) == 1) {
+ load_row_ids(expert_idx, true, ic);
+ } else {
+ load_row_ids(expert_idx, false, ic);
+ }
+#else
+ _ne1 = 0;
+ for (uint ii1 = 0; ii1 < p.nei1 && _ne1 < (ic + 1) * BN; ii1++) {
+ for (uint ii0 = 0; ii0 < p.nei0 && _ne1 < (ic + 1) * BN; ii0++) {
+ if (data_ids[ii1*p.nbi1 + ii0] == expert_idx) {
+ if (_ne1 >= ic * BN) {
+ row_ids[_ne1 - ic * BN] = u16vec2(ii0, ii1);
+ }
+ _ne1++;
+ }
+ }
+ }
+
+ barrier();
+#endif
+
+ // Workgroup has no work
+ if (ic * BN >= _ne1) return;
+#endif
+
+#ifdef MUL_MAT_ID
+ const uint start_k = 0;
+ const uint end_k = p.K;
+#else
+ const uint start_k = ik * p.k_split;
+ const uint end_k = min(p.K, (ik + 1) * p.k_split);
+#endif
+
+ uint pos_a =
+#ifdef MUL_MAT_ID
+ expert_idx * (p.batch_stride_a / LOAD_VEC_A) +
+#else
+ batch_idx_a * (p.batch_stride_a / LOAD_VEC_A) +
+#endif
+ (ir * BM * p.stride_a + start_k) / LOAD_VEC_A;
+#ifdef MUL_MAT_ID
+ uint pos_b = 0;
+#else
+ uint pos_b = (batch_idx * p.batch_stride_b + ic * BN * p.stride_b + start_k) / LOAD_VEC_B;
+#endif
+
+#ifdef COOPMAT
+ coopmat<FLOAT_TYPE, gl_ScopeSubgroup, TM, TK, gl_MatrixUseA> cache_a;
+ coopmat<FLOAT_TYPE, gl_ScopeSubgroup, TK, TN, gl_MatrixUseB> cache_b;
+ coopmat<ACC_TYPE, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator> sums[cms_per_row * cms_per_col];
+
+ [[unroll]] for (uint i = 0; i < cms_per_row * cms_per_col; i++) {
+ sums[i] = coopmat<ACC_TYPE, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator>(0.0f);
+ }
+#else
+ ACC_TYPE_VEC2 sums[WMITER * TM * WNITER * TN/2];
+#if defined(DATA_A_F32) || defined(DATA_A_F16)
+ FLOAT_TYPE_VEC4 cache_a[WMITER * TM];
+ FLOAT_TYPE_VEC4 cache_b;
+#else
+ FLOAT_TYPE_VEC2 cache_a[WMITER * TM];
+ FLOAT_TYPE_VEC2 cache_b;
+#endif
+
+ [[unroll]] for (uint i = 0; i < WMITER*TM*WNITER*TN/2; i++) {
+ sums[i] = ACC_TYPE_VEC2(0.0f, 0.0f);
+ }
+#endif
+
+ for (uint block = start_k; block < end_k; block += BK) {
+ [[unroll]] for (uint l = 0; l < BM; l += loadstride_a) {
+ load_a_to_shmem(pos_a, loadr_a, loadc_a + l, ir * BM + loadc_a + l, block, end_k);
+ }
+ [[unroll]] for (uint l = 0; l < BN; l += loadstride_b) {
+#if !defined(MUL_MAT_ID)
+ load_b_to_shmem(pos_b, loadr_b, loadc_b + l, ic * BN + loadc_b + l, block, end_k);
+#else
+ load_b_to_shmem(pos_b, loadr_b, loadc_b + l, ic, _ne1, block, end_k);
+#endif
+ }
+
+ barrier();
+
+ pos_a += BK / LOAD_VEC_A;
+ pos_b += BK / LOAD_VEC_B;
+
+#ifdef COOPMAT
+ [[unroll]] for (uint i = 0; i < BK; i += TK) {
+ [[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
+ // Load from shared into cache
+ coopMatLoad(cache_a, buf_a, (warp_r * WM + cm_row * TM) * SHMEM_STRIDE + i / 2, SHMEM_STRIDE, gl_CooperativeMatrixLayoutRowMajor);
+
+ [[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
+ coopMatLoad(cache_b, buf_b, (warp_c * WN + cm_col * TN) * SHMEM_STRIDE + i / 2, SHMEM_STRIDE, gl_CooperativeMatrixLayoutColumnMajor);
+
+ sums[cm_col * cms_per_row + cm_row] = coopMatMulAdd(cache_a, cache_b, sums[cm_col * cms_per_row + cm_row]);
+ }
+ }
+ }
+#else
+ [[unroll]] for (uint i = 0; i < BK / BK_STEP; i++) {
+ // Load from shared into cache
+ [[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
+ [[unroll]] for (uint j = 0; j < TM; j++) {
+ #if defined(DATA_A_F32) || defined(DATA_A_F16)
+ cache_a[wsir * TM + j].xy = buf_a[(warp_r * WM + wsir * WSUBM + tiwr * TM + j) * SHMEM_STRIDE + 2 * i ];
+ cache_a[wsir * TM + j].zw = buf_a[(warp_r * WM + wsir * WSUBM + tiwr * TM + j) * SHMEM_STRIDE + 2 * i + 1];
+ #else
+ cache_a[wsir * TM + j] = buf_a[(warp_r * WM + wsir * WSUBM + tiwr * TM + j) * SHMEM_STRIDE + i];
+ #endif
+ }
+ }
+
+ [[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) {
+ [[unroll]] for (uint cc = 0; cc < TN; cc++) {
+ #if defined(DATA_A_F32) || defined(DATA_A_F16)
+ cache_b.xy = buf_b[(warp_c * WN + wsic * WSUBN + tiwc * TN + cc) * SHMEM_STRIDE + 2 * i ];
+ cache_b.zw = buf_b[(warp_c * WN + wsic * WSUBN + tiwc * TN + cc) * SHMEM_STRIDE + 2 * i + 1];
+ #else
+ cache_b = buf_b[(warp_c * WN + wsic * WSUBN + tiwc * TN + cc) * SHMEM_STRIDE + i];
+ #endif
+
+ [[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
+ [[unroll]] for (uint cr = 0; cr < TM / 2; cr++) {
+ // [WNITER][TN][WMITER][TM / 2] -> [wsic][cc][wsir][cr]
+ const uint sums_idx = (wsic * TN + cc) * WMITER * (TM / 2) + wsir * (TM / 2) + cr;
+ #if defined(DATA_A_F32) || defined(DATA_A_F16)
+ sums[sums_idx].x = fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].x), ACC_TYPE(cache_b.x), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].y), ACC_TYPE(cache_b.y),
+ fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].z), ACC_TYPE(cache_b.z), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].w), ACC_TYPE(cache_b.w), sums[sums_idx].x))));
+ sums[sums_idx].y = fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].x), ACC_TYPE(cache_b.x), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].y), ACC_TYPE(cache_b.y),
+ fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].z), ACC_TYPE(cache_b.z), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].w), ACC_TYPE(cache_b.w), sums[sums_idx].y))));
+ #else
+ sums[sums_idx].x = fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].x), ACC_TYPE(cache_b.x), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].y), ACC_TYPE(cache_b.y), sums[sums_idx].x));
+ sums[sums_idx].y = fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].x), ACC_TYPE(cache_b.x), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].y), ACC_TYPE(cache_b.y), sums[sums_idx].y));
+ #endif
+ }
+ }
+ }
+ }
+
+ }
+#endif
+
+ barrier();
+ }
+
+#if defined(ACC_TYPE_MAX)
+#ifdef COOPMAT
+ [[unroll]] for (uint j = 0; j < cms_per_row * cms_per_col; j++) {
+ [[unroll]] for (uint i = 0; i < sums[j].length(); ++i) {
+ sums[j][i] = clamp(sums[j][i], -ACC_TYPE_MAX, ACC_TYPE_MAX);
+ }
+ }
+#else
+ [[unroll]] for (uint i = 0; i < WMITER*TM*WNITER*TN/2; i++) {
+ sums[i].x = clamp(sums[i].x, -ACC_TYPE_MAX, ACC_TYPE_MAX);
+ sums[i].y = clamp(sums[i].y, -ACC_TYPE_MAX, ACC_TYPE_MAX);
+ }
+#endif
+#endif
+
+ const uint dr = ir * BM + warp_r * WM;
+ const uint dc = ic * BN + warp_c * WN;
+
+#ifndef MUL_MAT_ID
+ const uint offsets = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * gl_NumWorkGroups.z;
+#endif
+
+#ifdef COOPMAT
+#ifdef MUL_MAT_ID
+ [[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
+ [[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
+ coopMatStore(sums[cm_col * cms_per_row + cm_row], coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor);
+
+ [[unroll]] for (uint col = 0; col < TN; col += storestride) {
+ const uint row_i = dc + cm_col * TN + col + store_c;
+ if (row_i >= _ne1) break;
+
+ const u16vec2 row_idx = row_ids[row_i - ic * BN];
+
+ if (dr + cm_row * TM + store_r < p.M) {
+ data_d[row_idx.y * p.batch_stride_d + row_idx.x * p.stride_d + dr + cm_row * TM + store_r] = D_TYPE(coopmat_stage[warp_i * TM * TN + (col + store_c) * TM + store_r]);
+ }
+ }
+ }
+ }
+#else
+ const bool is_aligned = p.stride_d % 4 == 0; // Assumption: D_TYPE == float
+
+ [[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) {
+ [[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
+ const bool is_in_bounds = dr + (cm_row + 1) * TM <= p.M && dc + (cm_col + 1) * TN <= p.N;
+
+ if (is_aligned && is_in_bounds) {
+ // Full coopMat is within bounds and stride_d is aligned with 16B
+ coopmat<D_TYPE, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator> cm_dtype = coopmat<D_TYPE, gl_ScopeSubgroup, TM, TN, gl_MatrixUseAccumulator>(sums[cm_col * cms_per_row + cm_row]);
+ coopMatStore(cm_dtype, data_d, offsets + (dc + cm_col * TN) * p.stride_d + dr + cm_row * TM, p.stride_d, gl_CooperativeMatrixLayoutColumnMajor);
+ } else if (is_in_bounds) {
+ // Full coopMat is within bounds, but stride_d is not aligned
+ coopMatStore(sums[cm_col * cms_per_row + cm_row], coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor);
+
+ [[unroll]] for (uint col = 0; col < TN; col += storestride) {
+ data_d[offsets + (dc + cm_col * TN + col + store_c) * p.stride_d + dr + cm_row * TM + store_r] = D_TYPE(coopmat_stage[warp_i * TM * TN + (col + store_c) * TM + store_r]);
+ }
+ } else if (dr + cm_row * TM < p.M && dc + cm_col * TN < p.N) {
+ // Partial coopMat is within bounds
+ coopMatStore(sums[cm_col * cms_per_row + cm_row], coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor);
+
+ [[unroll]] for (uint col = 0; col < TN; col += storestride) {
+ if (dr + cm_row * TM + store_r < p.M && dc + cm_col * TN + col + store_c < p.N) {
+ data_d[offsets + (dc + cm_col * TN + col + store_c) * p.stride_d + dr + cm_row * TM + store_r] = D_TYPE(coopmat_stage[warp_i * TM * TN + (col + store_c) * TM + store_r]);
+ }
+ }
+ }
+ }
+ }
+#endif // MUL_MAT_ID
+#else
+ [[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) {
+ [[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
+
+ const uint dr_warp = dr + wsir * WSUBM + tiwr * TM;
+ const uint dc_warp = dc + wsic * WSUBN + tiwc * TN;
+ [[unroll]] for (uint cc = 0; cc < TN; cc++) {
+#ifdef MUL_MAT_ID
+ const uint row_i = dc_warp + cc;
+ if (row_i >= _ne1) break;
+
+ const u16vec2 row_idx = row_ids[row_i - ic * BN];
+#endif // MUL_MAT_ID
+ [[unroll]] for (uint cr = 0; cr < TM / 2; cr++) {
+ const uint sums_idx = (wsic * TN + cc) * WMITER * (TM / 2) + wsir * (TM / 2) + cr;
+#ifdef MUL_MAT_ID
+ if (dr_warp + 2 * cr < p.M) {
+ data_d[row_idx.y * p.batch_stride_d + row_idx.x * p.stride_d + dr_warp + 2 * cr] = D_TYPE(sums[sums_idx].x);
+ }
+ if (dr_warp + 2 * cr + 1 < p.M) {
+ data_d[row_idx.y * p.batch_stride_d + row_idx.x * p.stride_d + dr_warp + 2 * cr + 1] = D_TYPE(sums[sums_idx].y);
+ }
+#else
+ if (dr_warp + 2 * cr < p.M && dc_warp + cc < p.N) {
+ data_d[offsets + (dc_warp + cc) * p.stride_d + dr_warp + 2 * cr] = D_TYPE(sums[sums_idx].x);
+ }
+ if (dr_warp + 2 * cr + 1 < p.M && dc_warp + cc < p.N) {
+ data_d[offsets + (dc_warp + cc) * p.stride_d + dr_warp + 2 * cr + 1] = D_TYPE(sums[sums_idx].y);
+ }
+#endif // MUL_MAT_ID
+ }
+ }
+ }
+ }
+#endif // COOPMAT
+}