#include "common.cuh"

// Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
//
// Launch layout: one block per row (blockIdx.x selects the row), with
// blockDim.x threads cooperating over that row's ncols columns.
// `block_reduce` (from common.cuh) combines the per-thread partial sums;
// `shared_vals` is the 32-float scratch buffer it expects — presumably one
// slot per warp, implying blockDim.x <= 1024 (TODO confirm against
// block_reduce's contract in common.cuh).
template <bool norm>
static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __restrict__ dst, const int ncols) {
    const int row = blockIdx.x;   // one block per row
    const int col = threadIdx.x;  // this thread's starting column

    float sum = 0.0f;

    // Manually unrolled block-stride accumulation: num_unroll independent
    // partial sums let the global loads be issued back-to-back and keep the
    // floating-point adds from forming one serial dependency chain.
    const int num_unroll = 8;
    float temp[num_unroll];
    float sum_temp[num_unroll] = { 0.0f };
    for (int i = col; i < ncols;) {
        for (int j = 0; j < num_unroll; ++j) {
            // Lanes that run past the row contribute 0.0f so the summation
            // below needs no per-element bounds handling.
            temp[j] = i < ncols ? x[row * ncols + i] : 0.0f;
            i += blockDim.x;
        }
        for (int j = 0; j < num_unroll; ++j) {
            sum_temp[j] += temp[j];
        }
    }
    for (int j = 0; j < num_unroll; ++j) {
        sum += sum_temp[j];
    }

    // sum up partial sums across the whole block
    __shared__ float shared_vals[32];
    sum = block_reduce(sum, shared_vals);

    // After the block-wide reduction only thread 0 holds the final value;
    // all other threads are done.
    if (col != 0) {
        return;
    }

    // norm is a compile-time template flag: divide by the column count to
    // turn the sum into a mean when requested.
    dst[row] = norm ? sum / ncols : sum;
}