diff options
Diffstat (limited to 'llama.cpp/ggml/src/ggml-cuda/reduce_rows.cuh')
| -rw-r--r-- | llama.cpp/ggml/src/ggml-cuda/reduce_rows.cuh | 39 |
1 file changed, 39 insertions, 0 deletions
#pragma once

#include "common.cuh"

#include <cstdint>

// Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
// of each row of a row-major f32 matrix with `ncols` columns.
//
// Launch layout: one 1D block per row (blockIdx.x selects the row); every
// thread of the block cooperates on that row. Uses 32 floats of static
// shared memory for the block-level reduction.
template <bool norm>
static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __restrict__ dst, const int ncols) {
    const int row = blockIdx.x;
    const int col = threadIdx.x;

    // Compute the row base with a 64-bit offset: row*ncols overflows 32-bit
    // int once the tensor exceeds 2^31 elements.
    const float * row_x = x + (int64_t) row * ncols;

    // Each thread accumulates a blockDim.x-strided slice of the row.
    // The manual 8-way unroll keeps 8 independent loads and 8 independent
    // accumulators in flight to hide global-memory latency.
    constexpr int num_unroll = 8;
    float temp[num_unroll];
    float sum_temp[num_unroll] = { 0.0f };
    for (int i = col; i < ncols;) {
#pragma unroll
        for (int j = 0; j < num_unroll; ++j) {
            // Out-of-range iterations contribute 0.0f so the unrolled adds
            // below stay branch-free across the tail of the row.
            temp[j] = i < ncols ? row_x[i] : 0.0f;
            i += blockDim.x;
        }
#pragma unroll
        for (int j = 0; j < num_unroll; ++j) {
            sum_temp[j] += temp[j];
        }
    }

    // Fold the per-thread accumulators into a single partial sum.
    float sum = 0.0f;
#pragma unroll
    for (int j = 0; j < num_unroll; ++j) {
        sum += sum_temp[j];
    }

    // sum up partial sums across the whole block
    __shared__ float shared_vals[32];
    sum = block_reduce<block_reduce_method::SUM>(sum, shared_vals);

    // Only thread 0 holds the full result and writes it out.
    if (col != 0) {
        return;
    }

    dst[row] = norm ? sum / ncols : sum;
}
