summaryrefslogtreecommitdiff
path: root/llama.cpp/ggml/src/ggml-cuda/reduce_rows.cuh
diff options
context:
space:
mode:
Diffstat (limited to 'llama.cpp/ggml/src/ggml-cuda/reduce_rows.cuh')
-rw-r--r--llama.cpp/ggml/src/ggml-cuda/reduce_rows.cuh39
1 files changed, 39 insertions, 0 deletions
diff --git a/llama.cpp/ggml/src/ggml-cuda/reduce_rows.cuh b/llama.cpp/ggml/src/ggml-cuda/reduce_rows.cuh
new file mode 100644
index 0000000..de240fd
--- /dev/null
+++ b/llama.cpp/ggml/src/ggml-cuda/reduce_rows.cuh
@@ -0,0 +1,39 @@
+#include "common.cuh"
+
// Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
//
// Grid layout: one 1D block per row (blockIdx.x == row index), 1D blocks.
// Each thread walks its strided slice of the row while keeping num_unroll
// independent partial accumulators (exposes ILP and hides load latency);
// the per-thread results are then combined with a block-wide reduction.
template <bool norm>
static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __restrict__ dst, const int ncols) {
    const int row = blockIdx.x;
    const int col = threadIdx.x;

    // Hoist a 64-bit row base: row * ncols can overflow 32-bit int for large
    // tensors, which would silently index the wrong (or invalid) memory.
    const float * x_row = x + (long long) row * ncols;

    float sum = 0.0f;
    const int num_unroll = 8;
    float temp[num_unroll];
    float sum_temp[num_unroll] = { 0.0f };
    for (int i = col; i < ncols;) {
#pragma unroll
        for (int j = 0; j < num_unroll; ++j) {
            // Lanes past the row tail contribute 0.0f so all num_unroll
            // accumulators stay valid without a separate tail loop.
            temp[j] = (i < ncols) ? x_row[i] : 0.0f;
            i += blockDim.x;
        }
#pragma unroll
        for (int j = 0; j < num_unroll; ++j) {
            sum_temp[j] += temp[j];
        }
    }
#pragma unroll
    for (int j = 0; j < num_unroll; ++j) {
        sum += sum_temp[j];
    }

    // sum up partial sums across the block
    // (32-float scratch for block_reduce — presumably one slot per warp;
    //  see block_reduce in common.cuh to confirm)
    __shared__ float shared_vals[32];
    sum = block_reduce<block_reduce_method::SUM>(sum, shared_vals);

    // only thread 0 holds the final block-wide sum
    if (col != 0) {
        return;
    }

    dst[row] = norm ? sum / ncols : sum;
}