// llmnpc - llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl
//
// Path: llmnpc/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl
 1@group(0) @binding(0)
 2var<storage, read_write> src: array<f32>;
 3
 4@group(0) @binding(1)
 5var<storage, read_write> dst: array<f32>;
 6
 7struct Params {
 8    offset_src: u32, // in elements
 9    offset_dst: u32, // in elements
10    ne0: u32,
11};
12
// Uniform block holding the parameters for the current dispatch.
@group(0) @binding(2) var<uniform> params: Params;

// Workgroup scratch array with one slot per invocation. It first holds each
// invocation's partial sum and is then scanned in place.
// NOTE(review): WG_SIZE is not declared in this file; presumably it is
// substituted or defined by the host before the shader is compiled. Confirm.
var<workgroup> shared_sum: array<f32, WG_SIZE>;

// Inclusive cumulative sum over rows of `params.ne0` elements; workgroup
// `wid.x` processes row `wid.x`.
//
// The work is done in three phases:
//   1. each invocation adds up its own contiguous slice of the row,
//   2. the per-invocation totals in `shared_sum` are converted in place into
//      an exclusive prefix sum (up-sweep, clear last slot, down-sweep),
//   3. each invocation walks its slice again, seeded with its exclusive
//      prefix, and stores the running total for every element into `dst`.
//
// NOTE(review): the sweep indexing looks like it relies on WG_SIZE being a
// power of two; WG_SIZE is defined outside this file, so confirm at the
// place where the pipeline is created.
@compute @workgroup_size(WG_SIZE)
fn main(@builtin(workgroup_id) wid: vec3<u32>,
        @builtin(local_invocation_id) lid: vec3<u32>) {
    let tid = lid.x;

    // First element of this workgroup's row in the source and destination.
    let src_base = params.offset_src + wid.x * params.ne0;
    let dst_base = params.offset_dst + wid.x * params.ne0;

    // Slice [first, last) handled by this invocation. The slice length is
    // ne0 / WG_SIZE rounded up, and the end is clamped to the row length so
    // trailing invocations may get a short or empty slice.
    let per_thread = (params.ne0 + WG_SIZE - 1) / WG_SIZE;
    let first = tid * per_thread;
    let last = min(first + per_thread, params.ne0);

    // Phase 1: total of this invocation's slice.
    var partial: f32 = 0.0;
    for (var i = first; i < last; i++) {
        partial += src[src_base + i];
    }
    shared_sum[tid] = partial;
    // Every partial sum must be stored before any invocation starts combining.
    workgroupBarrier();

    // Phase 2a: up-sweep. With the pair distance doubling every round, slot
    // `hi` accumulates the value `stride` slots below it.
    for (var stride: u32 = 1u; stride < WG_SIZE; stride <<= 1u) {
        let hi = (tid + 1u) * stride * 2u - 1u;
        if (hi < WG_SIZE) {
            shared_sum[hi] += shared_sum[hi - stride];
        }
        workgroupBarrier();
    }

    // Phase 2b: clear the last slot so the down-sweep yields an exclusive sum.
    if (tid == 0u) {
        shared_sum[WG_SIZE - 1] = 0.0;
    }
    workgroupBarrier();

    // Phase 2c: down-sweep. With the pair distance halving every round, the
    // lower slot takes the upper slot's value and the upper slot gets the sum
    // of both.
    for (var stride: u32 = WG_SIZE >> 1; stride > 0u; stride >>= 1u) {
        let hi = (tid + 1u) * stride * 2u - 1u;
        if (hi < WG_SIZE) {
            let lo = hi - stride;
            let left = shared_sum[lo];
            shared_sum[lo] = shared_sum[hi];
            shared_sum[hi] += left;
        }
        workgroupBarrier();
    }

    // Phase 3: shared_sum[tid] now holds the sum of all slices before this
    // one. Continue the running sum through this slice and store it.
    var acc = shared_sum[tid];
    for (var i = first; i < last; i++) {
        acc += src[src_base + i];
        dst[dst_base + i] = acc;
    }
}