// Row-wise inclusive cumulative sum.
//
// Workgroup `wid.x` handles one row of `params.ne0` f32 values. Each invocation
// owns a contiguous slice of that row, so the work is split in three phases:
//   1. every invocation sums its own slice,
//   2. the per-invocation totals are turned into an exclusive prefix sum in
//      workgroup memory (up-sweep / down-sweep tree scan),
//   3. every invocation re-walks its slice, starting from its exclusive
//      prefix, and writes the running sum to `dst`.
//
// WG_SIZE is not defined in this file; it has to be provided by whatever
// assembles the shader. The tree scan only touches every slot correctly when
// WG_SIZE is a power of two.
//
// NOTE(review): the address spaces / access modes on the three bindings below
// were reconstructed (the declarations had lost their template arguments).
// Confirm them against the host-side bind group layout.

@group(0) @binding(0)
var<storage, read_write> src: array<f32>;

@group(0) @binding(1)
var<storage, read_write> dst: array<f32>;

struct Params {
    offset_src: u32, // in elements
    offset_dst: u32, // in elements

    ne0: u32,        // number of elements in one row
};

@group(0) @binding(2)
var<uniform> params: Params;

// One slot per invocation: first the slice total, after the scan the
// exclusive prefix of the slice totals.
var<workgroup> shared_sum: array<f32, WG_SIZE>;

@compute @workgroup_size(WG_SIZE)
fn main(@builtin(workgroup_id) wid: vec3<u32>,
        @builtin(local_invocation_id) lid: vec3<u32>) {
    let row_idx = params.offset_src + wid.x * params.ne0;
    let dst_row = params.offset_dst + wid.x * params.ne0;

    // Slice of the row owned by this invocation: [col_begin, col_end).
    // `elems` is ceil(ne0 / WG_SIZE); trailing invocations may get an empty slice.
    let elems = (params.ne0 + WG_SIZE - 1) / WG_SIZE;
    let col_begin = lid.x * elems;
    let col_end = min(col_begin + elems, params.ne0);

    // Phase 1: total of this invocation's slice.
    var local_sum: f32 = 0.0;
    for (var col = col_begin; col < col_end; col++) {
        local_sum += src[row_idx + col];
    }
    shared_sum[lid.x] = local_sum;
    workgroupBarrier();

    // Phase 2a: up-sweep. After the pass with stride `offset`, every slot at
    // position k * 2 * offset - 1 holds the sum of the 2 * offset slots ending there.
    var offset = 1u;
    while (offset < WG_SIZE) {
        let idx = (lid.x + 1) * offset * 2 - 1;
        if (idx < WG_SIZE) {
            shared_sum[idx] = shared_sum[idx] + shared_sum[idx - offset];
        }
        // Kept outside the `if` so every invocation reaches the barrier.
        workgroupBarrier();
        offset <<= 1;
    }

    // Clear the root so the down-sweep produces an exclusive sum.
    if (lid.x == 0) {
        shared_sum[WG_SIZE - 1] = 0.0;
    }
    workgroupBarrier();

    // Phase 2b: down-sweep. Each node hands its value to the left child and
    // (its value + old left child) to the right child.
    offset = WG_SIZE >> 1;
    while (offset > 0) {
        let idx = (lid.x + 1) * offset * 2 - 1;
        if (idx < WG_SIZE) {
            let t = shared_sum[idx - offset];
            shared_sum[idx - offset] = shared_sum[idx];
            shared_sum[idx] = shared_sum[idx] + t;
        }
        workgroupBarrier();
        offset = offset >> 1;
    }

    // Phase 3: shared_sum[lid.x] is now the sum of all slices before this one.
    // Re-walk the slice and emit the inclusive running sum.
    var running_sum = shared_sum[lid.x];
    for (var col = col_begin; col < col_end; col++) {
        running_sum += src[row_idx + col];
        dst[dst_row + col] = running_sum;
    }
}