summaryrefslogtreecommitdiff
path: root/llama.cpp/ggml/src/ggml-opencl/kernels/argsort.cl
diff options
context:
space:
mode:
authorMitja Felicijan <mitja.felicijan@gmail.com>2026-02-12 20:57:17 +0100
committerMitja Felicijan <mitja.felicijan@gmail.com>2026-02-12 20:57:17 +0100
commitb333b06772c89d96aacb5490d6a219fba7c09cc6 (patch)
tree211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/ggml/src/ggml-opencl/kernels/argsort.cl
downloadllmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz
Engage!
Diffstat (limited to 'llama.cpp/ggml/src/ggml-opencl/kernels/argsort.cl')
-rw-r--r--llama.cpp/ggml/src/ggml-opencl/kernels/argsort.cl86
1 files changed, 86 insertions, 0 deletions
diff --git a/llama.cpp/ggml/src/ggml-opencl/kernels/argsort.cl b/llama.cpp/ggml/src/ggml-opencl/kernels/argsort.cl
new file mode 100644
index 0000000..af4adc7
--- /dev/null
+++ b/llama.cpp/ggml/src/ggml-opencl/kernels/argsort.cl
@@ -0,0 +1,86 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#ifdef cl_intel_subgroups
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#else
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#endif
+
+#ifdef cl_intel_required_subgroup_size
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#endif
+
+#define SWAP(x, y, T) { T tmp = (x); (x) = (y); (y) = tmp; }
+
+enum ggml_sort_order {
+ GGML_SORT_ORDER_ASC,
+ GGML_SORT_ORDER_DESC,
+};
+
+kernel void kernel_argsort_f32_i32(
+ global float * src0,
+ ulong offset0,
+ global int * dst,
+ ulong offsetd,
+ const int ne00,
+ const int ne00_pad,
+ const int order,
+ local int * dst_row
+) {
+ // bitonic sort
+ int col = get_local_id(0);
+ int row = get_group_id(1);
+
+ if (col >= ne00_pad) {
+ return;
+ }
+
+ src0 = (global char *)((global char *)src0 + offset0);
+ dst = (global float *)((global char *)dst + offsetd);
+
+ global float * x_row = src0 + row * ne00;
+
+ // initialize indices
+ dst_row[col] = col;
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ for (int k = 2; k <= ne00_pad; k *= 2) {
+ for (int j = k / 2; j > 0; j /= 2) {
+ int ixj = col ^ j;
+ if (ixj > col) {
+ if ((col & k) == 0) {
+ if (dst_row[col] >= ne00 ||
+ (dst_row[ixj] < ne00 && (order == GGML_SORT_ORDER_ASC ?
+ x_row[dst_row[col]] > x_row[dst_row[ixj]] :
+ x_row[dst_row[col]] < x_row[dst_row[ixj]]))
+ ) {
+ SWAP(dst_row[col], dst_row[ixj], int);
+ }
+ } else {
+ if (dst_row[ixj] >= ne00 ||
+ (dst_row[col] < ne00 && (order == GGML_SORT_ORDER_ASC ?
+ x_row[dst_row[col]] < x_row[dst_row[ixj]] :
+ x_row[dst_row[col]] > x_row[dst_row[ixj]]))
+ ) {
+ SWAP(dst_row[col], dst_row[ixj], int);
+ }
+ }
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+ }
+
+ // copy the result to dst without the padding
+ if (col < ne00) {
+ dst[row * ne00 + col] = dst_row[col];
+ }
+}