diff options
| author | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
|---|---|---|
| committer | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
| commit | b333b06772c89d96aacb5490d6a219fba7c09cc6 (patch) | |
| tree | 211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/ggml/src/ggml-cuda/unary.cuh | |
| download | llmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz | |
Engage!
Diffstat (limited to 'llama.cpp/ggml/src/ggml-cuda/unary.cuh')
| -rw-r--r-- | llama.cpp/ggml/src/ggml-cuda/unary.cuh | 110 |
1 files changed, 110 insertions, 0 deletions
diff --git a/llama.cpp/ggml/src/ggml-cuda/unary.cuh b/llama.cpp/ggml/src/ggml-cuda/unary.cuh new file mode 100644 index 0000000..609046e --- /dev/null +++ b/llama.cpp/ggml/src/ggml-cuda/unary.cuh | |||
| @@ -0,0 +1,110 @@ | |||
| 1 | #pragma once | ||
| 2 | #include "common.cuh" | ||
| 3 | |||
| 4 | #define CUDA_NEG_BLOCK_SIZE 256 | ||
| 5 | #define CUDA_STEP_BLOCK_SIZE 256 | ||
| 6 | #define CUDA_GELU_BLOCK_SIZE 256 | ||
| 7 | #define CUDA_SILU_BLOCK_SIZE 256 | ||
| 8 | #define CUDA_SILU_BACK_BLOCK_SIZE 256 | ||
| 9 | #define CUDA_TANH_BLOCK_SIZE 256 | ||
| 10 | #define CUDA_RELU_BLOCK_SIZE 256 | ||
| 11 | #define CUDA_SIGMOID_BLOCK_SIZE 256 | ||
| 12 | #define CUDA_HARDSIGMOID_BLOCK_SIZE 256 | ||
| 13 | #define CUDA_EXP_BLOCK_SIZE 256 | ||
| 14 | #define CUDA_HARDSWISH_BLOCK_SIZE 256 | ||
| 15 | #define CUDA_SQR_BLOCK_SIZE 256 | ||
| 16 | #define CUDA_SQRT_BLOCK_SIZE 256 | ||
| 17 | #define CUDA_SIN_BLOCK_SIZE 256 | ||
| 18 | #define CUDA_COS_BLOCK_SIZE 256 | ||
| 19 | #define CUDA_GLU_BLOCK_SIZE 256 | ||
| 20 | #define CUDA_XIELU_BLOCK_SIZE 256 | ||
| 21 | |||
| 22 | void ggml_cuda_op_abs(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 23 | |||
| 24 | void ggml_cuda_op_sgn(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 25 | |||
| 26 | void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 27 | |||
| 28 | void ggml_cuda_op_step(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 29 | |||
| 30 | void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 31 | |||
| 32 | void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 33 | |||
| 34 | void ggml_cuda_op_silu_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 35 | |||
| 36 | void ggml_cuda_op_gelu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 37 | |||
| 38 | void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 39 | |||
| 40 | void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 41 | |||
| 42 | void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 43 | |||
| 44 | void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 45 | |||
| 46 | void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 47 | |||
| 48 | void ggml_cuda_op_exp(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 49 | |||
| 50 | void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 51 | |||
| 52 | void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 53 | |||
| 54 | void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 55 | |||
| 56 | void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 57 | |||
| 58 | void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 59 | |||
| 60 | void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 61 | |||
| 62 | void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 63 | |||
| 64 | void ggml_cuda_op_expm1(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 65 | |||
| 66 | void ggml_cuda_op_softplus(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 67 | |||
| 68 | void ggml_cuda_op_elu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 69 | |||
| 70 | void ggml_cuda_op_floor(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 71 | |||
| 72 | void ggml_cuda_op_ceil(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 73 | |||
| 74 | void ggml_cuda_op_round(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 75 | |||
| 76 | void ggml_cuda_op_trunc(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 77 | |||
| 78 | void ggml_cuda_op_reglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 79 | |||
| 80 | void ggml_cuda_op_geglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 81 | |||
| 82 | void ggml_cuda_op_swiglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 83 | |||
| 84 | void ggml_cuda_op_swiglu_oai(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 85 | |||
| 86 | void ggml_cuda_op_geglu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 87 | |||
| 88 | void ggml_cuda_op_geglu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 89 | |||
| 90 | void ggml_cuda_op_xielu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 91 | |||
// SiLU (swish) of a single value: x * sigmoid(x), written as x / (1 + e^-x).
// Pure scalar helper; safe to call per-element from any kernel.
__device__ __forceinline__ float ggml_cuda_op_silu_single(float x) {
    const float denom = 1.0f + expf(-x);
    return x / denom;
}
| 95 | |||
// GELU of a single value using the tanh approximation:
//   0.5 * x * (1 + tanh(sqrt(2/pi) * x * (1 + 0.044715 * x^2)))
// Pure scalar helper; safe to call per-element from any kernel.
__device__ __forceinline__ float ggml_cuda_op_gelu_single(float x) {
    const float coef_a         = 0.044715f;
    const float sqrt_2_over_pi = 0.79788456080286535587989211986876f;

    // argument of tanh, kept in the same evaluation order as the reference
    const float inner = sqrt_2_over_pi * x * (1.0f + coef_a * x * x);
    return 0.5f * x * (1.0f + tanhf(inner));
}
| 102 | |||
// SwiGLU variant (OAI style) for a single (x, g) pair:
//   clamp x from above at `limit`, clamp g symmetrically to [-limit, limit],
//   then return sigmoid(alpha * x) * x * (1 + g).
// Defaults alpha = 1.702 (sigmoid approximation of GELU) and limit = 7.
__device__ __forceinline__ float ggml_cuda_op_swiglu_oai_single(float x, float g, float alpha = 1.702f, float limit = 7.0f) {
    const float xc = fminf(x, limit);                 // upper clamp only
    const float gc = fmaxf(fminf(g, limit), -limit);  // symmetric clamp

    const float silu = xc / (1.0f + expf(-xc * alpha));
    return silu * (1.0f + gc);
}
