diff options
| author | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
|---|---|---|
| committer | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
| commit | b333b06772c89d96aacb5490d6a219fba7c09cc6 (patch) | |
| tree | 211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/ggml/src/ggml-cuda/unary.cuh | |
| download | llmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz | |
Engage!
Diffstat (limited to 'llama.cpp/ggml/src/ggml-cuda/unary.cuh')
| -rw-r--r-- | llama.cpp/ggml/src/ggml-cuda/unary.cuh | 110 |
1 files changed, 110 insertions, 0 deletions
diff --git a/llama.cpp/ggml/src/ggml-cuda/unary.cuh b/llama.cpp/ggml/src/ggml-cuda/unary.cuh new file mode 100644 index 0000000..609046e --- /dev/null +++ b/llama.cpp/ggml/src/ggml-cuda/unary.cuh | |||
| @@ -0,0 +1,110 @@ | |||
| 1 | #pragma once | ||
| 2 | #include "common.cuh" | ||
| 3 | |||
| 4 | #define CUDA_NEG_BLOCK_SIZE 256 | ||
| 5 | #define CUDA_STEP_BLOCK_SIZE 256 | ||
| 6 | #define CUDA_GELU_BLOCK_SIZE 256 | ||
| 7 | #define CUDA_SILU_BLOCK_SIZE 256 | ||
| 8 | #define CUDA_SILU_BACK_BLOCK_SIZE 256 | ||
| 9 | #define CUDA_TANH_BLOCK_SIZE 256 | ||
| 10 | #define CUDA_RELU_BLOCK_SIZE 256 | ||
| 11 | #define CUDA_SIGMOID_BLOCK_SIZE 256 | ||
| 12 | #define CUDA_HARDSIGMOID_BLOCK_SIZE 256 | ||
| 13 | #define CUDA_EXP_BLOCK_SIZE 256 | ||
| 14 | #define CUDA_HARDSWISH_BLOCK_SIZE 256 | ||
| 15 | #define CUDA_SQR_BLOCK_SIZE 256 | ||
| 16 | #define CUDA_SQRT_BLOCK_SIZE 256 | ||
| 17 | #define CUDA_SIN_BLOCK_SIZE 256 | ||
| 18 | #define CUDA_COS_BLOCK_SIZE 256 | ||
| 19 | #define CUDA_GLU_BLOCK_SIZE 256 | ||
| 20 | #define CUDA_XIELU_BLOCK_SIZE 256 | ||
| 21 | |||
| 22 | void ggml_cuda_op_abs(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 23 | |||
| 24 | void ggml_cuda_op_sgn(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 25 | |||
| 26 | void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 27 | |||
| 28 | void ggml_cuda_op_step(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 29 | |||
| 30 | void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 31 | |||
| 32 | void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 33 | |||
| 34 | void ggml_cuda_op_silu_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 35 | |||
| 36 | void ggml_cuda_op_gelu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 37 | |||
| 38 | void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 39 | |||
| 40 | void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 41 | |||
| 42 | void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 43 | |||
| 44 | void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 45 | |||
| 46 | void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 47 | |||
| 48 | void ggml_cuda_op_exp(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 49 | |||
| 50 | void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 51 | |||
| 52 | void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 53 | |||
| 54 | void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 55 | |||
| 56 | void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 57 | |||
| 58 | void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 59 | |||
| 60 | void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 61 | |||
| 62 | void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 63 | |||
| 64 | void ggml_cuda_op_expm1(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 65 | |||
| 66 | void ggml_cuda_op_softplus(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 67 | |||
| 68 | void ggml_cuda_op_elu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 69 | |||
| 70 | void ggml_cuda_op_floor(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 71 | |||
| 72 | void ggml_cuda_op_ceil(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 73 | |||
| 74 | void ggml_cuda_op_round(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 75 | |||
| 76 | void ggml_cuda_op_trunc(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 77 | |||
| 78 | void ggml_cuda_op_reglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 79 | |||
| 80 | void ggml_cuda_op_geglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 81 | |||
| 82 | void ggml_cuda_op_swiglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 83 | |||
| 84 | void ggml_cuda_op_swiglu_oai(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 85 | |||
| 86 | void ggml_cuda_op_geglu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 87 | |||
| 88 | void ggml_cuda_op_geglu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 89 | |||
| 90 | void ggml_cuda_op_xielu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); | ||
| 91 | |||
// SiLU (swish) of a single value: x * sigmoid(x), written as x / (1 + e^-x).
// Pure scalar helper; safe to call per-element from any kernel.
__device__ __forceinline__ float ggml_cuda_op_silu_single(float x) {
    const float denom = 1.0f + expf(-x);
    return x / denom;
}
| 95 | |||
// GELU of a single value using the tanh approximation:
//   0.5 * x * (1 + tanh(sqrt(2/pi) * x * (1 + 0.044715 * x^2)))
// Pure scalar helper; safe to call per-element from any kernel.
__device__ __forceinline__ float ggml_cuda_op_gelu_single(float x) {
    const float coef_a         = 0.044715f;
    const float sqrt_2_over_pi = 0.79788456080286535587989211986876f;

    // argument of tanh, kept in the same evaluation order as the reference
    const float inner = sqrt_2_over_pi * x * (1.0f + coef_a * x * x);
    return 0.5f * x * (1.0f + tanhf(inner));
}
| 102 | |||
// SwiGLU variant (OAI style) for a single (x, g) pair:
//   clamp x from above at `limit`, clamp g symmetrically to [-limit, limit],
//   then return sigmoid(alpha * x) * x * (1 + g).
// Defaults alpha = 1.702 (sigmoid approximation of GELU) and limit = 7.
__device__ __forceinline__ float ggml_cuda_op_swiglu_oai_single(float x, float g, float alpha = 1.702f, float limit = 7.0f) {
    const float xc = fminf(x, limit);                 // upper clamp only
    const float gc = fmaxf(fminf(g, limit), -limit);  // symmetric clamp

    const float silu = xc / (1.0f + expf(-xc * alpha));
    return silu * (1.0f + gc);
}
