diff options
| author | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
|---|---|---|
| committer | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
| commit | b333b06772c89d96aacb5490d6a219fba7c09cc6 (patch) | |
| tree | 211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/ggml/src/ggml-sycl/cpy.hpp | |
| download | llmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz | |
Engage!
Diffstat (limited to 'llama.cpp/ggml/src/ggml-sycl/cpy.hpp')
| -rw-r--r-- | llama.cpp/ggml/src/ggml-sycl/cpy.hpp | 223 |
1 files changed, 223 insertions, 0 deletions
diff --git a/llama.cpp/ggml/src/ggml-sycl/cpy.hpp b/llama.cpp/ggml/src/ggml-sycl/cpy.hpp new file mode 100644 index 0000000..3c331f1 --- /dev/null +++ b/llama.cpp/ggml/src/ggml-sycl/cpy.hpp | |||
| @@ -0,0 +1,223 @@ | |||
| 1 | #ifndef GGML_SYCL_CPY_HPP | ||
| 2 | #define GGML_SYCL_CPY_HPP | ||
| 3 | |||
| 4 | #include "common.hpp" | ||
| 5 | #include <float.h> | ||
| 6 | |||
| 7 | typedef void (*cpy_kernel_t)(const char * cx, char * cdst); | ||
| 8 | |||
| 9 | __dpct_inline__ int best_index_int8(int n, const int8_t * val, float x) { | ||
| 10 | if (x <= val[0]) { | ||
| 11 | return 0; | ||
| 12 | } | ||
| 13 | if (x >= val[n - 1]) { | ||
| 14 | return n - 1; | ||
| 15 | } | ||
| 16 | int ml = 0, mu = n - 1; | ||
| 17 | while (mu - ml > 1) { | ||
| 18 | int mav = (ml + mu) / 2; | ||
| 19 | if (x < val[mav]) { | ||
| 20 | mu = mav; | ||
| 21 | } else { | ||
| 22 | ml = mav; | ||
| 23 | } | ||
| 24 | } | ||
| 25 | return x - val[mu - 1] < val[mu] - x ? mu - 1 : mu; | ||
| 26 | } | ||
| 27 | |||
| 28 | inline void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) { | ||
| 29 | const float * xi = (const float *) cxi; | ||
| 30 | block_q8_0 * dsti = (block_q8_0 *) cdsti; | ||
| 31 | |||
| 32 | float amax = 0.0f; // absolute max | ||
| 33 | |||
| 34 | for (int j = 0; j < QK8_0; j++) { | ||
| 35 | const float v = xi[j]; | ||
| 36 | amax = sycl::fmax(amax, sycl::fabs((float) v)); | ||
| 37 | } | ||
| 38 | |||
| 39 | const float d = amax / ((1 << 7) - 1); | ||
| 40 | const float id = d ? 1.0f / d : 0.0f; | ||
| 41 | |||
| 42 | dsti->d = d; | ||
| 43 | |||
| 44 | for (int j = 0; j < QK8_0; ++j) { | ||
| 45 | const float x0 = xi[j] * id; | ||
| 46 | |||
| 47 | dsti->qs[j] = sycl::round((float) x0); | ||
| 48 | } | ||
| 49 | } | ||
| 50 | |||
| 51 | inline void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) { | ||
| 52 | const float * xi = (const float *) cxi; | ||
| 53 | block_q4_0 * dsti = (block_q4_0 *) cdsti; | ||
| 54 | |||
| 55 | float amax = 0.0f; | ||
| 56 | float vmax = 0.0f; | ||
| 57 | |||
| 58 | for (int j = 0; j < QK4_0; ++j) { | ||
| 59 | const float v = xi[j]; | ||
| 60 | if (amax < sycl::fabs((float) v)) { | ||
| 61 | amax = sycl::fabs((float) v); | ||
| 62 | vmax = v; | ||
| 63 | } | ||
| 64 | } | ||
| 65 | |||
| 66 | const float d = vmax / -8; | ||
| 67 | const float id = d ? 1.0f / d : 0.0f; | ||
| 68 | |||
| 69 | dsti->d = d; | ||
| 70 | |||
| 71 | for (int j = 0; j < QK4_0 / 2; ++j) { | ||
| 72 | const float x0 = xi[0 + j] * id; | ||
| 73 | const float x1 = xi[QK4_0 / 2 + j] * id; | ||
| 74 | |||
| 75 | const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 8.5f)); | ||
| 76 | const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 8.5f)); | ||
| 77 | |||
| 78 | dsti->qs[j] = xi0; | ||
| 79 | dsti->qs[j] |= xi1 << 4; | ||
| 80 | } | ||
| 81 | } | ||
| 82 | |||
| 83 | inline void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) { | ||
| 84 | const float * xi = (const float *) cxi; | ||
| 85 | block_q4_1 * dsti = (block_q4_1 *) cdsti; | ||
| 86 | |||
| 87 | float vmin = FLT_MAX; | ||
| 88 | float vmax = -FLT_MAX; | ||
| 89 | |||
| 90 | for (int j = 0; j < QK4_1; ++j) { | ||
| 91 | const float v = xi[j]; | ||
| 92 | |||
| 93 | vmin = sycl::min(v, vmin); | ||
| 94 | vmax = sycl::max(v, vmax); | ||
| 95 | } | ||
| 96 | |||
| 97 | const float d = (vmax - vmin) / ((1 << 4) - 1); | ||
| 98 | const float id = d ? 1.0f / d : 0.0f; | ||
| 99 | |||
| 100 | dsti->dm.x() = d; | ||
| 101 | dsti->dm.y() = vmin; | ||
| 102 | |||
| 103 | for (int j = 0; j < QK4_1 / 2; ++j) { | ||
| 104 | const float x0 = (xi[0 + j] - vmin) * id; | ||
| 105 | const float x1 = (xi[QK4_1 / 2 + j] - vmin) * id; | ||
| 106 | |||
| 107 | const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 0.5f)); | ||
| 108 | const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 0.5f)); | ||
| 109 | |||
| 110 | dsti->qs[j] = xi0; | ||
| 111 | dsti->qs[j] |= xi1 << 4; | ||
| 112 | } | ||
| 113 | } | ||
| 114 | |||
| 115 | inline void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) { | ||
| 116 | const float * xi = (const float *) cxi; | ||
| 117 | block_q5_0 * dsti = (block_q5_0 *) cdsti; | ||
| 118 | |||
| 119 | float amax = 0.0f; | ||
| 120 | float vmax = 0.0f; | ||
| 121 | |||
| 122 | for (int j = 0; j < QK5_0; ++j) { | ||
| 123 | const float v = xi[j]; | ||
| 124 | if (amax < sycl::fabs((float) v)) { | ||
| 125 | amax = sycl::fabs((float) v); | ||
| 126 | vmax = v; | ||
| 127 | } | ||
| 128 | } | ||
| 129 | |||
| 130 | const float d = vmax / -16; | ||
| 131 | const float id = d ? 1.0f / d : 0.0f; | ||
| 132 | |||
| 133 | dsti->d = d; | ||
| 134 | |||
| 135 | uint32_t qh = 0; | ||
| 136 | for (int j = 0; j < QK5_0 / 2; ++j) { | ||
| 137 | const float x0 = xi[0 + j] * id; | ||
| 138 | const float x1 = xi[QK5_0 / 2 + j] * id; | ||
| 139 | |||
| 140 | const uint8_t xi0 = dpct::min(31, (int8_t) (x0 + 16.5f)); | ||
| 141 | const uint8_t xi1 = dpct::min(31, (int8_t) (x1 + 16.5f)); | ||
| 142 | |||
| 143 | dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4); | ||
| 144 | qh |= ((xi0 & 0x10u) >> 4) << (j + 0); | ||
| 145 | qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0 / 2); | ||
| 146 | } | ||
| 147 | memcpy(dsti->qh, &qh, sizeof(qh)); | ||
| 148 | } | ||
| 149 | |||
| 150 | inline void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) { | ||
| 151 | const float * xi = (const float *) cxi; | ||
| 152 | block_q5_1 * dsti = (block_q5_1 *) cdsti; | ||
| 153 | |||
| 154 | float min = xi[0]; | ||
| 155 | float max = xi[0]; | ||
| 156 | |||
| 157 | for (int j = 1; j < QK5_1; ++j) { | ||
| 158 | const float v = xi[j]; | ||
| 159 | min = v < min ? v : min; | ||
| 160 | max = v > max ? v : max; | ||
| 161 | } | ||
| 162 | |||
| 163 | const float d = (max - min) / 31; | ||
| 164 | const float id = d ? 1.0f / d : 0.0f; | ||
| 165 | |||
| 166 | dsti->dm.x() = d; | ||
| 167 | dsti->dm.y() = min; | ||
| 168 | |||
| 169 | uint32_t qh = 0; | ||
| 170 | for (int j = 0; j < QK5_1 / 2; ++j) { | ||
| 171 | const float x0 = (xi[0 + j] - min) * id; | ||
| 172 | const float x1 = (xi[QK5_1 / 2 + j] - min) * id; | ||
| 173 | |||
| 174 | const uint8_t xi0 = (uint8_t) (x0 + 0.5f); | ||
| 175 | const uint8_t xi1 = (uint8_t) (x1 + 0.5f); | ||
| 176 | |||
| 177 | dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4); | ||
| 178 | qh |= ((xi0 & 0x10u) >> 4) << (j + 0); | ||
| 179 | qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1 / 2); | ||
| 180 | } | ||
| 181 | memcpy(dsti->qh, &qh, sizeof(qh)); | ||
| 182 | } | ||
| 183 | |||
| 184 | inline void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) { | ||
| 185 | const float * xi = (const float *) cxi; | ||
| 186 | block_iq4_nl * dsti = (block_iq4_nl *) cdsti; | ||
| 187 | |||
| 188 | float amax = 0.0f; | ||
| 189 | float vmax = 0.0f; | ||
| 190 | |||
| 191 | for (int j = 0; j < QK4_NL; ++j) { | ||
| 192 | const float v = xi[j]; | ||
| 193 | if (amax < sycl::fabs((float) v)) { | ||
| 194 | amax = sycl::fabs((float) v); | ||
| 195 | vmax = v; | ||
| 196 | } | ||
| 197 | } | ||
| 198 | |||
| 199 | float d = vmax / kvalues_iq4nl[0]; | ||
| 200 | const float id = d ? 1.0f / d : 0.0f; | ||
| 201 | |||
| 202 | float sumqx = 0, sumq2 = 0; | ||
| 203 | for (int j = 0; j < QK4_NL / 2; ++j) { | ||
| 204 | const float x0 = xi[0 + j] * id; | ||
| 205 | const float x1 = xi[QK4_NL / 2 + j] * id; | ||
| 206 | const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0); | ||
| 207 | const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1); | ||
| 208 | dsti->qs[j] = xi0 | (xi1 << 4); | ||
| 209 | const float v0 = kvalues_iq4nl[xi0]; | ||
| 210 | const float v1 = kvalues_iq4nl[xi1]; | ||
| 211 | const float w0 = xi[0 + j] * xi[0 + j]; | ||
| 212 | const float w1 = xi[QK4_NL / 2 + j] * xi[QK4_NL / 2 + j]; | ||
| 213 | sumqx += w0 * v0 * xi[j] + w1 * v1 * xi[QK4_NL / 2 + j]; | ||
| 214 | sumq2 += w0 * v0 * v0 + w1 * v1 * v1; | ||
| 215 | } | ||
| 216 | |||
| 217 | dsti->d = sumq2 > 0 ? sumqx / sumq2 : d; | ||
| 218 | } | ||
| 219 | |||
| 220 | void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1); | ||
| 221 | void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst); | ||
| 222 | |||
| 223 | #endif // GGML_SYCL_CPY_HPP | ||
