| field | value | date |
|---|---|---|
| author | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
| committer | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
| commit | b333b06772c89d96aacb5490d6a219fba7c09cc6 (patch) | |
| tree | 211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/ggml/src/ggml-cpu/arch/s390 | |
| download | llmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz | |
Engage!
Diffstat (limited to 'llama.cpp/ggml/src/ggml-cpu/arch/s390')
| -rw-r--r-- | llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp | 50 |
| -rw-r--r-- | llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c | 1468 |
2 files changed, 1518 insertions, 0 deletions
diff --git a/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp b/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp
new file mode 100644
index 0000000..5f4405a
--- /dev/null
+++ b/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp
@@ -0,0 +1,50 @@
| 1 | #include "ggml-backend-impl.h" | ||
| 2 | |||
| 3 | #if defined(__s390x__) | ||
| 4 | #include <sys/auxv.h> | ||
| 5 | |||
| 6 | // find hwcap bits in asm/elf.h | ||
| 7 | #ifndef HWCAP_VXRS_EXT2 | ||
| 8 | #define HWCAP_VXRS_EXT2 (1 << 15) | ||
| 9 | #endif | ||
| 10 | |||
| 11 | #ifndef HWCAP_NNPA | ||
| 12 | #define HWCAP_NNPA (1 << 20) | ||
| 13 | #endif | ||
| 14 | |||
| 15 | struct s390x_features { | ||
| 16 | bool has_vxe2 = false; | ||
| 17 | bool has_nnpa = false; | ||
| 18 | |||
| 19 | s390x_features() { | ||
| 20 | uint32_t hwcap = getauxval(AT_HWCAP); | ||
| 21 | // NOTE: use hwcap2 with DFLT for z17 and later | ||
| 22 | // uint32_t hwcap2 = getauxval(AT_HWCAP2); | ||
| 23 | |||
| 24 | has_vxe2 = !!(hwcap & HWCAP_VXRS_EXT2); | ||
| 25 | has_nnpa = !!(hwcap & HWCAP_NNPA); | ||
| 26 | } | ||
| 27 | }; | ||
| 28 | |||
| 29 | static int ggml_backend_cpu_s390x_score() { | ||
| 30 | int score = 1; | ||
| 31 | s390x_features sf; | ||
| 32 | |||
| 33 | // IBM z15 / LinuxONE 3 | ||
| 34 | #ifdef GGML_USE_VXE2 | ||
| 35 | if (!sf.has_vxe2) { return 0; } | ||
| 36 | score += 1 << 1; | ||
| 37 | #endif | ||
| 38 | |||
| 39 | // IBM z16 / LinuxONE 4 and z17 / LinuxONE 5 | ||
| 40 | #ifdef GGML_USE_NNPA | ||
| 41 | if (!sf.has_nnpa) { return 0; } | ||
| 42 | score += 1 << 2; | ||
| 43 | #endif | ||
| 44 | |||
| 45 | return score; | ||
| 46 | } | ||
| 47 | |||
| 48 | GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_s390x_score) | ||
| 49 | |||
| 50 | #endif // __s390x__ | ||
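The score works by elimination and reward: a variant compiled with GGML_USE_VXE2 or GGML_USE_NNPA returns 0 (and is skipped) on machines that lack the corresponding facility, and otherwise outscores plainer builds, so the dynamic loader selects the most capable one. For illustration, a standalone probe of the same auxv bits could look like the sketch below (hypothetical demo program, not part of this patch):

```c
// probe_s390x.c: standalone sketch of the AT_HWCAP checks used above.
// Build on s390x with: gcc probe_s390x.c -o probe_s390x
#include <stdio.h>
#include <sys/auxv.h>

#ifndef HWCAP_VXRS_EXT2
#define HWCAP_VXRS_EXT2 (1 << 15)  // vector enhancements 2 (z15 and later)
#endif
#ifndef HWCAP_NNPA
#define HWCAP_NNPA (1 << 20)       // neural-network processing assist (z16 and later)
#endif

int main(void) {
    const unsigned long hwcap = getauxval(AT_HWCAP);
    printf("VXE2: %s\n", (hwcap & HWCAP_VXRS_EXT2) ? "yes" : "no");
    printf("NNPA: %s\n", (hwcap & HWCAP_NNPA)      ? "yes" : "no");
    return 0;
}
```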
diff --git a/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c b/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c
new file mode 100644
index 0000000..19d225a
--- /dev/null
+++ b/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c
@@ -0,0 +1,1468 @@
| 1 | #define GGML_COMMON_IMPL_C | ||
| 2 | #include "ggml-common.h" | ||
| 3 | #include "ggml-quants.h" | ||
| 4 | #include "ggml-impl.h" | ||
| 5 | #include "ggml-cpu.h" | ||
| 6 | #include "simd-mappings.h" | ||
| 7 | |||
| 8 | #include "../../quants.h" | ||
| 9 | #include "../../ggml-cpu-impl.h" | ||
| 10 | |||
| 11 | #include <math.h> | ||
| 12 | #include <string.h> | ||
| 13 | #include <assert.h> | ||
| 14 | #include <float.h> | ||
| 15 | #include <stdlib.h> // for qsort | ||
| 16 | #include <stdio.h> // for GGML_ASSERT | ||
| 17 | |||
| 18 | #define GROUP_MAX_EPS 1e-15f | ||
| 19 | #define GROUP_MAX_EPS_IQ3_XXS 1e-8f | ||
| 20 | #define GROUP_MAX_EPS_IQ2_S 1e-8f | ||
| 21 | #define GROUP_MAX_EPS_IQ1_M 1e-7f | ||
| 22 | #define GROUP_MAX_EPS_IQ1_S 1e-12f | ||
| 23 | |||
| 24 | #define UNUSED GGML_UNUSED | ||
| 25 | |||
| 26 | #if defined(__VXE__) || defined(__VXE2__) | ||
| 27 | #define B1(c,s,n) 0x ## n ## c , 0x ## n ## s | ||
| 28 | #define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s) | ||
| 29 | #define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s) | ||
| 30 | #define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s) | ||
| 31 | #define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s) | ||
| 32 | #define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s) | ||
| 33 | #define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s) | ||
| 34 | #define B8(c,s ) B7(c,s, c), B7(c,s, s) | ||
| 35 | |||
| 36 | // precomputed tables for expanding 8bits to 8 bytes: | ||
| 37 | static const __attribute__((aligned(16))) uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b ) << 4 | ||
| 38 | static const __attribute__((aligned(16))) uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 | ||
| 39 | |||
| 40 | // permute mask for byteswapping | ||
| 41 | static const uint8x16_t v_kperm = (const uint8x16_t){ | ||
| 42 | 7, 6, 5, 4, 3, 2, 1, 0, | ||
| 43 | 15, 14, 13, 12, 11, 10, 9, 8 | ||
| 44 | }; | ||
| 45 | #endif | ||
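Each table_b2b entry expands the 8 bits of its index into 8 bytes, one 0x10/0x00 flag per bit: table_b2b_0 marks set bits ((b) << 4), table_b2b_1 marks clear bits ((!b) << 4). A runtime sketch of one table_b2b_1 entry, with illustrative byte order (the v_kperm permute above is what fixes placement on big-endian s390x):

```c
#include <stdint.h>

// Runtime equivalent of one table_b2b_1 entry: byte k holds 0x10 when
// bit k of x is clear, 0x00 when it is set. Byte order is illustrative;
// the kernels below re-permute the loaded bytes with v_kperm.
static uint64_t b2b_1_sketch(uint8_t x) {
    uint64_t r = 0;
    for (int k = 0; k < 8; ++k) {
        if (!((x >> k) & 1)) {
            r |= (uint64_t)0x10 << (8 * k);  // (!b) << 4 in byte k
        }
    }
    return r;
}
```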
| 46 | |||
| 47 | void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { | ||
| 48 | assert(QK8_0 == 32); | ||
| 49 | assert(k % QK8_0 == 0); | ||
| 50 | const int nb = k / QK8_0; | ||
| 51 | |||
| 52 | block_q8_0 * GGML_RESTRICT y = vy; | ||
| 53 | |||
| 54 | #if defined(__VXE__) || defined(__VXE2__) | ||
| 55 | for (int i = 0; i < nb; i++) { | ||
| 56 | float32x4_t srcv [8]; | ||
| 57 | float32x4_t asrcv[8]; | ||
| 58 | float32x4_t amaxv[8]; | ||
| 59 | |||
| 60 | for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j); | ||
| 61 | for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]); | ||
| 62 | for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]); | ||
| 63 | for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]); | ||
| 64 | for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]); | ||
| 65 | |||
| 66 | const float amax = MAX(MAX(vec_extract(amaxv[0], 0), | ||
| 67 | vec_extract(amaxv[0], 1)), | ||
| 68 | MAX(vec_extract(amaxv[0], 2), | ||
| 69 | vec_extract(amaxv[0], 3))); | ||
| 70 | |||
| 71 | const float d = amax / ((1 << 7) - 1); | ||
| 72 | const float id = d ? 1.0f / d : 0.0f; | ||
| 73 | |||
| 74 | y[i].d = GGML_CPU_FP32_TO_FP16(d); | ||
| 75 | |||
| 76 | for (int j = 0; j < 8; j++) { | ||
| 77 | const float32x4_t v = vec_mul(srcv[j], vec_splats(id)); | ||
| 78 | /* Uses non-default rounding for vec_signed or vec_round */ | ||
| 79 | const int32x4_t vi = vec_signed(__builtin_s390_vfisb(v, 4, 1)); | ||
| 80 | |||
| 81 | y[i].qs[4*j + 0] = vec_extract(vi, 0); | ||
| 82 | y[i].qs[4*j + 1] = vec_extract(vi, 1); | ||
| 83 | y[i].qs[4*j + 2] = vec_extract(vi, 2); | ||
| 84 | y[i].qs[4*j + 3] = vec_extract(vi, 3); | ||
| 85 | } | ||
| 86 | } | ||
| 87 | #else | ||
| 88 | GGML_UNUSED(nb); | ||
| 89 | // scalar | ||
| 90 | quantize_row_q8_0_ref(x, y, k); | ||
| 91 | #endif | ||
| 92 | } | ||
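Per 32-element block this computes a scale d = max|x| / 127 and rounds each x[j] / d to the nearest int8. A scalar sketch of the same block, assuming the vfisb/vec_signed pair implements round-to-nearest (the helper name is hypothetical):

```c
#include <math.h>
#include <stdint.h>

// Scalar sketch of one q8_0 block (QK8_0 == 32); hypothetical helper.
static void quantize_block_q8_0_sketch(const float *x, int8_t *q, float *d_out) {
    float amax = 0.0f;
    for (int j = 0; j < 32; ++j) amax = fmaxf(amax, fabsf(x[j]));
    const float d  = amax / 127.0f;              // 127 == (1 << 7) - 1
    const float id = d > 0.0f ? 1.0f / d : 0.0f;
    for (int j = 0; j < 32; ++j) q[j] = (int8_t)roundf(x[j] * id);
    *d_out = d;                                  // stored as FP16 in block_q8_0
}
```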
| 93 | |||
| 94 | void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { | ||
| 95 | assert(k % QK8_1 == 0); | ||
| 96 | const int nb = k / QK8_1; | ||
| 97 | |||
| 98 | block_q8_1 * GGML_RESTRICT y = vy; | ||
| 99 | |||
| 100 | #if defined(__VXE__) || defined(__VXE2__) | ||
| 101 | for (int i = 0; i < nb; i++) { | ||
| 102 | float32x4_t srcv [8]; | ||
| 103 | float32x4_t asrcv[8]; | ||
| 104 | float32x4_t amaxv[8]; | ||
| 105 | |||
| 106 | for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j); | ||
| 107 | for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]); | ||
| 108 | for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]); | ||
| 109 | for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]); | ||
| 110 | for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]); | ||
| 111 | |||
| 112 | const float amax = MAX(MAX(vec_extract(amaxv[0], 0), | ||
| 113 | vec_extract(amaxv[0], 1)), | ||
| 114 | MAX(vec_extract(amaxv[0], 2), | ||
| 115 | vec_extract(amaxv[0], 3))); | ||
| 116 | |||
| 117 | const float d = amax / ((1 << 7) - 1); | ||
| 118 | const float id = d ? 1.0f / d : 0.0f; | ||
| 119 | |||
| 120 | y[i].d = GGML_CPU_FP32_TO_FP16(d); | ||
| 121 | |||
| 122 | int32x4_t acc = vec_splats(0); | ||
| 123 | |||
| 124 | for (int j = 0; j < 8; j++) { | ||
| 125 | const float32x4_t v = vec_mul(srcv[j], vec_splats(id)); | ||
| 126 | /* Uses non-default rounding for vec_signed or vec_round */ | ||
| 127 | const int32x4_t vi = vec_signed(__builtin_s390_vfisb(v, 4, 1)); | ||
| 128 | |||
| 129 | y[i].qs[4*j + 0] = vec_extract(vi, 0); | ||
| 130 | y[i].qs[4*j + 1] = vec_extract(vi, 1); | ||
| 131 | y[i].qs[4*j + 2] = vec_extract(vi, 2); | ||
| 132 | y[i].qs[4*j + 3] = vec_extract(vi, 3); | ||
| 133 | |||
| 134 | acc = vec_add(acc, vi); | ||
| 135 | } | ||
| 136 | |||
| 137 | y[i].s = GGML_CPU_FP32_TO_FP16(d * (acc[0] + acc[1] + acc[2] + acc[3])); | ||
| 138 | } | ||
| 139 | #else | ||
| 140 | GGML_UNUSED(nb); | ||
| 141 | // scalar | ||
| 142 | quantize_row_q8_1_ref(x, y, k); | ||
| 143 | #endif | ||
| 144 | } | ||
| 145 | |||
| 146 | |||
| 147 | //===================================== Dot products ================================= | ||
| 148 | |||
| 149 | void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { | ||
| 150 | const int qk = QK8_0; | ||
| 151 | const int nb = n / qk; | ||
| 152 | |||
| 153 | assert(n % qk == 0); | ||
| 154 | assert(nrc == 1); | ||
| 155 | UNUSED(nrc); | ||
| 156 | UNUSED(bx); | ||
| 157 | UNUSED(by); | ||
| 158 | UNUSED(bs); | ||
| 159 | |||
| 160 | const block_q4_0 * GGML_RESTRICT x = vx; | ||
| 161 | const block_q8_0 * GGML_RESTRICT y = vy; | ||
| 162 | |||
| 163 | int ib = 0; | ||
| 164 | float sumf = 0; | ||
| 165 | |||
| 166 | #if defined(__VXE__) || defined(__VXE2__) | ||
| 167 | float32x4_t acc = vec_splats(0.0f); | ||
| 168 | |||
| 169 | const uint8x16_t v_m = vec_splats((const uint8_t)0x0F); | ||
| 170 | const int8x16_t v_s = vec_splats( (const int8_t)0x08); | ||
| 171 | |||
| 172 | for (; ib < nb; ++ib) { | ||
| 173 | const uint8x16_t v_x = vec_xl(0, x[ib].qs); | ||
| 174 | const int8x16_t v_xl = (const int8x16_t)(v_x & v_m); | ||
| 175 | const int8x16_t v_xh = (const int8x16_t)(v_x >> 4); | ||
| 176 | |||
| 177 | const int8x16_t v_xls = vec_sub(v_xl, v_s); | ||
| 178 | const int8x16_t v_xhs = vec_sub(v_xh, v_s); | ||
| 179 | |||
| 180 | const int8x16_t v_yl = vec_xl(0 , y[ib].qs); | ||
| 181 | const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs); | ||
| 182 | |||
| 183 | const int16x8_t v_xylso = vec_mulo(v_xls, v_yl); | ||
| 184 | const int16x8_t v_xylse = vec_mule(v_xls, v_yl); | ||
| 185 | const int16x8_t v_xyhso = vec_mulo(v_xhs, v_yh); | ||
| 186 | const int16x8_t v_xyhse = vec_mule(v_xhs, v_yh); | ||
| 187 | |||
| 188 | int16x8_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_); | ||
| 189 | |||
| 190 | const float32x4_t v_xy = vec_float(vec_unpackh(v_xy_)); | ||
| 191 | const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); | ||
| 192 | |||
| 193 | acc = vec_madd(v_xy, v_d, acc); | ||
| 194 | } | ||
| 195 | |||
| 196 | sumf = vec_hsum_f32x4(acc); | ||
| 197 | *s = sumf; | ||
| 198 | #else | ||
| 199 | UNUSED(nb); | ||
| 200 | UNUSED(x); | ||
| 201 | UNUSED(y); | ||
| 202 | UNUSED(ib); | ||
| 203 | UNUSED(sumf); | ||
| 204 | ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); | ||
| 205 | #endif | ||
| 206 | } | ||
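In scalar terms: each q4_0 byte packs two quants (low nibbles are elements 0..15, high nibbles elements 16..31), stored with a +8 bias that the vec_sub(..., v_s) above removes before the widening multiplies. A sketch of one block, with a hypothetical helper name; dx and dy stand for the two FP16 block scales converted to float:

```c
#include <stdint.h>

// Scalar sketch of one q4_0 x q8_0 block; hypothetical helper.
static float dot_q4_0_q8_0_sketch(const uint8_t xqs[16], float dx,
                                  const int8_t yqs[32], float dy) {
    int sumi = 0;
    for (int j = 0; j < 16; ++j) {
        const int x0 = (xqs[j] & 0x0F) - 8;  // low nibble, bias removed
        const int x1 = (xqs[j] >>   4) - 8;  // high nibble, bias removed
        sumi += x0 * yqs[j] + x1 * yqs[j + 16];
    }
    return dx * dy * (float)sumi;
}
```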
| 207 | |||
| 208 | void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { | ||
| 209 | const int qk = QK8_1; | ||
| 210 | const int nb = n / qk; | ||
| 211 | |||
| 212 | assert(n % qk == 0); | ||
| 213 | assert(nrc == 1); | ||
| 214 | UNUSED(nrc); | ||
| 215 | UNUSED(bx); | ||
| 216 | UNUSED(by); | ||
| 217 | UNUSED(bs); | ||
| 218 | |||
| 219 | const block_q4_1 * GGML_RESTRICT x = vx; | ||
| 220 | const block_q8_1 * GGML_RESTRICT y = vy; | ||
| 221 | |||
| 222 | int ib = 0; | ||
| 223 | float sumf = 0; | ||
| 224 | |||
| 225 | #if defined(__VXE__) || defined(__VXE2__) | ||
| 226 | float summs = 0; | ||
| 227 | float32x4_t acc = vec_splats(0.0f); | ||
| 228 | |||
| 229 | const uint8x16_t v_m = vec_splat_u8(0x0F); | ||
| 230 | |||
| 231 | #pragma GCC unroll 4 | ||
| 232 | for (; ib < nb; ++ib) { | ||
| 233 | __builtin_prefetch(x[ib].qs, 0, 1); | ||
| 234 | __builtin_prefetch(y[ib].qs, 0, 1); | ||
| 235 | |||
| 236 | summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); | ||
| 237 | |||
| 238 | const uint8x16_t v_x = vec_xl(0, x[ib].qs); | ||
| 239 | const int8x16_t v_xl = (const int8x16_t)(v_x & v_m); | ||
| 240 | const int8x16_t v_xh = (const int8x16_t)(v_x >> 4); | ||
| 241 | |||
| 242 | const int8x16_t v_yl = vec_xl(0 , y[ib].qs); | ||
| 243 | const int8x16_t v_yh = vec_xl(QK8_1/2, y[ib].qs); | ||
| 244 | |||
| 245 | const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh); | ||
| 246 | const float32x4_t v_xy = vec_float(v_xy_); | ||
| 247 | |||
| 248 | const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); | ||
| 249 | |||
| 250 | acc = vec_madd(v_xy, v_d, acc); | ||
| 251 | } | ||
| 252 | |||
| 253 | sumf = vec_hsum_f32x4(acc) + summs; | ||
| 254 | *s = sumf; | ||
| 255 | #else | ||
| 256 | UNUSED(nb); | ||
| 257 | UNUSED(x); | ||
| 258 | UNUSED(y); | ||
| 259 | UNUSED(ib); | ||
| 260 | UNUSED(sumf); | ||
| 261 | ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc); | ||
| 262 | #endif | ||
| 263 | } | ||
| 264 | |||
| 265 | void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { | ||
| 266 | assert(nrc == 1); | ||
| 267 | UNUSED(nrc); | ||
| 268 | UNUSED(bx); | ||
| 269 | UNUSED(by); | ||
| 270 | UNUSED(bs); | ||
| 271 | assert(n % QK_MXFP4 == 0); | ||
| 272 | static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same"); | ||
| 273 | |||
| 274 | const int qk = QK_MXFP4; | ||
| 275 | const int nb = n / qk; | ||
| 276 | |||
| 277 | const block_mxfp4 * GGML_RESTRICT x = vx; | ||
| 278 | const block_q8_0 * GGML_RESTRICT y = vy; | ||
| 279 | |||
| 280 | int ib = 0; | ||
| 281 | float sumf = 0.0f; | ||
| 282 | |||
| 283 | #if defined(__VXE__) || defined(__VXE2__) | ||
| 284 | const int8x16_t v_k = vec_xl(0, kvalues_mxfp4); | ||
| 285 | const uint8x16_t v_m = vec_splats((const uint8_t)0x0F); | ||
| 286 | |||
| 287 | float32x4_t v_acc = vec_splats(0.0f); | ||
| 288 | |||
| 289 | #pragma GCC unroll 8 | ||
| 290 | for (; ib + 1 < nb; ib += 2) { | ||
| 291 | const block_mxfp4 * GGML_RESTRICT x0 = &x[ib + 0]; | ||
| 292 | const block_mxfp4 * GGML_RESTRICT x1 = &x[ib + 1]; | ||
| 293 | const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; | ||
| 294 | const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; | ||
| 295 | |||
| 296 | const uint8x16_t v_x0 = vec_xl(0, x0->qs); | ||
| 297 | const uint8x16_t v_x1 = vec_xl(0, x1->qs); | ||
| 298 | |||
| 299 | int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m); | ||
| 300 | int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4); | ||
| 301 | int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m); | ||
| 302 | int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4); | ||
| 303 | |||
| 304 | v_x0l = vec_perm(v_k, v_k, (uchar8x16_t)v_x0l); | ||
| 305 | v_x0h = vec_perm(v_k, v_k, (uchar8x16_t)v_x0h); | ||
| 306 | v_x1l = vec_perm(v_k, v_k, (uchar8x16_t)v_x1l); | ||
| 307 | v_x1h = vec_perm(v_k, v_k, (uchar8x16_t)v_x1h); | ||
| 308 | |||
| 309 | const int8x16_t v_y0l = vec_xl(0, y0->qs); | ||
| 310 | const int8x16_t v_y0h = vec_xl(QK8_0/2, y0->qs); | ||
| 311 | const int8x16_t v_y1l = vec_xl(0, y1->qs); | ||
| 312 | const int8x16_t v_y1h = vec_xl(QK8_0/2, y1->qs); | ||
| 313 | |||
| 314 | const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0l, v_y0l), v_x0h, v_y0h); | ||
| 315 | const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1l, v_y1l), v_x1h, v_y1h); | ||
| 316 | |||
| 317 | const float32x4_t v_xy0f = vec_float(v_xy0); | ||
| 318 | const float32x4_t v_xy1f = vec_float(v_xy1); | ||
| 319 | |||
| 320 | const float32x4_t v_d0 = vec_splats(GGML_E8M0_TO_FP32_HALF(x0->e) * GGML_CPU_FP16_TO_FP32(y0->d)); | ||
| 321 | const float32x4_t v_d1 = vec_splats(GGML_E8M0_TO_FP32_HALF(x1->e) * GGML_CPU_FP16_TO_FP32(y1->d)); | ||
| 322 | |||
| 323 | v_acc = vec_madd(v_xy0f, v_d0, v_acc); | ||
| 324 | v_acc = vec_madd(v_xy1f, v_d1, v_acc); | ||
| 325 | } | ||
| 326 | |||
| 327 | for (; ib < nb; ++ib) { | ||
| 328 | const block_mxfp4 * GGML_RESTRICT x0 = &x[ib + 0]; | ||
| 329 | const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; | ||
| 330 | |||
| 331 | const uint8x16_t v_x = vec_xl(0, x0->qs); | ||
| 332 | |||
| 333 | int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m); | ||
| 334 | int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4); | ||
| 335 | |||
| 336 | v_xl = vec_perm(v_k, v_k, (uchar8x16_t)v_xl); | ||
| 337 | v_xh = vec_perm(v_k, v_k, (uchar8x16_t)v_xh); | ||
| 338 | |||
| 339 | const int8x16_t v_yl = vec_xl(0, y0->qs); | ||
| 340 | const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs); | ||
| 341 | |||
| 342 | const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh); | ||
| 343 | const float32x4_t v_xyf = vec_float(v_xy); | ||
| 344 | |||
| 345 | const float32x4_t v_d = vec_splats(GGML_E8M0_TO_FP32_HALF(x0->e) * GGML_CPU_FP16_TO_FP32(y0->d)); | ||
| 346 | v_acc = vec_madd(v_xyf, v_d, v_acc); | ||
| 347 | } | ||
| 348 | |||
| 349 | sumf = vec_hsum_f32x4(v_acc); | ||
| 350 | *s = sumf; | ||
| 351 | #else | ||
| 352 | UNUSED(x); | ||
| 353 | UNUSED(y); | ||
| 354 | UNUSED(ib); | ||
| 355 | UNUSED(sumf); | ||
| 356 | ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); | ||
| 357 | #endif | ||
| 358 | } | ||
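The scale handling here relies on two conventions: kvalues_mxfp4 stores the FP4 code points doubled (0.5 becomes 1, 6.0 becomes 12) so the vec_perm lookup and int8 dot product stay integral, and GGML_E8M0_TO_FP32_HALF hands back half of the E8M0 power of two to undo that doubling. A sketch under that assumption:

```c
#include <math.h>
#include <stdint.h>

// Sketch of the per-block scale used above, assuming the macro evaluates
// to 2^(e - 127) / 2, which compensates for the doubled kvalues_mxfp4 table.
static float e8m0_to_fp32_half_sketch(uint8_t e) {
    return ldexpf(1.0f, (int)e - 128);  // 2^(e - 127) / 2
}
```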
| 359 | |||
| 360 | void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { | ||
| 361 | const int qk = QK8_0; | ||
| 362 | const int nb = n / qk; | ||
| 363 | |||
| 364 | assert(n % qk == 0); | ||
| 365 | assert(qk == QK5_0); | ||
| 366 | assert(nrc == 1); | ||
| 367 | UNUSED(nrc); | ||
| 368 | UNUSED(bx); | ||
| 369 | UNUSED(by); | ||
| 370 | UNUSED(bs); | ||
| 371 | |||
| 372 | const block_q5_0 * GGML_RESTRICT x = vx; | ||
| 373 | const block_q8_0 * GGML_RESTRICT y = vy; | ||
| 374 | |||
| 375 | int ib = 0; | ||
| 376 | float sumf = 0.0f; | ||
| 377 | |||
| 378 | #if defined(__VXE__) || defined(__VXE2__) | ||
| 379 | float32x4_t v_sum0 = vec_splats(0.0f); | ||
| 380 | float32x4_t v_sum1 = vec_splats(0.0f); | ||
| 381 | |||
| 382 | uint32_t qh0, qh1; | ||
| 383 | uint64_t tmp0[4], tmp1[4]; | ||
| 384 | |||
| 385 | const uint8x16_t v_m = vec_splats((uint8_t)0x0F); | ||
| 386 | |||
| 387 | #pragma GCC unroll 4 | ||
| 388 | for (; ib + 1 < nb; ib += 2) { | ||
| 389 | const block_q5_0 * GGML_RESTRICT x0 = &x[ib + 0]; | ||
| 390 | const block_q5_0 * GGML_RESTRICT x1 = &x[ib + 1]; | ||
| 391 | const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; | ||
| 392 | const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; | ||
| 393 | |||
| 394 | memcpy(&qh0, x0->qh, sizeof(qh0)); | ||
| 395 | memcpy(&qh1, x1->qh, sizeof(qh1)); | ||
| 396 | |||
| 397 | tmp0[0] = table_b2b_1[(qh0 >> 0) & 0xFF]; | ||
| 398 | tmp0[1] = table_b2b_1[(qh0 >> 8) & 0xFF]; | ||
| 399 | tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF]; | ||
| 400 | tmp0[3] = table_b2b_1[(qh0 >> 24) ]; | ||
| 401 | |||
| 402 | tmp1[0] = table_b2b_1[(qh1 >> 0) & 0xFF]; | ||
| 403 | tmp1[1] = table_b2b_1[(qh1 >> 8) & 0xFF]; | ||
| 404 | tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF]; | ||
| 405 | tmp1[3] = table_b2b_1[(qh1 >> 24) ]; | ||
| 406 | |||
| 407 | int8x16_t v_qh0l = vec_xl(0, (const int8_t *)(tmp0 + 0)); | ||
| 408 | int8x16_t v_qh0h = vec_xl(0, (const int8_t *)(tmp0 + 2)); | ||
| 409 | int8x16_t v_qh1l = vec_xl(0, (const int8_t *)(tmp1 + 0)); | ||
| 410 | int8x16_t v_qh1h = vec_xl(0, (const int8_t *)(tmp1 + 2)); | ||
| 411 | |||
| 412 | // required for fixing the byteorder | ||
| 413 | v_qh0l = vec_perm(v_qh0l, v_qh0l, v_kperm); | ||
| 414 | v_qh0h = vec_perm(v_qh0h, v_qh0h, v_kperm); | ||
| 415 | v_qh1l = vec_perm(v_qh1l, v_qh1l, v_kperm); | ||
| 416 | v_qh1h = vec_perm(v_qh1h, v_qh1h, v_kperm); | ||
| 417 | |||
| 418 | const uint8x16_t v_x0 = vec_xl(0, (const uint8_t *)x0->qs); | ||
| 419 | const uint8x16_t v_x1 = vec_xl(0, (const uint8_t *)x1->qs); | ||
| 420 | |||
| 421 | int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m); | ||
| 422 | int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4); | ||
| 423 | int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m); | ||
| 424 | int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4); | ||
| 425 | |||
| 426 | const int8x16_t v_x0lf = vec_sub(v_x0l, v_qh0l); | ||
| 427 | const int8x16_t v_x0hf = vec_sub(v_x0h, v_qh0h); | ||
| 428 | const int8x16_t v_x1lf = vec_sub(v_x1l, v_qh1l); | ||
| 429 | const int8x16_t v_x1hf = vec_sub(v_x1h, v_qh1h); | ||
| 430 | |||
| 431 | const int8x16_t v_y0l = vec_xl(0, (const int8_t *)y0->qs); | ||
| 432 | const int8x16_t v_y0h = vec_xl(QK8_0/2, (const int8_t *)y0->qs); | ||
| 433 | const int8x16_t v_y1l = vec_xl(0, (const int8_t *)y1->qs); | ||
| 434 | const int8x16_t v_y1h = vec_xl(QK8_0/2, (const int8_t *)y1->qs); | ||
| 435 | |||
| 436 | const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h); | ||
| 437 | const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h); | ||
| 438 | |||
| 439 | const float32x4_t v_xy0f = vec_float(v_xy0); | ||
| 440 | const float32x4_t v_xy1f = vec_float(v_xy1); | ||
| 441 | |||
| 442 | const float32x4_t v_d0 = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)); | ||
| 443 | const float32x4_t v_d1 = vec_splats(GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d)); | ||
| 444 | |||
| 445 | v_sum0 = vec_madd(v_xy0f, v_d0, v_sum0); | ||
| 446 | v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1); | ||
| 447 | } | ||
| 448 | |||
| 449 | sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1); | ||
| 450 | |||
| 451 | #pragma GCC unroll 4 | ||
| 452 | for (; ib < nb; ++ib) { | ||
| 453 | const block_q5_0 * GGML_RESTRICT x0 = &x[ib]; | ||
| 454 | const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; | ||
| 455 | |||
| 456 | uint32_t qh; | ||
| 457 | memcpy(&qh, x0->qh, sizeof(qh)); | ||
| 458 | |||
| 459 | uint64_t tmp[4]; | ||
| 460 | tmp[0] = table_b2b_1[(qh >> 0) & 0xFF]; | ||
| 461 | tmp[1] = table_b2b_1[(qh >> 8) & 0xFF]; | ||
| 462 | tmp[2] = table_b2b_1[(qh >> 16) & 0xFF]; | ||
| 463 | tmp[3] = table_b2b_1[(qh >> 24) ]; | ||
| 464 | |||
| 465 | int8x16_t v_qhl = vec_xl(0, (const int8_t *)(tmp + 0)); | ||
| 466 | int8x16_t v_qhh = vec_xl(0, (const int8_t *)(tmp + 2)); | ||
| 467 | |||
| 468 | // required for fixing the byteorder | ||
| 469 | v_qhl = vec_perm(v_qhl, v_qhl, v_kperm); | ||
| 470 | v_qhh = vec_perm(v_qhh, v_qhh, v_kperm); | ||
| 471 | |||
| 472 | const uint8x16_t v_x = vec_xl(0, (const uint8_t *)x0->qs); | ||
| 473 | int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m); | ||
| 474 | int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4); | ||
| 475 | |||
| 476 | const int8x16_t v_xlf = vec_sub(v_xl, v_qhl); | ||
| 477 | const int8x16_t v_xhf = vec_sub(v_xh, v_qhh); | ||
| 478 | |||
| 479 | const int8x16_t v_yl = vec_xl(0, (const int8_t *)y0->qs); | ||
| 480 | const int8x16_t v_yh = vec_xl(QK8_0/2, (const int8_t *)y0->qs); | ||
| 481 | |||
| 482 | const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh); | ||
| 483 | const float32x4_t v_xyf = vec_float(v_xy); | ||
| 484 | |||
| 485 | const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)); | ||
| 486 | const float32x4_t v_acc = vec_madd(v_xyf, v_d, vec_splats(0.0f)); | ||
| 487 | |||
| 488 | sumf += vec_hsum_f32x4(v_acc); | ||
| 489 | } | ||
| 490 | |||
| 491 | *s = sumf; | ||
| 492 | #else | ||
| 493 | UNUSED(nb); | ||
| 494 | UNUSED(x); | ||
| 495 | UNUSED(y); | ||
| 496 | UNUSED(ib); | ||
| 497 | UNUSED(sumf); | ||
| 498 | ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); | ||
| 499 | #endif | ||
| 500 | } | ||
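The table_b2b_1 expansion gives q5_0 reconstruction a compact form: the true value is (nibble | hbit << 4) - 16, and subtracting 0x10 exactly where the high bit is clear computes the same thing, since nibble - 16*(1 - hbit) = (nibble + 16*hbit) - 16. Per element (hypothetical helper):

```c
#include <stdint.h>

// Per-element sketch of the q5_0 trick above: identical to
// nibble - 16 * (1 - hbit), which is what vec_sub(v_xl, v_qhl)
// computes after the table_b2b_1 expansion.
static int8_t q5_0_value_sketch(uint8_t nibble, int hbit) {
    return (int8_t)((nibble | (hbit << 4)) - 16);
}
```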
| 501 | |||
| 502 | void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { | ||
| 503 | const int qk = QK8_1; | ||
| 504 | const int nb = n / qk; | ||
| 505 | |||
| 506 | assert(n % qk == 0); | ||
| 507 | assert(qk == QK5_1); | ||
| 508 | assert(nrc == 1); | ||
| 509 | UNUSED(nrc); | ||
| 510 | UNUSED(bx); | ||
| 511 | UNUSED(by); | ||
| 512 | UNUSED(bs); | ||
| 513 | |||
| 514 | const block_q5_1 * GGML_RESTRICT x = vx; | ||
| 515 | const block_q8_1 * GGML_RESTRICT y = vy; | ||
| 516 | |||
| 517 | int ib = 0; | ||
| 518 | float sumf = 0.0f; | ||
| 519 | |||
| 520 | #if defined(__VXE__) || defined(__VXE2__) | ||
| 521 | float32x4_t v_sum0 = vec_splats(0.0f); | ||
| 522 | float32x4_t v_sum1 = vec_splats(0.0f); | ||
| 523 | |||
| 524 | float summs0 = 0.0f; | ||
| 525 | float summs1 = 0.0f; | ||
| 526 | |||
| 527 | uint32_t qh0; | ||
| 528 | uint32_t qh1; | ||
| 529 | |||
| 530 | uint64_t tmp0[4]; | ||
| 531 | uint64_t tmp1[4]; | ||
| 532 | |||
| 533 | const uint8x16_t v_m = vec_splats((uint8_t)0x0F); | ||
| 534 | |||
| 535 | #pragma GCC unroll 4 | ||
| 536 | for (; ib + 1 < nb; ib += 2) { | ||
| 537 | const block_q5_1 * GGML_RESTRICT x0 = &x[ib + 0]; | ||
| 538 | const block_q5_1 * GGML_RESTRICT x1 = &x[ib + 1]; | ||
| 539 | const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0]; | ||
| 540 | const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1]; | ||
| 541 | |||
| 542 | summs0 += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s); | ||
| 543 | summs1 += GGML_CPU_FP16_TO_FP32(x1->m) * GGML_CPU_FP16_TO_FP32(y1->s); | ||
| 544 | |||
| 545 | memcpy(&qh0, x0->qh, sizeof(qh0)); | ||
| 546 | memcpy(&qh1, x1->qh, sizeof(qh1)); | ||
| 547 | |||
| 548 | tmp0[0] = table_b2b_0[(qh0 >> 0) & 0xFF]; | ||
| 549 | tmp0[1] = table_b2b_0[(qh0 >> 8) & 0xFF]; | ||
| 550 | tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF]; | ||
| 551 | tmp0[3] = table_b2b_0[(qh0 >> 24) ]; | ||
| 552 | |||
| 553 | tmp1[0] = table_b2b_0[(qh1 >> 0) & 0xFF]; | ||
| 554 | tmp1[1] = table_b2b_0[(qh1 >> 8) & 0xFF]; | ||
| 555 | tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF]; | ||
| 556 | tmp1[3] = table_b2b_0[(qh1 >> 24) ]; | ||
| 557 | |||
| 558 | int8x16_t v_qh0l = vec_xl(0, (const int8_t *)(tmp0 + 0)); | ||
| 559 | int8x16_t v_qh0h = vec_xl(0, (const int8_t *)(tmp0 + 2)); | ||
| 560 | int8x16_t v_qh1l = vec_xl(0, (const int8_t *)(tmp1 + 0)); | ||
| 561 | int8x16_t v_qh1h = vec_xl(0, (const int8_t *)(tmp1 + 2)); | ||
| 562 | |||
| 563 | // required for fixing the byteorder | ||
| 564 | v_qh0l = vec_perm(v_qh0l, v_qh0l, v_kperm); | ||
| 565 | v_qh0h = vec_perm(v_qh0h, v_qh0h, v_kperm); | ||
| 566 | v_qh1l = vec_perm(v_qh1l, v_qh1l, v_kperm); | ||
| 567 | v_qh1h = vec_perm(v_qh1h, v_qh1h, v_kperm); | ||
| 568 | |||
| 569 | const uint8x16_t v_x0 = vec_xl(0, x0->qs); | ||
| 570 | const uint8x16_t v_x1 = vec_xl(0, x1->qs); | ||
| 571 | |||
| 572 | const int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m); | ||
| 573 | const int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4); | ||
| 574 | const int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m); | ||
| 575 | const int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4); | ||
| 576 | |||
| 577 | const int8x16_t v_x0lf = vec_or(v_x0l, v_qh0l); | ||
| 578 | const int8x16_t v_x0hf = vec_or(v_x0h, v_qh0h); | ||
| 579 | const int8x16_t v_x1lf = vec_or(v_x1l, v_qh1l); | ||
| 580 | const int8x16_t v_x1hf = vec_or(v_x1h, v_qh1h); | ||
| 581 | |||
| 582 | const int8x16_t v_y0l = vec_xl(0 , y0->qs); | ||
| 583 | const int8x16_t v_y0h = vec_xl(QK8_1/2, y0->qs); | ||
| 584 | const int8x16_t v_y1l = vec_xl(0 , y1->qs); | ||
| 585 | const int8x16_t v_y1h = vec_xl(QK8_1/2, y1->qs); | ||
| 586 | |||
| 587 | const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h); | ||
| 588 | const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h); | ||
| 589 | |||
| 590 | const float32x4_t v_xy0f = vec_float(v_xy0); | ||
| 591 | const float32x4_t v_xy1f = vec_float(v_xy1); | ||
| 592 | |||
| 593 | const float32x4_t v_d0 = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)); | ||
| 594 | const float32x4_t v_d1 = vec_splats(GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d)); | ||
| 595 | |||
| 596 | v_sum0 = vec_madd(v_xy0f, v_d0, v_sum0); | ||
| 597 | v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1); | ||
| 598 | } | ||
| 599 | |||
| 600 | sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1) + summs0 + summs1; | ||
| 601 | |||
| 602 | #pragma GCC unroll 4 | ||
| 603 | for (; ib < nb; ++ib) { | ||
| 604 | const block_q5_1 * GGML_RESTRICT x0 = &x[ib]; | ||
| 605 | const block_q8_1 * GGML_RESTRICT y0 = &y[ib]; | ||
| 606 | |||
| 607 | float summs = GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s); | ||
| 608 | |||
| 609 | uint32_t qh; | ||
| 610 | memcpy(&qh, x0->qh, sizeof(qh)); | ||
| 611 | |||
| 612 | uint64_t tmp[4]; | ||
| 613 | tmp[0] = table_b2b_0[(qh >> 0) & 0xFF]; | ||
| 614 | tmp[1] = table_b2b_0[(qh >> 8) & 0xFF]; | ||
| 615 | tmp[2] = table_b2b_0[(qh >> 16) & 0xFF]; | ||
| 616 | tmp[3] = table_b2b_0[(qh >> 24) ]; | ||
| 617 | |||
| 618 | int8x16_t v_qhl = vec_xl(0, (const int8_t *)(tmp + 0)); | ||
| 619 | int8x16_t v_qhh = vec_xl(0, (const int8_t *)(tmp + 2)); | ||
| 620 | |||
| 621 | // required for fixing the byteorder | ||
| 622 | v_qhl = vec_perm(v_qhl, v_qhl, v_kperm); | ||
| 623 | v_qhh = vec_perm(v_qhh, v_qhh, v_kperm); | ||
| 624 | |||
| 625 | const uint8x16_t v_x = vec_xl(0, x0->qs); | ||
| 626 | const int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m); | ||
| 627 | const int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4); | ||
| 628 | |||
| 629 | const int8x16_t v_xlf = vec_or(v_xl, v_qhl); | ||
| 630 | const int8x16_t v_xhf = vec_or(v_xh, v_qhh); | ||
| 631 | |||
| 632 | const int8x16_t v_yl = vec_xl(0 , y0->qs); | ||
| 633 | const int8x16_t v_yh = vec_xl(QK8_1/2, y0->qs); | ||
| 634 | |||
| 635 | const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh); | ||
| 636 | const float32x4_t v_xyf = vec_float(v_xy); | ||
| 637 | |||
| 638 | const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)); | ||
| 639 | const float32x4_t v_acc = vec_madd(v_xyf, v_d, vec_splats(0.0f)); | ||
| 640 | |||
| 641 | sumf += vec_hsum_f32x4(v_acc) + summs; | ||
| 642 | } | ||
| 643 | |||
| 644 | *s = sumf; | ||
| 645 | #else | ||
| 646 | UNUSED(nb); | ||
| 647 | UNUSED(x); | ||
| 648 | UNUSED(y); | ||
| 649 | UNUSED(ib); | ||
| 650 | UNUSED(sumf); | ||
| 651 | ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc); | ||
| 652 | #endif | ||
| 653 | } | ||
| 654 | |||
| 655 | void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { | ||
| 656 | const int qk = QK8_0; | ||
| 657 | const int nb = n / qk; | ||
| 658 | |||
| 659 | assert(n % qk == 0); | ||
| 660 | assert(nrc == 1); | ||
| 661 | UNUSED(nrc); | ||
| 662 | UNUSED(bx); | ||
| 663 | UNUSED(by); | ||
| 664 | UNUSED(bs); | ||
| 665 | |||
| 666 | const block_q8_0 * GGML_RESTRICT x = vx; | ||
| 667 | const block_q8_0 * GGML_RESTRICT y = vy; | ||
| 668 | |||
| 669 | int ib = 0; | ||
| 670 | float sumf = 0; | ||
| 671 | |||
| 672 | #if defined(__VXE__) || defined(__VXE2__) | ||
| 673 | float32x4_t acc = vec_splats(0.0f); | ||
| 674 | |||
| 675 | #pragma GCC unroll 8 | ||
| 676 | for (; ib < nb; ++ib) { | ||
| 677 | __builtin_prefetch(x[ib].qs, 0, 1); | ||
| 678 | __builtin_prefetch(y[ib].qs, 0, 1); | ||
| 679 | |||
| 680 | const int8x16_t v_xl = vec_xl(0 , x[ib].qs); | ||
| 681 | const int8x16_t v_xh = vec_xl(QK8_0/2, x[ib].qs); | ||
| 682 | const int8x16_t v_yl = vec_xl(0 , y[ib].qs); | ||
| 683 | const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs); | ||
| 684 | |||
| 685 | const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh); | ||
| 686 | const float32x4_t v_xy = vec_float(v_xy_); | ||
| 687 | const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); | ||
| 688 | |||
| 689 | acc = vec_madd(v_xy, v_d, acc); | ||
| 690 | } | ||
| 691 | |||
| 692 | sumf = vec_hsum_f32x4(acc); | ||
| 693 | |||
| 694 | *s = sumf; | ||
| 695 | #else | ||
| 696 | UNUSED(nb); | ||
| 697 | UNUSED(x); | ||
| 698 | UNUSED(y); | ||
| 699 | UNUSED(ib); | ||
| 700 | UNUSED(sumf); | ||
| 701 | ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); | ||
| 702 | #endif | ||
| 703 | } | ||
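Every kernel in this file funnels its int8 multiplies through the ggml_vec_dot() helper. A behavioral sketch of what it is assumed to compute (a widening multiply-accumulate that folds four adjacent products into each int32 lane); this describes its semantics, not its implementation:

```c
#include <stdint.h>

// Behavioral sketch of ggml_vec_dot() as used throughout this file:
// 16 int8 x int8 products, summed in groups of four into four int32
// lanes on top of an accumulator. Assumption about semantics only.
static void vec_dot_i8x16_sketch(int32_t acc[4],
                                 const int8_t a[16], const int8_t b[16]) {
    for (int i = 0; i < 16; ++i) {
        acc[i / 4] += (int32_t)a[i] * (int32_t)b[i];
    }
}
```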
| 704 | |||
| 705 | void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { | ||
| 706 | assert(n % QK_K == 0); | ||
| 707 | assert(nrc == 1); | ||
| 708 | UNUSED(nrc); | ||
| 709 | UNUSED(bx); | ||
| 710 | UNUSED(by); | ||
| 711 | UNUSED(bs); | ||
| 712 | |||
| 713 | const uint32_t kmask1 = 0x03030303; | ||
| 714 | const uint32_t kmask2 = 0x0f0f0f0f; | ||
| 715 | |||
| 716 | const block_q3_K * GGML_RESTRICT x = vx; | ||
| 717 | const block_q8_K * GGML_RESTRICT y = vy; | ||
| 718 | |||
| 719 | const int nb = n / QK_K; | ||
| 720 | |||
| 721 | #if defined(__VXE__) || defined(__VXE2__) | ||
| 722 | uint32_t aux[3]; | ||
| 723 | uint32_t utmp[4]; | ||
| 724 | |||
| 725 | const int32x4_t v_z = vec_splat_s32(0); | ||
| 726 | const uint8x16_t v_3m = vec_splat_u8(0x03); | ||
| 727 | |||
| 728 | const uint8x16_t v_0c = vec_splat_u8(1); | ||
| 729 | const uint8x16_t v_1c = vec_sl(v_0c, 1); | ||
| 730 | const uint8x16_t v_2c = vec_sl(v_0c, 2); | ||
| 731 | const uint8x16_t v_3c = vec_sl(v_0c, 3); | ||
| 732 | |||
| 733 | uint8x16_t q3h[4]; | ||
| 734 | uint8x16_t q3b[2]; | ||
| 735 | int8x16_t q3bytes[4]; | ||
| 736 | int8x16_t q8bytes[8]; | ||
| 737 | uint8x16_t qhbits[2]; | ||
| 738 | |||
| 739 | float sum = 0; | ||
| 740 | |||
| 741 | for (int i = 0; i < nb; ++i) { | ||
| 742 | const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); | ||
| 743 | |||
| 744 | const uint8_t * restrict x0l = x[i].qs; | ||
| 745 | const uint8_t * restrict x0h = x[i].hmask; | ||
| 746 | const int8_t * restrict y0 = y[i].qs; | ||
| 747 | |||
| 748 | qhbits[0] = vec_xl(0 , x0h); | ||
| 749 | qhbits[1] = vec_xl(16, x0h); | ||
| 750 | |||
| 751 | int32_t isum = 0; | ||
| 752 | |||
| 753 | memcpy(aux, x[i].scales, 12); | ||
| 754 | utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); | ||
| 755 | utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); | ||
| 756 | utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4); | ||
| 757 | utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4); | ||
| 758 | |||
| 759 | int8_t * scale = (int8_t *)utmp; | ||
| 760 | for (int j = 0; j < 16; ++j) scale[j] -= 32; | ||
| 761 | |||
| 762 | for (int j = 0; j < QK_K/128; ++j) { | ||
| 763 | int32x4_t isum0, isum1, isum2, isum3; | ||
| 764 | |||
| 765 | q3b[0] = vec_xl(0 , x0l); | ||
| 766 | q3b[1] = vec_xl(16, x0l); | ||
| 767 | x0l += 32; | ||
| 768 | |||
| 769 | q8bytes[0] = vec_xl(0 , y0); | ||
| 770 | q8bytes[1] = vec_xl(16 , y0); | ||
| 771 | q8bytes[2] = vec_xl(32 , y0); | ||
| 772 | q8bytes[3] = vec_xl(48 , y0); | ||
| 773 | q8bytes[4] = vec_xl(64 , y0); | ||
| 774 | q8bytes[5] = vec_xl(80 , y0); | ||
| 775 | q8bytes[6] = vec_xl(96 , y0); | ||
| 776 | q8bytes[7] = vec_xl(112, y0); | ||
| 777 | y0 += 128; | ||
| 778 | |||
| 779 | q3h[0] = vec_sl(vec_andc(v_0c, qhbits[0]), 2); | ||
| 780 | q3h[1] = vec_sl(vec_andc(v_0c, qhbits[1]), 2); | ||
| 781 | q3h[2] = vec_sl(vec_andc(v_1c, qhbits[0]), 1); | ||
| 782 | q3h[3] = vec_sl(vec_andc(v_1c, qhbits[1]), 1); | ||
| 783 | |||
| 784 | q3bytes[0] = vec_sub((int8x16_t)vec_and(q3b[0], v_3m), (int8x16_t)q3h[0]); | ||
| 785 | q3bytes[1] = vec_sub((int8x16_t)vec_and(q3b[1], v_3m), (int8x16_t)q3h[1]); | ||
| 786 | q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 2), v_3m), (int8x16_t)q3h[2]); | ||
| 787 | q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 2), v_3m), (int8x16_t)q3h[3]); | ||
| 788 | |||
| 789 | isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[0]); | ||
| 790 | isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[1]); | ||
| 791 | isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[2]); | ||
| 792 | isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[3]); | ||
| 793 | |||
| 794 | isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0]; | ||
| 795 | isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1]; | ||
| 796 | isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2]; | ||
| 797 | isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3]; | ||
| 798 | |||
| 799 | scale += 4; | ||
| 800 | |||
| 801 | q3h[0] = vec_andc(v_2c, qhbits[0]); | ||
| 802 | q3h[1] = vec_andc(v_2c, qhbits[1]); | ||
| 803 | q3h[2] = vec_sr(vec_andc(v_3c, qhbits[0]), 1); | ||
| 804 | q3h[3] = vec_sr(vec_andc(v_3c, qhbits[1]), 1); | ||
| 805 | |||
| 806 | q3bytes[0] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 4), v_3m), (int8x16_t)q3h[0]); | ||
| 807 | q3bytes[1] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 4), v_3m), (int8x16_t)q3h[1]); | ||
| 808 | q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 6), v_3m), (int8x16_t)q3h[2]); | ||
| 809 | q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 6), v_3m), (int8x16_t)q3h[3]); | ||
| 810 | |||
| 811 | isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[4]); | ||
| 812 | isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[5]); | ||
| 813 | isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[6]); | ||
| 814 | isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[7]); | ||
| 815 | |||
| 816 | isum += vec_hsum_i32x4(isum0) * scale[0]; | ||
| 817 | isum += vec_hsum_i32x4(isum1) * scale[1]; | ||
| 818 | isum += vec_hsum_i32x4(isum2) * scale[2]; | ||
| 819 | isum += vec_hsum_i32x4(isum3) * scale[3]; | ||
| 820 | |||
| 821 | scale += 4; | ||
| 822 | |||
| 823 | if (j == 0) { | ||
| 824 | qhbits[0] = vec_sr(qhbits[0], 4); | ||
| 825 | qhbits[1] = vec_sr(qhbits[1], 4); | ||
| 826 | } | ||
| 827 | } | ||
| 828 | |||
| 829 | sum += d * isum; | ||
| 830 | } | ||
| 831 | |||
| 832 | *s = sum; | ||
| 833 | |||
| 834 | #else | ||
| 835 | UNUSED(kmask1); | ||
| 836 | UNUSED(kmask2); | ||
| 837 | UNUSED(x); | ||
| 838 | UNUSED(y); | ||
| 839 | UNUSED(nb); | ||
| 840 | ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); | ||
| 841 | #endif | ||
| 842 | } | ||
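The vec_andc construction is the same trick as in q5_0, one level down: a q3_K weight is a 2-bit quant plus one hmask bit with a bias of 4, and subtracting 4 exactly where the hmask bit is clear is equivalent, since low2 - 4*(1 - hbit) = (low2 | hbit << 2) - 4. Per element (hypothetical helper):

```c
#include <stdint.h>

// Per-element sketch of q3_K reconstruction; mirrors the
// vec_andc/vec_sub pairs above.
static int8_t q3_k_value_sketch(uint8_t low2, int hbit) {
    return (int8_t)((low2 | (hbit << 2)) - 4);
}
```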
| 843 | |||
| 844 | void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { | ||
| 845 | assert(n % QK_K == 0); | ||
| 846 | assert(nrc == 1); | ||
| 847 | UNUSED(nrc); | ||
| 848 | UNUSED(bx); | ||
| 849 | UNUSED(by); | ||
| 850 | UNUSED(bs); | ||
| 851 | |||
| 852 | const block_q4_K * GGML_RESTRICT x = vx; | ||
| 853 | const block_q8_K * GGML_RESTRICT y = vy; | ||
| 854 | |||
| 855 | const int nb = n / QK_K; | ||
| 856 | |||
| 857 | static const uint32_t kmask1 = 0x3f3f3f3f; | ||
| 858 | static const uint32_t kmask2 = 0x0f0f0f0f; | ||
| 859 | static const uint32_t kmask3 = 0x03030303; | ||
| 860 | |||
| 861 | uint32_t utmp[4]; | ||
| 862 | |||
| 863 | #if defined(__VXE__) || defined(__VXE2__) | ||
| 864 | const uint8x16_t v_lm = vec_splat_u8(0x0F); | ||
| 865 | const int32x4_t v_z = vec_splat_s32(0); | ||
| 866 | |||
| 867 | uint8x16_t v_x[2]; | ||
| 868 | int8x16_t v_xl[2]; | ||
| 869 | int8x16_t v_y[2]; | ||
| 870 | |||
| 871 | float sumf = 0; | ||
| 872 | |||
| 873 | for (int i = 0; i < nb; ++i) { | ||
| 874 | const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); | ||
| 875 | const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); | ||
| 876 | |||
| 877 | const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums); | ||
| 878 | const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums); | ||
| 879 | const int16x8_t v_ysums = vec_padd_s16(v_ysumsl, v_ysumsh); | ||
| 880 | |||
| 881 | memcpy(utmp, x[i].scales, 12); | ||
| 882 | |||
| 883 | uint32x4_t v_mins8 = { 0 }; | ||
| 884 | v_mins8 = vec_insert(utmp[1] & kmask1, v_mins8, 0); | ||
| 885 | v_mins8 = vec_insert(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), v_mins8, 1); | ||
| 886 | |||
| 887 | utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); | ||
| 888 | utmp[0] &= kmask1; | ||
| 889 | |||
| 890 | const int16x8_t v_minsh = (int16x8_t)vec_unpackh((uint8x16_t)v_mins8); | ||
| 891 | |||
| 892 | const int32x4_t v_minso = vec_mulo(v_ysums, v_minsh); | ||
| 893 | const int32x4_t v_minse = vec_mule(v_ysums, v_minsh); | ||
| 894 | const int32x4_t v_mins = v_minso + v_minse; | ||
| 895 | sumf -= dmin * (v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3]); | ||
| 896 | |||
| 897 | const uint8_t * scales = (const uint8_t *)utmp; | ||
| 898 | const uint8_t * GGML_RESTRICT x0 = x[i].qs; | ||
| 899 | const int8_t * GGML_RESTRICT y0 = y[i].qs; | ||
| 900 | |||
| 901 | int32_t sumi1 = 0; | ||
| 902 | int32_t sumi2 = 0; | ||
| 903 | |||
| 904 | for (int j = 0; j < QK_K/64; ++j) { | ||
| 905 | v_x[0] = vec_xl(0 , x0); | ||
| 906 | v_x[1] = vec_xl(16, x0); | ||
| 907 | x0 += 32; | ||
| 908 | |||
| 909 | v_y[0] = vec_xl(0 , y0); | ||
| 910 | v_y[1] = vec_xl(16, y0); | ||
| 911 | y0 += 32; | ||
| 912 | |||
| 913 | v_xl[0] = (int8x16_t)vec_and(v_x[0], v_lm); | ||
| 914 | v_xl[1] = (int8x16_t)vec_and(v_x[1], v_lm); | ||
| 915 | |||
| 916 | const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]); | ||
| 917 | sumi1 += vec_hsum_i32x4(p1) * scales[2*j+0]; | ||
| 918 | |||
| 919 | v_y[0] = vec_xl(0 , y0); | ||
| 920 | v_y[1] = vec_xl(16, y0); | ||
| 921 | y0 += 32; | ||
| 922 | |||
| 923 | v_xl[0] = (int8x16_t)vec_sr(v_x[0], 4); | ||
| 924 | v_xl[1] = (int8x16_t)vec_sr(v_x[1], 4); | ||
| 925 | |||
| 926 | const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]); | ||
| 927 | sumi2 += vec_hsum_i32x4(p2) * scales[2*j+1]; | ||
| 928 | } | ||
| 929 | |||
| 930 | sumf += d * (sumi1 + sumi2); | ||
| 931 | } | ||
| 932 | |||
| 933 | *s = sumf; | ||
| 934 | |||
| 935 | #else | ||
| 936 | UNUSED(x); | ||
| 937 | UNUSED(y); | ||
| 938 | UNUSED(nb); | ||
| 939 | UNUSED(kmask1); | ||
| 940 | UNUSED(kmask2); | ||
| 941 | UNUSED(kmask3); | ||
| 942 | UNUSED(utmp); | ||
| 943 | ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); | ||
| 944 | #endif | ||
| 945 | } | ||
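The kmask/utmp shuffling above is the vector form of unpacking q4_K's 12 scale bytes into eight 6-bit (scale, min) pairs. Roughly the shape of the scalar reference helper, reproduced as a sketch:

```c
#include <stdint.h>

// Sketch of 6-bit (scale, min) unpacking for q4_K/q5_K blocks: pairs 0-3
// sit in the low 6 bits of bytes 0-7, pairs 4-7 are split across the
// nibbles of bytes 8-11 plus the top two bits of bytes 0-7.
static void get_scale_min_sketch(int j, const uint8_t q[12],
                                 uint8_t *d, uint8_t *m) {
    if (j < 4) {
        *d = q[j]     & 63;
        *m = q[j + 4] & 63;
    } else {
        *d = (q[j + 4] & 0x0F) | ((q[j - 4] >> 6) << 4);
        *m = (q[j + 4] >>   4) | ((q[j    ] >> 6) << 4);
    }
}
```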
| 946 | |||
| 947 | void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { | ||
| 948 | assert(n % QK_K == 0); | ||
| 949 | assert(nrc == 1); | ||
| 950 | UNUSED(nrc); | ||
| 951 | UNUSED(bx); | ||
| 952 | UNUSED(by); | ||
| 953 | UNUSED(bs); | ||
| 954 | |||
| 955 | const block_q5_K * GGML_RESTRICT x = vx; | ||
| 956 | const block_q8_K * GGML_RESTRICT y = vy; | ||
| 957 | |||
| 958 | const int nb = n / QK_K; | ||
| 959 | |||
| 960 | static const uint32_t kmask1 = 0x3f3f3f3f; | ||
| 961 | static const uint32_t kmask2 = 0x0f0f0f0f; | ||
| 962 | static const uint32_t kmask3 = 0x03030303; | ||
| 963 | |||
| 964 | uint32_t utmp[4]; | ||
| 965 | |||
| 966 | #if defined(__VXE__) || defined(__VXE2__) | ||
| 967 | const uint8x16_t v_lm = vec_splat_u8(0x0F); | ||
| 968 | const uint8x16_t v_1m = vec_splat_u8(0x01); | ||
| 969 | const uint8x16_t v_2m = vec_splat_u8(0x02); | ||
| 970 | |||
| 971 | const int32x4_t v_z = vec_splat_s32(0); | ||
| 972 | |||
| 973 | const uchar8x16_t v_minsm = { | ||
| 974 | 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, | ||
| 975 | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF | ||
| 976 | }; | ||
| 977 | |||
| 978 | int8x16_t q5b[4]; | ||
| 979 | uint8x16_t q5h[4]; | ||
| 980 | |||
| 981 | uint8x16_t v_xl[2]; | ||
| 982 | uint8x16_t v_xh[2]; | ||
| 983 | int8x16_t v_y[4]; | ||
| 984 | |||
| 985 | float sumf = 0; | ||
| 986 | |||
| 987 | for (int i = 0; i < nb; ++i) { | ||
| 988 | const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); | ||
| 989 | const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); | ||
| 990 | |||
| 991 | const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums); | ||
| 992 | const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums); | ||
| 993 | const int16x8_t v_ysums = vec_padd_s16(v_ysumsl, v_ysumsh); | ||
| 994 | |||
| 995 | memcpy(utmp, x[i].scales, 12); | ||
| 996 | utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); | ||
| 997 | const uint32_t uaux = utmp[1] & kmask1; | ||
| 998 | utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4); | ||
| 999 | utmp[2] = uaux; | ||
| 1000 | utmp[0] &= kmask1; | ||
| 1001 | |||
| 1002 | const uint8x16_t v_mins16 = vec_xl(0, (const uint8_t *)utmp); | ||
| 1003 | const uint8x16_t v_mins8 = vec_perm(v_mins16, v_mins16, v_minsm); | ||
| 1004 | const int16x8_t v_minsh = (int16x8_t)vec_unpackh(v_mins8); | ||
| 1005 | |||
| 1006 | const int32x4_t v_minsho = vec_mulo(v_ysums, v_minsh); | ||
| 1007 | const int32x4_t v_minshe = vec_mule(v_ysums, v_minsh); | ||
| 1008 | const int32x4_t v_mins = vec_add(v_minsho, v_minshe); | ||
| 1009 | const int32_t mins = vec_hsum_i32x4(v_mins); | ||
| 1010 | |||
| 1011 | const uint8_t * scales = (const uint8_t *)utmp; | ||
| 1012 | const uint8_t * GGML_RESTRICT x0l = x[i].qs; | ||
| 1013 | const uint8_t * GGML_RESTRICT x0h = x[i].qh; | ||
| 1014 | const int8_t * GGML_RESTRICT y0 = y[i].qs; | ||
| 1015 | |||
| 1016 | v_xh[0] = vec_xl(0 , x0h); | ||
| 1017 | v_xh[1] = vec_xl(16, x0h); | ||
| 1018 | |||
| 1019 | int32_t sumi = 0; | ||
| 1020 | for (int j = 0; j < QK_K/64; ++j) { | ||
| 1021 | v_xl[0] = vec_xl(0 , x0l); | ||
| 1022 | v_xl[1] = vec_xl(16, x0l); | ||
| 1023 | x0l += 32; | ||
| 1024 | |||
| 1025 | v_y[0] = vec_xl(0 , y0); | ||
| 1026 | v_y[1] = vec_xl(16, y0); | ||
| 1027 | v_y[2] = vec_xl(32, y0); | ||
| 1028 | v_y[3] = vec_xl(48, y0); | ||
| 1029 | y0 += 64; | ||
| 1030 | |||
| 1031 | q5h[0] = vec_sl(vec_and(v_1m, v_xh[0]), 4); | ||
| 1032 | q5h[1] = vec_sl(vec_and(v_1m, v_xh[1]), 4); | ||
| 1033 | q5h[2] = vec_sl(vec_and(v_2m, v_xh[0]), 3); | ||
| 1034 | q5h[3] = vec_sl(vec_and(v_2m, v_xh[1]), 3); | ||
| 1035 | v_xh[0] = vec_sr(v_xh[0], 2); | ||
| 1036 | v_xh[1] = vec_sr(v_xh[1], 2); | ||
| 1037 | |||
| 1038 | q5b[0] = (int8x16_t)vec_or(vec_and(v_xl[0], v_lm), q5h[0]); | ||
| 1039 | q5b[1] = (int8x16_t)vec_or(vec_and(v_xl[1], v_lm), q5h[1]); | ||
| 1040 | q5b[2] = (int8x16_t)vec_or(vec_sr(v_xl[0], 4), q5h[2]); | ||
| 1041 | q5b[3] = (int8x16_t)vec_or(vec_sr(v_xl[1], 4), q5h[3]); | ||
| 1042 | |||
| 1043 | int32x4_t sumi0 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[0], v_y[0]), q5b[1], v_y[1]); | ||
| 1044 | int32x4_t sumi1 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[2], v_y[2]), q5b[3], v_y[3]); | ||
| 1045 | |||
| 1046 | sumi += vec_hsum_i32x4(sumi0) * *scales++; | ||
| 1047 | sumi += vec_hsum_i32x4(sumi1) * *scales++; | ||
| 1048 | } | ||
| 1049 | |||
| 1050 | sumf += d * sumi - dmin * mins; | ||
| 1051 | } | ||
| 1052 | |||
| 1053 | *s = sumf; | ||
| 1054 | |||
| 1055 | #else | ||
| 1056 | UNUSED(x); | ||
| 1057 | UNUSED(y); | ||
| 1058 | UNUSED(nb); | ||
| 1059 | UNUSED(kmask1); | ||
| 1060 | UNUSED(kmask2); | ||
| 1061 | UNUSED(kmask3); | ||
| 1062 | UNUSED(utmp); | ||
| 1063 | ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); | ||
| 1064 | #endif | ||
| 1065 | } | ||
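Unlike q5_0, q5_K keeps its quants unsigned: a low nibble from qs plus one bit from qh yields a value in 0..31, and the block minimum enters through the separate dmin * mins term instead of a bias subtraction. Per element (hypothetical helper):

```c
#include <stdint.h>

// Per-element sketch of q5_K reconstruction: no bias removal here,
// the dmin * mins term above accounts for the block minimum.
static uint8_t q5_k_value_sketch(uint8_t nibble, int hbit) {
    return (uint8_t)(nibble | (hbit << 4));  // value in 0..31
}
```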
| 1066 | |||
| 1067 | void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { | ||
| 1068 | assert(n % QK_K == 0); | ||
| 1069 | assert(nrc == 1); | ||
| 1070 | UNUSED(nrc); | ||
| 1071 | UNUSED(bx); | ||
| 1072 | UNUSED(by); | ||
| 1073 | UNUSED(bs); | ||
| 1074 | |||
| 1075 | const block_q6_K * GGML_RESTRICT x = vx; | ||
| 1076 | const block_q8_K * GGML_RESTRICT y = vy; | ||
| 1077 | |||
| 1078 | const int nb = n / QK_K; | ||
| 1079 | |||
| 1080 | #if defined(__VXE__) || defined(__VXE2__) | ||
| 1081 | float sum = 0; | ||
| 1082 | |||
| 1083 | // Lower 4-bit and upper 2-bit masks | ||
| 1084 | const uint8x16_t v_lm = vec_splat_u8(0x0F); | ||
| 1085 | const uint8x16_t v_um = vec_splat_u8(0x03); | ||
| 1086 | |||
| 1087 | const int32x4_t v_z = vec_splat_s32(0); | ||
| 1088 | |||
| 1089 | int8x16_t q6b[4]; | ||
| 1090 | uint8x16_t q6h[4]; | ||
| 1091 | |||
| 1092 | uint8x16_t v_xl[4]; | ||
| 1093 | uint8x16_t v_xh[2]; | ||
| 1094 | int8x16_t v_y[4]; | ||
| 1095 | |||
| 1096 | for (int i = 0; i < nb; ++i) { | ||
| 1097 | const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d); | ||
| 1098 | |||
| 1099 | const uint8_t * GGML_RESTRICT x0l = x[i].ql; | ||
| 1100 | const uint8_t * GGML_RESTRICT x0h = x[i].qh; | ||
| 1101 | const int8_t * GGML_RESTRICT y0 = y[i].qs; | ||
| 1102 | |||
| 1103 | const int8_t * GGML_RESTRICT scale = x[i].scales; | ||
| 1104 | |||
| 1105 | const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums); | ||
| 1106 | const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums); | ||
| 1107 | |||
| 1108 | const int8x16_t v_scale = vec_xl(0, scale); | ||
| 1109 | const int16x8_t v_scalel = vec_unpackh(v_scale); | ||
| 1110 | const int16x8_t v_scaleh = vec_unpackl(v_scale); | ||
| 1111 | |||
| 1112 | const int32x4_t v_minslo = vec_mulo(v_ysumsl, v_scalel); | ||
| 1113 | const int32x4_t v_minsle = vec_mule(v_ysumsl, v_scalel); | ||
| 1114 | const int32x4_t v_minsho = vec_mulo(v_ysumsh, v_scaleh); | ||
| 1115 | const int32x4_t v_minshe = vec_mule(v_ysumsh, v_scaleh); | ||
| 1116 | const int32x4_t v_mins = v_minslo + v_minsle + v_minsho + v_minshe; | ||
| 1117 | |||
| 1118 | const int32_t mins = vec_hsum_i32x4(v_mins); | ||
| 1119 | |||
| 1120 | int32_t isum = 0; | ||
| 1121 | for (int j = 0; j < QK_K/128; ++j) { | ||
| 1122 | // Load the upper 2 bits of the weights | ||
| 1123 | v_xh[0] = vec_xl(0 , x0h); | ||
| 1124 | v_xh[1] = vec_xl(16, x0h); | ||
| 1125 | x0h += 32; | ||
| 1126 | |||
| 1127 | // Load the lower 4 bits of the weights | ||
| 1128 | v_xl[0] = vec_xl(0 , x0l); | ||
| 1129 | v_xl[1] = vec_xl(16, x0l); | ||
| 1130 | v_xl[2] = vec_xl(32, x0l); | ||
| 1131 | v_xl[3] = vec_xl(48, x0l); | ||
| 1132 | x0l += 64; | ||
| 1133 | |||
| 1134 | // Load activation quants | ||
| 1135 | v_y[0] = vec_xl(0 , y0); | ||
| 1136 | v_y[1] = vec_xl(16, y0); | ||
| 1137 | v_y[2] = vec_xl(32, y0); | ||
| 1138 | v_y[3] = vec_xl(48, y0); | ||
| 1139 | y0 += 64; | ||
| 1140 | |||
| 1141 | q6h[0] = vec_sl(vec_and(v_um, v_xh[0]), 4); | ||
| 1142 | q6h[1] = vec_sl(vec_and(v_um, v_xh[1]), 4); | ||
| 1143 | uint8x16_t shifted = vec_sr(v_xh[0], 2); | ||
| 1144 | q6h[2] = vec_sl(vec_and(v_um, shifted), 4); | ||
| 1145 | shifted = vec_sr(v_xh[1], 2); | ||
| 1146 | q6h[3] = vec_sl(vec_and(v_um, shifted), 4); | ||
| 1147 | |||
| 1148 | q6b[0] = (int8x16_t)(vec_or(vec_and(v_xl[0], v_lm), q6h[0])); | ||
| 1149 | q6b[1] = (int8x16_t)(vec_or(vec_and(v_xl[1], v_lm), q6h[1])); | ||
| 1150 | q6b[2] = (int8x16_t)(vec_or(vec_and(v_xl[2], v_lm), q6h[2])); | ||
| 1151 | q6b[3] = (int8x16_t)(vec_or(vec_and(v_xl[3], v_lm), q6h[3])); | ||
| 1152 | |||
| 1153 | int32x4_t summs0 = ggml_vec_dot(v_z, q6b[0], v_y[0]); | ||
| 1154 | int32x4_t summs1 = ggml_vec_dot(v_z, q6b[1], v_y[1]); | ||
| 1155 | int32x4_t summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]); | ||
| 1156 | int32x4_t summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]); | ||
| 1157 | |||
| 1158 | isum += vec_hsum_i32x4(summs0) * scale[0] + | ||
| 1159 | vec_hsum_i32x4(summs1) * scale[1] + | ||
| 1160 | vec_hsum_i32x4(summs2) * scale[2] + | ||
| 1161 | vec_hsum_i32x4(summs3) * scale[3]; | ||
| 1162 | |||
| 1163 | scale += 4; | ||
| 1164 | |||
| 1165 | |||
| 1166 | // Load activation quants | ||
| 1167 | v_y[0] = vec_xl(0 , y0); | ||
| 1168 | v_y[1] = vec_xl(16, y0); | ||
| 1169 | v_y[2] = vec_xl(32, y0); | ||
| 1170 | v_y[3] = vec_xl(48, y0); | ||
| 1171 | y0 += 64; | ||
| 1172 | |||
| 1173 | shifted = vec_sr(v_xh[0], 4); | ||
| 1174 | q6h[0] = vec_sl(vec_and(v_um, shifted), 4); | ||
| 1175 | shifted = vec_sr(v_xh[1], 4); | ||
| 1176 | q6h[1] = vec_sl(vec_and(v_um, shifted), 4); | ||
| 1177 | shifted = vec_sr(v_xh[0], 6); | ||
| 1178 | q6h[2] = vec_sl(vec_and(v_um, shifted), 4); | ||
| 1179 | shifted = vec_sr(v_xh[1], 6); | ||
| 1180 | q6h[3] = vec_sl(vec_and(v_um, shifted), 4); | ||
| 1181 | |||
| 1182 | q6b[0] = (int8x16_t)(vec_or(vec_sr(v_xl[0], 4), q6h[0])); | ||
| 1183 | q6b[1] = (int8x16_t)(vec_or(vec_sr(v_xl[1], 4), q6h[1])); | ||
| 1184 | q6b[2] = (int8x16_t)(vec_or(vec_sr(v_xl[2], 4), q6h[2])); | ||
| 1185 | q6b[3] = (int8x16_t)(vec_or(vec_sr(v_xl[3], 4), q6h[3])); | ||
| 1186 | |||
| 1187 | summs0 = ggml_vec_dot(v_z, q6b[0], v_y[0]); | ||
| 1188 | summs1 = ggml_vec_dot(v_z, q6b[1], v_y[1]); | ||
| 1189 | summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]); | ||
| 1190 | summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]); | ||
| 1191 | |||
| 1192 | isum += vec_hsum_i32x4(summs0) * scale[0] + | ||
| 1193 | vec_hsum_i32x4(summs1) * scale[1] + | ||
| 1194 | vec_hsum_i32x4(summs2) * scale[2] + | ||
| 1195 | vec_hsum_i32x4(summs3) * scale[3]; | ||
| 1196 | |||
| 1197 | scale += 4; | ||
| 1198 | } | ||
| 1199 | |||
| 1200 | sum += d_all * y[i].d * (isum - 32 * mins); | ||
| 1201 | } | ||
| 1202 | |||
| 1203 | *s = sum; | ||
| 1204 | |||
| 1205 | #else | ||
| 1206 | UNUSED(x); | ||
| 1207 | UNUSED(y); | ||
| 1208 | UNUSED(nb); | ||
| 1209 | ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); | ||
| 1210 | #endif | ||
| 1211 | } | ||
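A q6_K weight is a low nibble from ql plus two bits from qh with a bias of 32; the kernel keeps the integer dot product unbiased and removes the bias once at the end through the isum - 32 * mins term. Per element (hypothetical helper):

```c
#include <stdint.h>

// Per-element sketch of q6_K reconstruction: six bits per weight,
// biased by 32, matching the vec_or/vec_sl packing above.
static int8_t q6_k_value_sketch(uint8_t ql_nibble, uint8_t qh_2bits) {
    return (int8_t)((ql_nibble | (qh_2bits << 4)) - 32);
}
```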
| 1212 | |||
| 1213 | // #if defined(__VXE__) || defined(__VXE2__) | ||
| 1214 | // static const int8_t keven_signs_q2xs[1024] = { | ||
| 1215 | // 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, | ||
| 1216 | // 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1, | ||
| 1217 | // 1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, -1, | ||
| 1218 | // 1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, 1, | ||
| 1219 | // 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1, | ||
| 1220 | // 1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, 1, | ||
| 1221 | // 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, 1, | ||
| 1222 | // 1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1, | ||
| 1223 | // 1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1, | ||
| 1224 | // 1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, 1, | ||
| 1225 | // 1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1, | ||
| 1226 | // 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1, | ||
| 1227 | // 1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, 1, | ||
| 1228 | // 1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, -1, | ||
| 1229 | // 1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, -1, | ||
| 1230 | // 1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 1, | ||
| 1231 | // 1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1, | ||
| 1232 | // 1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1, | ||
| 1233 | // 1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, 1, | ||
| 1234 | // 1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1, | ||
| 1235 | // 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, 1, | ||
| 1236 | // 1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, -1, | ||
| 1237 | // 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, -1, | ||
| 1238 | // 1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1, | ||
| 1239 | // 1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, 1, | ||
| 1240 | // 1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, -1, | ||
| 1241 | // 1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, -1, | ||
| 1242 | // 1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1, | ||
| 1243 | // 1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, -1, | ||
| 1244 | // 1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1, | ||
| 1245 | // 1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1, | ||
| 1246 | // 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, | ||
| 1247 | // }; | ||
| 1248 | // #endif | ||
| 1249 | |||
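The commented-out table above enumerates the ±1 sign patterns behind `keven_signs_q2xs`: each 7-bit index fixes the signs of the first seven weights in a group of eight, and the eighth sign is chosen so the count of -1s is even (hence "keven"). Storing seven sign bits per eight weights is what lets four sign indices plus a 4-bit scale fit in one `uint32_t`, as the `aux32[1] >> 7*l` extraction in the fallback loop below suggests. A minimal sketch of how such a table could be generated, assuming the 0x01/0xff byte encoding that the vector path multiplies against (`build_even_signs` is a hypothetical helper, not part of this file):

```c
#include <stdint.h>

// Sketch: generate an even-parity sign table like keven_signs_q2xs.
// Each byte is 0x01 (+1) or 0xff (-1), so an entry can be loaded
// directly into int8 lanes and combined with vec_mul.
static void build_even_signs(uint64_t table[128]) {
    for (uint32_t idx = 0; idx < 128; ++idx) {
        uint64_t entry  = 0;
        uint32_t parity = 0;
        for (int bit = 0; bit < 7; ++bit) {
            const uint32_t neg = (idx >> bit) & 1;   // bit set -> this sign is -1
            parity ^= neg;
            entry  |= (uint64_t)(neg ? 0xff : 0x01) << (8 * bit);
        }
        // the eighth sign restores even parity, so it never needs to be stored
        entry |= (uint64_t)(parity ? 0xff : 0x01) << 56;
        table[idx] = entry;
    }
}
```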
| 1250 | // void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { | ||
| 1251 | // assert(n % QK_K == 0); | ||
| 1252 | // assert(nrc == 1); | ||
| 1253 | // UNUSED(nrc); | ||
| 1254 | // UNUSED(bx); | ||
| 1255 | // UNUSED(by); | ||
| 1256 | // UNUSED(bs); | ||
| 1257 | |||
| 1258 | // const block_iq2_xxs * GGML_RESTRICT x = vx; | ||
| 1259 | // const block_q8_K * GGML_RESTRICT y = vy; | ||
| 1260 | |||
| 1261 | // const int nb = n / QK_K; | ||
| 1262 | |||
| 1263 | // #if defined(__VXE__) || defined(__VXE2__) | ||
| 1264 | // const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; | ||
| 1265 | |||
| 1266 | // uint32_t aux32[4]; | ||
| 1267 | // const uint8_t * aux8 = (const uint8_t *)aux32; | ||
| 1268 | |||
| 1269 | // float sumf = 0; | ||
| 1270 | |||
| 1271 | // for (int i = 0; i < nb; ++i) { | ||
| 1272 | // const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; | ||
| 1273 | // const uint16_t * GGML_RESTRICT q2 = x[i].qs; | ||
| 1274 | // const int8_t * GGML_RESTRICT q8 = y[i].qs; | ||
| 1275 | |||
| 1276 | // float sumf1 = 0, sumf2 = 0; | ||
| 1277 | |||
| 1278 | //        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { | ||
| 1279 | // int8x16_t q8b0 = vec_xl( 0, q8); | ||
| 1280 | //            int8x16_t q8b1 = vec_xl(16, q8); | ||
| 1281 | // int8x16_t q8b2 = vec_xl(32, q8); | ||
| 1282 | // int8x16_t q8b3 = vec_xl(48, q8); | ||
| 1283 | // q8 += 64; | ||
| 1284 | |||
| 1285 | // memcpy(aux32, q2, 4 * sizeof(uint32_t)); | ||
| 1286 | // q2 += 8; | ||
| 1287 | |||
| 1288 | // int8x16_t q2u0 = { *(const int64_t *)(iq2xxs_grid + aux8[ 0]), *(const int64_t *)(iq2xxs_grid + aux8[ 1]) }; | ||
| 1289 | // int8x16_t q2u1 = { *(const int64_t *)(iq2xxs_grid + aux8[ 2]), *(const int64_t *)(iq2xxs_grid + aux8[ 3]) }; | ||
| 1290 | // int8x16_t q2u2 = { *(const int64_t *)(iq2xxs_grid + aux8[ 8]), *(const int64_t *)(iq2xxs_grid + aux8[ 9]) }; | ||
| 1291 | // int8x16_t q2u3 = { *(const int64_t *)(iq2xxs_grid + aux8[10]), *(const int64_t *)(iq2xxs_grid + aux8[11]) }; | ||
| 1292 | |||
| 1293 | // int8x16_t q2s0 = { *(const int64_t *)(signs64 + ((aux32[1] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 7) & 127)) }; | ||
| 1294 | // int8x16_t q2s1 = { *(const int64_t *)(signs64 + ((aux32[1] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 21) & 127)) }; | ||
| 1295 | // int8x16_t q2s2 = { *(const int64_t *)(signs64 + ((aux32[3] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 7) & 127)) }; | ||
| 1296 | // int8x16_t q2s3 = { *(const int64_t *)(signs64 + ((aux32[3] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 21) & 127)) }; | ||
| 1297 | |||
| 1298 | // q2u0 = vec_mul(q2u0, q2s0); | ||
| 1299 | // q2u1 = vec_mul(q2u1, q2s1); | ||
| 1300 | // q2u2 = vec_mul(q2u2, q2s2); | ||
| 1301 | // q2u3 = vec_mul(q2u3, q2s3); | ||
| 1302 | |||
| 1303 | // const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(vec_splat_s32(0), q2u0, q8b0), q2u1, q8b1); | ||
| 1304 | // const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(vec_splat_s32(0), q2u2, q8b2), q2u3, q8b3); | ||
| 1305 | |||
| 1306 | // sumf1 += (p1[0] + p1[1] + p1[2] + p1[3]) * (0.5f + (aux32[1] >> 28)); | ||
| 1307 | // sumf2 += (p2[0] + p2[1] + p2[2] + p2[3]) * (0.5f + (aux32[3] >> 28)); | ||
| 1308 | // } | ||
| 1309 | |||
| 1310 | // sumf += d * (sumf1 + sumf2); | ||
| 1311 | // } | ||
| 1312 | |||
| 1313 | // *s = 0.25f * sumf; | ||
| 1314 | |||
| 1315 | // #else | ||
| 1316 | |||
| 1317 | // uint32_t aux32[2]; | ||
| 1318 | // const uint8_t * aux8 = (const uint8_t *)aux32; | ||
| 1319 | |||
| 1320 | // float sumf = 0.f; | ||
| 1321 | // for (int i = 0; i < nb; ++i) { | ||
| 1322 | // const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; | ||
| 1323 | // const uint16_t * GGML_RESTRICT q2 = x[i].qs; | ||
| 1324 | // const int8_t * GGML_RESTRICT q8 = y[i].qs; | ||
| 1325 | // int32_t bsum = 0; | ||
| 1326 | // for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { | ||
| 1327 | // memcpy(aux32, q2, 2*sizeof(uint32_t)); | ||
| 1328 | // q2 += 4; | ||
| 1329 | // const uint32_t ls = 2*(aux32[1] >> 28) + 1; | ||
| 1330 | // int32_t sumi = 0; | ||
| 1331 | // for (int l = 0; l < 4; ++l) { | ||
| 1332 | // const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); | ||
| 1333 | // const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127]; | ||
| 1334 | // for (int j = 0; j < 8; ++j) { | ||
| 1335 | // sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); | ||
| 1336 | // } | ||
| 1337 | // q8 += 8; | ||
| 1338 | // } | ||
| 1339 | // bsum += sumi * ls; | ||
| 1340 | // } | ||
| 1341 | // sumf += d * bsum; | ||
| 1342 | // } | ||
| 1343 | // *s = 0.125f * sumf; | ||
| 1344 | // #endif | ||
| 1345 | // } | ||
| 1346 | |||
| 1347 | void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { | ||
| 1348 | assert(nrc == 1); | ||
| 1349 | UNUSED(nrc); | ||
| 1350 | UNUSED(bx); | ||
| 1351 | UNUSED(by); | ||
| 1352 | UNUSED(bs); | ||
| 1353 | assert(n % QK4_NL == 0); | ||
| 1354 | static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same"); | ||
| 1355 | |||
| 1356 | const block_iq4_nl * GGML_RESTRICT x = vx; | ||
| 1357 | const block_q8_0 * GGML_RESTRICT y = vy; | ||
| 1358 | |||
| 1359 | const int nb = n / QK4_NL; | ||
| 1360 | |||
| 1361 | int ib = 0; | ||
| 1362 | float sumf = 0; | ||
| 1363 | |||
| 1364 | #if defined(__VXE__) || defined(__VXE2__) | ||
| 1365 | const int8x16_t v_k = vec_xl(0, kvalues_iq4nl); | ||
| 1366 | const uint8x16_t v_m = vec_splat_u8(0x0F); | ||
| 1367 | |||
| 1368 | for (; ib < nb; ++ib) { | ||
| 1369 | const block_iq4_nl * GGML_RESTRICT x0 = &x[ib]; | ||
| 1370 | const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; | ||
| 1371 | |||
| 1372 | const uint8x16_t v_x = vec_xl(0, x0->qs); | ||
| 1373 | int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m); | ||
| 1374 | int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4); | ||
| 1375 | |||
| 1376 | v_xl = vec_perm(v_k, v_k, (uchar8x16_t)v_xl); | ||
| 1377 | v_xh = vec_perm(v_k, v_k, (uchar8x16_t)v_xh); | ||
| 1378 | |||
| 1379 | const int8x16_t v_yl = vec_xl(0 , y0->qs); | ||
| 1380 | const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs); | ||
| 1381 | const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh); | ||
| 1382 | |||
| 1383 | sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * vec_hsum_i32x4(v_xy); | ||
| 1384 | } | ||
| 1385 | |||
| 1386 | *s = sumf; | ||
| 1387 | #else | ||
| 1388 | UNUSED(x); | ||
| 1389 | UNUSED(y); | ||
| 1390 | UNUSED(nb); | ||
| 1391 | UNUSED(ib); | ||
| 1392 | UNUSED(sumf); | ||
| 1393 | ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); | ||
| 1394 | #endif | ||
| 1395 | } | ||
| 1396 | |||
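The VXE path above decodes both IQ4_NL nibbles of every byte with a single table permute into `kvalues_iq4nl`, pairing the low nibbles with the first 16 `q8` values and the high nibbles with the next 16. For reference, a scalar sketch of the same reduction, assuming this file's existing headers (`iq4_nl_q8_0_dot_scalar` is a hypothetical helper, not the actual `*_generic` fallback):

```c
// Sketch: scalar equivalent of the vectorized IQ4_NL x Q8_0 dot product above.
static float iq4_nl_q8_0_dot_scalar(int n, const block_iq4_nl * x, const block_q8_0 * y) {
    const int nb = n / QK4_NL;
    float sumf = 0.0f;
    for (int ib = 0; ib < nb; ++ib) {
        int sumi = 0;
        for (int j = 0; j < QK4_NL/2; ++j) {
            // low nibble pairs with q8[j], high nibble with q8[j + QK4_NL/2]
            sumi += kvalues_iq4nl[x[ib].qs[j] & 0x0F] * y[ib].qs[j];
            sumi += kvalues_iq4nl[x[ib].qs[j] >>   4] * y[ib].qs[j + QK4_NL/2];
        }
        sumf += GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) * (float)sumi;
    }
    return sumf;
}
```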
| 1397 | void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { | ||
| 1398 | assert(nrc == 1); | ||
| 1399 | UNUSED(nrc); | ||
| 1400 | UNUSED(bx); | ||
| 1401 | UNUSED(by); | ||
| 1402 | UNUSED(bs); | ||
| 1403 | assert(n % QK_K == 0); | ||
| 1404 | |||
| 1405 | const block_iq4_xs * GGML_RESTRICT x = vx; | ||
| 1406 | const block_q8_K * GGML_RESTRICT y = vy; | ||
| 1407 | |||
| 1408 | const int nb = n / QK_K; | ||
| 1409 | |||
| 1410 | #if defined(__VXE__) || defined(__VXE2__) | ||
| 1411 | const int8x16_t v_k = vec_xl(0, kvalues_iq4nl); | ||
| 1412 | const uint8x16_t v_m = vec_splat_u8(0x0F); | ||
| 1413 | |||
| 1414 | float sumf = 0; | ||
| 1415 | |||
| 1416 | for (int ibl = 0; ibl < nb; ++ibl) { | ||
| 1417 | const uint8_t * GGML_RESTRICT q4 = x[ibl].qs; | ||
| 1418 | const int8_t * GGML_RESTRICT q8 = y[ibl].qs; | ||
| 1419 | |||
| 1420 | uint16_t h = x[ibl].scales_h; | ||
| 1421 | |||
| 1422 | int sumi1 = 0, sumi2 = 0; | ||
| 1423 | for (int ib = 0; ib < QK_K/64; ++ib) { | ||
| 1424 | const uint8x16_t v_x0 = vec_xl(0 , q4); | ||
| 1425 | const uint8x16_t v_x1 = vec_xl(QK4_NL/2, q4); | ||
| 1426 | q4 += 32; | ||
| 1427 | |||
| 1428 | int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m); | ||
| 1429 | int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4); | ||
| 1430 | int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m); | ||
| 1431 | int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4); | ||
| 1432 | |||
| 1433 | v_x0l = vec_perm(v_k, v_k, (uchar8x16_t)v_x0l); | ||
| 1434 | v_x0h = vec_perm(v_k, v_k, (uchar8x16_t)v_x0h); | ||
| 1435 | v_x1l = vec_perm(v_k, v_k, (uchar8x16_t)v_x1l); | ||
| 1436 | v_x1h = vec_perm(v_k, v_k, (uchar8x16_t)v_x1h); | ||
| 1437 | |||
| 1438 | const int8x16_t v_y0 = vec_xl( 0, q8); | ||
| 1439 | const int8x16_t v_y1 = vec_xl(16, q8); | ||
| 1440 | const int8x16_t v_y2 = vec_xl(32, q8); | ||
| 1441 | const int8x16_t v_y3 = vec_xl(48, q8); | ||
| 1442 | q8 += 64; | ||
| 1443 | |||
| 1444 | int32x4_t vsumi0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0l, v_y0), v_x0h, v_y1); | ||
| 1445 | int32x4_t vsumi1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1l, v_y2), v_x1h, v_y3); | ||
| 1446 | |||
| 1447 | int ls1 = ((x[ibl].scales_l[ib] & 0xF) | ((h << 4) & 0x30)) - 32; | ||
| 1448 | int ls2 = ((x[ibl].scales_l[ib] >> 4) | ((h << 2) & 0x30)) - 32; | ||
| 1449 | |||
| 1450 | h >>= 4; // consume the two 2-bit high fields used for ls1 and ls2 | ||
| 1451 | |||
| 1452 | sumi1 += vec_hsum_i32x4(vsumi0) * ls1; | ||
| 1453 | sumi2 += vec_hsum_i32x4(vsumi1) * ls2; | ||
| 1454 | } | ||
| 1455 | |||
| 1456 | sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2); | ||
| 1457 | } | ||
| 1458 | |||
| 1459 | *s = sumf; | ||
| 1460 | |||
| 1461 | #else | ||
| 1462 | UNUSED(x); | ||
| 1463 | UNUSED(y); | ||
| 1464 | UNUSED(nb); | ||
| 1465 | ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); | ||
| 1466 | #endif | ||
| 1467 | } | ||
| 1468 | |||
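Each IQ4_XS sub-block scale above is a 6-bit value split across two fields: a nibble in `scales_l` and a 2-bit pair in `scales_h`, biased by -32. A hedged helper making the `ls1`/`ls2` computation explicit for a single 32-weight sub-block index (`iq4_xs_scale` is hypothetical, assuming this file's existing headers):

```c
// Sketch: reassemble the 6-bit scale of sub-block ib (0..7), matching the
// ls1/ls2 computation in the loop above.
static int iq4_xs_scale(const block_iq4_xs * x, int ib) {
    const int lo = (x->scales_l[ib/2] >> 4*(ib%2)) & 0x0F;  // 4-bit low part
    const int hi = (x->scales_h >> 2*ib)           & 0x03;  // 2-bit high part
    return (lo | (hi << 4)) - 32;                           // signed range [-32, 31]
}
```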
