author    Mitja Felicijan <mitja.felicijan@gmail.com>  2026-02-12 20:57:17 +0100
committer Mitja Felicijan <mitja.felicijan@gmail.com>  2026-02-12 20:57:17 +0100
commit    b333b06772c89d96aacb5490d6a219fba7c09cc6 (patch)
tree      211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/ggml/src/ggml-cpu/arch/s390
download  llmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz
Engage!
Diffstat (limited to 'llama.cpp/ggml/src/ggml-cpu/arch/s390')
-rw-r--r--  llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp    50
-rw-r--r--  llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c       1468
2 files changed, 1518 insertions, 0 deletions
diff --git a/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp b/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp
new file mode 100644
index 0000000..5f4405a
--- /dev/null
+++ b/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp
@@ -0,0 +1,50 @@
#include "ggml-backend-impl.h"

#if defined(__s390x__)
#include <sys/auxv.h>

// find hwcap bits in asm/elf.h
#ifndef HWCAP_VXRS_EXT2
#define HWCAP_VXRS_EXT2 (1 << 15)
#endif

#ifndef HWCAP_NNPA
#define HWCAP_NNPA (1 << 20)
#endif

struct s390x_features {
    bool has_vxe2 = false;
    bool has_nnpa = false;

    s390x_features() {
        uint32_t hwcap = getauxval(AT_HWCAP);
        // NOTE: use hwcap2 with DFLT for z17 and later
        // uint32_t hwcap2 = getauxval(AT_HWCAP2);

        has_vxe2 = !!(hwcap & HWCAP_VXRS_EXT2);
        has_nnpa = !!(hwcap & HWCAP_NNPA);
    }
};

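// How the score is used (a reading of ggml's backend-loader behaviour, not a
// documented contract here): when several CPU backend variants are shipped,
// the loader is expected to pick the variant whose score function returns the
// highest non-zero value, so returning 0 below marks this variant as unusable
// on the running machine.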
static int ggml_backend_cpu_s390x_score() {
    int score = 1;
    s390x_features sf;

// IBM z15 / LinuxONE 3
#ifdef GGML_USE_VXE2
    if (!sf.has_vxe2) { return 0; }
    score += 1 << 1;
#endif

// IBM z16 / LinuxONE 4 and z17 / LinuxONE 5
#ifdef GGML_USE_NNPA
    if (!sf.has_nnpa) { return 0; }
    score += 1 << 2;
#endif

    return score;
}

GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_s390x_score)

#endif // __s390x__
diff --git a/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c b/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c
new file mode 100644
index 0000000..19d225a
--- /dev/null
+++ b/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c
@@ -0,0 +1,1468 @@
#define GGML_COMMON_IMPL_C
#include "ggml-common.h"
#include "ggml-quants.h"
#include "ggml-impl.h"
#include "ggml-cpu.h"
#include "simd-mappings.h"

#include "../../quants.h"
#include "../../ggml-cpu-impl.h"

#include <math.h>
#include <string.h>
#include <assert.h>
#include <float.h>
#include <stdlib.h> // for qsort
#include <stdio.h>  // for GGML_ASSERT

#define GROUP_MAX_EPS 1e-15f
#define GROUP_MAX_EPS_IQ3_XXS 1e-8f
#define GROUP_MAX_EPS_IQ2_S 1e-8f
#define GROUP_MAX_EPS_IQ1_M 1e-7f
#define GROUP_MAX_EPS_IQ1_S 1e-12f

#define UNUSED GGML_UNUSED

#if defined(__VXE__) || defined(__VXE2__)
#define B1(c,s,n)  0x ## n ## c , 0x ## n ## s
#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s)
#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s)
#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s)
#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s)
#define B8(c,s  ) B7(c,s,     c), B7(c,s,     s)
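// How the B macros expand: B1 emits the two entries for a single bit (clear
// -> hex byte c, set -> hex byte s, appended to the accumulated prefix n) and
// every further Bk doubles the entry count, so B8(c,s) enumerates all 256 bit
// patterns of one byte. Entry i is a uint64_t whose k-th byte encodes bit k
// of i, which is what lets a single table lookup expand 8 packed bits into
// 8 bytes.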

// precomputed tables for expanding 8bits to 8 bytes:
static const __attribute__((aligned(16))) uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b ) << 4
static const __attribute__((aligned(16))) uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4

// permute mask for byteswapping
static const uint8x16_t v_kperm = (const uint8x16_t){
     7,  6,  5,  4,  3,  2,  1,  0,
    15, 14, 13, 12, 11, 10,  9,  8
};
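// Why the byteswap is needed: s390x is big-endian, but the table entries
// above place the byte for bit k at the little end of each uint64_t, so a
// 16-byte load sees the two doublewords byte-reversed; v_kperm reverses each
// 8-byte half to restore bit order (see "fixing the byteorder" below).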
#endif

void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    assert(QK8_0 == 32);
    assert(k % QK8_0 == 0);
    const int nb = k / QK8_0;

    block_q8_0 * GGML_RESTRICT y = vy;

#if defined(__VXE__) || defined(__VXE2__)
    for (int i = 0; i < nb; i++) {
        float32x4_t srcv [8];
        float32x4_t asrcv[8];
        float32x4_t amaxv[8];

        for (int j = 0; j < 8; j++) srcv[j]  = vec_xl(0, x + i*32 + 4*j);
        for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
        for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
        for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
        for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);

        const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
                                   vec_extract(amaxv[0], 1)),
                               MAX(vec_extract(amaxv[0], 2),
                                   vec_extract(amaxv[0], 3)));

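        // amax is the largest |x| of the 32 floats after the vec_max
        // reduction tree above; d maps it onto the int8 maximum 127, and id
        // is the guarded reciprocal (an all-zero block quantizes to zeros)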
        const float d  = amax / ((1 << 7) - 1);
        const float id = d ? 1.0f / d : 0.0f;

        y[i].d = GGML_CPU_FP32_TO_FP16(d);

        for (int j = 0; j < 8; j++) {
            const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
            /* round to nearest, ties away from zero, before converting:
               VFISB M4 = 4 suppresses the inexact exception and M5 = 1
               selects the rounding mode, which is not the default used by
               vec_signed or vec_round */
            const int32x4_t vi = vec_signed(__builtin_s390_vfisb(v, 4, 1));

            y[i].qs[4*j + 0] = vec_extract(vi, 0);
            y[i].qs[4*j + 1] = vec_extract(vi, 1);
            y[i].qs[4*j + 2] = vec_extract(vi, 2);
            y[i].qs[4*j + 3] = vec_extract(vi, 3);
        }
    }
#else
    GGML_UNUSED(nb);
    // scalar
    quantize_row_q8_0_ref(x, y, k);
#endif
}

void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    assert(k % QK8_1 == 0);
    const int nb = k / QK8_1;

    block_q8_1 * GGML_RESTRICT y = vy;

#if defined(__VXE__) || defined(__VXE2__)
    for (int i = 0; i < nb; i++) {
        float32x4_t srcv [8];
        float32x4_t asrcv[8];
        float32x4_t amaxv[8];

        for (int j = 0; j < 8; j++) srcv[j]  = vec_xl(0, x + i*32 + 4*j);
        for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
        for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
        for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
        for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);

        const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
                                   vec_extract(amaxv[0], 1)),
                               MAX(vec_extract(amaxv[0], 2),
                                   vec_extract(amaxv[0], 3)));

        const float d  = amax / ((1 << 7) - 1);
        const float id = d ? 1.0f / d : 0.0f;

        y[i].d = GGML_CPU_FP32_TO_FP16(d);

        int32x4_t acc = vec_splats(0);

        for (int j = 0; j < 8; j++) {
            const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
            /* round to nearest, ties away from zero, before converting:
               VFISB M4 = 4 suppresses the inexact exception and M5 = 1
               selects the rounding mode, which is not the default used by
               vec_signed or vec_round */
            const int32x4_t vi = vec_signed(__builtin_s390_vfisb(v, 4, 1));

            y[i].qs[4*j + 0] = vec_extract(vi, 0);
            y[i].qs[4*j + 1] = vec_extract(vi, 1);
            y[i].qs[4*j + 2] = vec_extract(vi, 2);
            y[i].qs[4*j + 3] = vec_extract(vi, 3);

            acc = vec_add(acc, vi);
        }

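        // besides the quants, q8_1 caches s = d * sum(quants); the q4_1 and
        // q5_1 dot products below fold each x block's minimum m into the
        // result through this cached sum (summs += m * s)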
        y[i].s = GGML_CPU_FP32_TO_FP16(d * (acc[0] + acc[1] + acc[2] + acc[3]));
    }
#else
    GGML_UNUSED(nb);
    // scalar
    quantize_row_q8_1_ref(x, y, k);
#endif
}

//===================================== Dot products =================================

void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q4_0 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    int ib = 0;
    float sumf = 0;

#if defined(__VXE__) || defined(__VXE2__)
    float32x4_t acc = vec_splats(0.0f);

    const uint8x16_t v_m = vec_splats((const uint8_t)0x0F);
    const int8x16_t  v_s = vec_splats( (const int8_t)0x08);

    for (; ib < nb; ++ib) {
        const uint8x16_t v_x  = vec_xl(0, x[ib].qs);
        const int8x16_t  v_xl = (const int8x16_t)(v_x & v_m);
        const int8x16_t  v_xh = (const int8x16_t)(v_x >> 4);

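        // q4_0 nibbles are stored biased by 8 (raw 0..15 encodes -8..7), so
        // both halves are recentered with v_s before the integer dot product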
        const int8x16_t v_xls = vec_sub(v_xl, v_s);
        const int8x16_t v_xhs = vec_sub(v_xh, v_s);

        const int8x16_t v_yl = vec_xl(0      , y[ib].qs);
        const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs);

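        // int8 x int8 would overflow, so vec_mule/vec_mulo multiply the
        // even/odd lanes into int16; after adding the element-reversed copy
        // (vec_reve), lane i holds v[i] + v[7-i], so unpacking only the high
        // four lanes still covers all eight partial sums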
        const int16x8_t v_xylso = vec_mulo(v_xls, v_yl);
        const int16x8_t v_xylse = vec_mule(v_xls, v_yl);
        const int16x8_t v_xyhso = vec_mulo(v_xhs, v_yh);
        const int16x8_t v_xyhse = vec_mule(v_xhs, v_yh);

        int16x8_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse;
        v_xy_ += vec_reve(v_xy_);

        const float32x4_t v_xy = vec_float(vec_unpackh(v_xy_));
        const float32x4_t v_d  = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));

        acc = vec_madd(v_xy, v_d, acc);
    }

    sumf = vec_hsum_f32x4(acc);
    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}

void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_1;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q4_1 * GGML_RESTRICT x = vx;
    const block_q8_1 * GGML_RESTRICT y = vy;

    int ib = 0;
    float sumf = 0;

#if defined(__VXE__) || defined(__VXE2__)
    float summs = 0;
    float32x4_t acc = vec_splats(0.0f);

    const uint8x16_t v_m = vec_splat_u8(0x0F);

#pragma GCC unroll 4
    for (; ib < nb; ++ib) {
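        // GCC prefetch hints: second argument 0 = prefetch for read, third
        // argument 1 = low expected temporal locality (data streamed once)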
        __builtin_prefetch(x[ib].qs, 0, 1);
        __builtin_prefetch(y[ib].qs, 0, 1);

        summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);

        const uint8x16_t v_x  = vec_xl(0, x[ib].qs);
        const int8x16_t  v_xl = (const int8x16_t)(v_x & v_m);
        const int8x16_t  v_xh = (const int8x16_t)(v_x >> 4);

        const int8x16_t v_yl = vec_xl(0      , y[ib].qs);
        const int8x16_t v_yh = vec_xl(QK8_1/2, y[ib].qs);

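        // ggml_vec_dot (from ggml-cpu-impl.h) accumulates the pairwise int8
        // products of its two operands into the int32x4 accumulator, so the
        // chained call covers all 32 quants of the block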
        const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
        const float32x4_t v_xy = vec_float(v_xy_);

        const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));

        acc = vec_madd(v_xy, v_d, acc);
    }

    sumf = vec_hsum_f32x4(acc) + summs;
    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}

void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK_MXFP4 == 0);
    static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");

    const int qk = QK_MXFP4;
    const int nb = n / qk;

    const block_mxfp4 * GGML_RESTRICT x = vx;
    const block_q8_0  * GGML_RESTRICT y = vy;

    int ib = 0;
    float sumf = 0.0f;

#if defined(__VXE__) || defined(__VXE2__)
    const int8x16_t  v_k = vec_xl(0, kvalues_mxfp4);
    const uint8x16_t v_m = vec_splats((const uint8_t)0x0F);

    float32x4_t v_acc = vec_splats(0.0f);

    #pragma GCC unroll 8
    for (; ib + 1 < nb; ib += 2) {
        const block_mxfp4 * GGML_RESTRICT x0 = &x[ib + 0];
        const block_mxfp4 * GGML_RESTRICT x1 = &x[ib + 1];
        const block_q8_0  * GGML_RESTRICT y0 = &y[ib + 0];
        const block_q8_0  * GGML_RESTRICT y1 = &y[ib + 1];

        const uint8x16_t v_x0 = vec_xl(0, x0->qs);
        const uint8x16_t v_x1 = vec_xl(0, x1->qs);

        int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
        int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
        int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
        int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);

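        // vec_perm doubles as a 16-entry table lookup: each 4-bit code
        // selects one signed entry of the kvalues_mxfp4 codebook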
        v_x0l = vec_perm(v_k, v_k, (uchar8x16_t)v_x0l);
        v_x0h = vec_perm(v_k, v_k, (uchar8x16_t)v_x0h);
        v_x1l = vec_perm(v_k, v_k, (uchar8x16_t)v_x1l);
        v_x1h = vec_perm(v_k, v_k, (uchar8x16_t)v_x1h);

        const int8x16_t v_y0l = vec_xl(0      , y0->qs);
        const int8x16_t v_y0h = vec_xl(QK8_0/2, y0->qs);
        const int8x16_t v_y1l = vec_xl(0      , y1->qs);
        const int8x16_t v_y1h = vec_xl(QK8_0/2, y1->qs);

        const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0l, v_y0l), v_x0h, v_y0h);
        const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1l, v_y1l), v_x1h, v_y1h);

        const float32x4_t v_xy0f = vec_float(v_xy0);
        const float32x4_t v_xy1f = vec_float(v_xy1);

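        // x->e is an E8M0 scale, i.e. a bare power-of-two exponent; the _HALF
        // variant folds in a factor of 0.5, presumably compensating for the
        // kvalues_mxfp4 entries being stored doubled to stay integral (an
        // assumption from the naming, not verified here)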
        const float32x4_t v_d0 = vec_splats(GGML_E8M0_TO_FP32_HALF(x0->e) * GGML_CPU_FP16_TO_FP32(y0->d));
        const float32x4_t v_d1 = vec_splats(GGML_E8M0_TO_FP32_HALF(x1->e) * GGML_CPU_FP16_TO_FP32(y1->d));

        v_acc = vec_madd(v_xy0f, v_d0, v_acc);
        v_acc = vec_madd(v_xy1f, v_d1, v_acc);
    }

    for (; ib < nb; ++ib) {
        const block_mxfp4 * GGML_RESTRICT x0 = &x[ib + 0];
        const block_q8_0  * GGML_RESTRICT y0 = &y[ib + 0];

        const uint8x16_t v_x = vec_xl(0, x0->qs);

        int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
        int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);

        v_xl = vec_perm(v_k, v_k, (uchar8x16_t)v_xl);
        v_xh = vec_perm(v_k, v_k, (uchar8x16_t)v_xh);

        const int8x16_t v_yl = vec_xl(0      , y0->qs);
        const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);

        const int32x4_t   v_xy  = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
        const float32x4_t v_xyf = vec_float(v_xy);

        const float32x4_t v_d = vec_splats(GGML_E8M0_TO_FP32_HALF(x0->e) * GGML_CPU_FP16_TO_FP32(y0->d));
        v_acc = vec_madd(v_xyf, v_d, v_acc);
    }

    sumf = vec_hsum_f32x4(v_acc);
    *s = sumf;
#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}

void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(qk == QK5_0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q5_0 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    int ib = 0;
    float sumf = 0.0f;

#if defined(__VXE__) || defined(__VXE2__)
    float32x4_t v_sum0 = vec_splats(0.0f);
    float32x4_t v_sum1 = vec_splats(0.0f);

    uint32_t qh0, qh1;
    uint64_t tmp0[4], tmp1[4];

    const uint8x16_t v_m = vec_splats((uint8_t)0x0F);

    #pragma GCC unroll 4
    for (; ib + 1 < nb; ib += 2) {
        const block_q5_0 * GGML_RESTRICT x0 = &x[ib + 0];
        const block_q5_0 * GGML_RESTRICT x1 = &x[ib + 1];
        const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
        const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];

        memcpy(&qh0, x0->qh, sizeof(qh0));
        memcpy(&qh1, x1->qh, sizeof(qh1));

        tmp0[0] = table_b2b_1[(qh0 >>  0) & 0xFF];
        tmp0[1] = table_b2b_1[(qh0 >>  8) & 0xFF];
        tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF];
        tmp0[3] = table_b2b_1[(qh0 >> 24)       ];

        tmp1[0] = table_b2b_1[(qh1 >>  0) & 0xFF];
        tmp1[1] = table_b2b_1[(qh1 >>  8) & 0xFF];
        tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF];
        tmp1[3] = table_b2b_1[(qh1 >> 24)       ];

        int8x16_t v_qh0l = vec_xl(0, (const int8_t *)(tmp0 + 0));
        int8x16_t v_qh0h = vec_xl(0, (const int8_t *)(tmp0 + 2));
        int8x16_t v_qh1l = vec_xl(0, (const int8_t *)(tmp1 + 0));
        int8x16_t v_qh1h = vec_xl(0, (const int8_t *)(tmp1 + 2));

        // required for fixing the byteorder
        v_qh0l = vec_perm(v_qh0l, v_qh0l, v_kperm);
        v_qh0h = vec_perm(v_qh0h, v_qh0h, v_kperm);
        v_qh1l = vec_perm(v_qh1l, v_qh1l, v_kperm);
        v_qh1h = vec_perm(v_qh1h, v_qh1h, v_kperm);

        const uint8x16_t v_x0 = vec_xl(0, (const uint8_t *)x0->qs);
        const uint8x16_t v_x1 = vec_xl(0, (const uint8_t *)x1->qs);

        int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
        int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
        int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
        int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);

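        // table_b2b_1 expands each high bit b to (!b) << 4, i.e. 0x10 where
        // the bit is clear, so the subtractions below compute
        // nibble + 16*b - 16: the 5-bit value (nibble | b << 4) recentered
        // into [-16, 15] in a single vec_sub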
        const int8x16_t v_x0lf = vec_sub(v_x0l, v_qh0l);
        const int8x16_t v_x0hf = vec_sub(v_x0h, v_qh0h);
        const int8x16_t v_x1lf = vec_sub(v_x1l, v_qh1l);
        const int8x16_t v_x1hf = vec_sub(v_x1h, v_qh1h);

        const int8x16_t v_y0l = vec_xl(0      , (const int8_t *)y0->qs);
        const int8x16_t v_y0h = vec_xl(QK8_0/2, (const int8_t *)y0->qs);
        const int8x16_t v_y1l = vec_xl(0      , (const int8_t *)y1->qs);
        const int8x16_t v_y1h = vec_xl(QK8_0/2, (const int8_t *)y1->qs);

        const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h);
        const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h);

        const float32x4_t v_xy0f = vec_float(v_xy0);
        const float32x4_t v_xy1f = vec_float(v_xy1);

        const float32x4_t v_d0 = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
        const float32x4_t v_d1 = vec_splats(GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d));

        v_sum0 = vec_madd(v_xy0f, v_d0, v_sum0);
        v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
    }

    sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1);

    #pragma GCC unroll 4
    for (; ib < nb; ++ib) {
        const block_q5_0 * GGML_RESTRICT x0 = &x[ib];
        const block_q8_0 * GGML_RESTRICT y0 = &y[ib];

        uint32_t qh;
        memcpy(&qh, x0->qh, sizeof(qh));

        uint64_t tmp[4];
        tmp[0] = table_b2b_1[(qh >>  0) & 0xFF];
        tmp[1] = table_b2b_1[(qh >>  8) & 0xFF];
        tmp[2] = table_b2b_1[(qh >> 16) & 0xFF];
        tmp[3] = table_b2b_1[(qh >> 24)       ];

        int8x16_t v_qhl = vec_xl(0, (const int8_t *)(tmp + 0));
        int8x16_t v_qhh = vec_xl(0, (const int8_t *)(tmp + 2));

        // required for fixing the byteorder
        v_qhl = vec_perm(v_qhl, v_qhl, v_kperm);
        v_qhh = vec_perm(v_qhh, v_qhh, v_kperm);

        const uint8x16_t v_x = vec_xl(0, (const uint8_t *)x0->qs);
        int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
        int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);

        const int8x16_t v_xlf = vec_sub(v_xl, v_qhl);
        const int8x16_t v_xhf = vec_sub(v_xh, v_qhh);

        const int8x16_t v_yl = vec_xl(0      , (const int8_t *)y0->qs);
        const int8x16_t v_yh = vec_xl(QK8_0/2, (const int8_t *)y0->qs);

        const int32x4_t   v_xy  = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh);
        const float32x4_t v_xyf = vec_float(v_xy);

        const float32x4_t v_d   = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
        const float32x4_t v_acc = vec_madd(v_xyf, v_d, vec_splats(0.0f));

        sumf += vec_hsum_f32x4(v_acc);
    }

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}

void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_1;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(qk == QK5_1);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q5_1 * GGML_RESTRICT x = vx;
    const block_q8_1 * GGML_RESTRICT y = vy;

    int ib = 0;
    float sumf = 0.0f;

#if defined(__VXE__) || defined(__VXE2__)
    float32x4_t v_sum0 = vec_splats(0.0f);
    float32x4_t v_sum1 = vec_splats(0.0f);

    float summs0 = 0.0f;
    float summs1 = 0.0f;

    uint32_t qh0;
    uint32_t qh1;

    uint64_t tmp0[4];
    uint64_t tmp1[4];

    const uint8x16_t v_m = vec_splats((uint8_t)0x0F);

    #pragma GCC unroll 4
    for (; ib + 1 < nb; ib += 2) {
        const block_q5_1 * GGML_RESTRICT x0 = &x[ib + 0];
        const block_q5_1 * GGML_RESTRICT x1 = &x[ib + 1];
        const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0];
        const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1];

        summs0 += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s);
        summs1 += GGML_CPU_FP16_TO_FP32(x1->m) * GGML_CPU_FP16_TO_FP32(y1->s);

        memcpy(&qh0, x0->qh, sizeof(qh0));
        memcpy(&qh1, x1->qh, sizeof(qh1));

        tmp0[0] = table_b2b_0[(qh0 >>  0) & 0xFF];
        tmp0[1] = table_b2b_0[(qh0 >>  8) & 0xFF];
        tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF];
        tmp0[3] = table_b2b_0[(qh0 >> 24)       ];

        tmp1[0] = table_b2b_0[(qh1 >>  0) & 0xFF];
        tmp1[1] = table_b2b_0[(qh1 >>  8) & 0xFF];
        tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF];
        tmp1[3] = table_b2b_0[(qh1 >> 24)       ];

        int8x16_t v_qh0l = vec_xl(0, (const int8_t *)(tmp0 + 0));
        int8x16_t v_qh0h = vec_xl(0, (const int8_t *)(tmp0 + 2));
        int8x16_t v_qh1l = vec_xl(0, (const int8_t *)(tmp1 + 0));
        int8x16_t v_qh1h = vec_xl(0, (const int8_t *)(tmp1 + 2));

        // required for fixing the byteorder
        v_qh0l = vec_perm(v_qh0l, v_qh0l, v_kperm);
        v_qh0h = vec_perm(v_qh0h, v_qh0h, v_kperm);
        v_qh1l = vec_perm(v_qh1l, v_qh1l, v_kperm);
        v_qh1h = vec_perm(v_qh1h, v_qh1h, v_kperm);

        const uint8x16_t v_x0 = vec_xl(0, x0->qs);
        const uint8x16_t v_x1 = vec_xl(0, x1->qs);

        const int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
        const int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
        const int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
        const int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);

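        // unlike q5_0 there is no recentering here: q5_1 carries its block
        // minimum in x->m (already accounted for via summs above), so
        // table_b2b_0 expands bit b to b << 4 and it is simply OR'd in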
        const int8x16_t v_x0lf = vec_or(v_x0l, v_qh0l);
        const int8x16_t v_x0hf = vec_or(v_x0h, v_qh0h);
        const int8x16_t v_x1lf = vec_or(v_x1l, v_qh1l);
        const int8x16_t v_x1hf = vec_or(v_x1h, v_qh1h);

        const int8x16_t v_y0l = vec_xl(0      , y0->qs);
        const int8x16_t v_y0h = vec_xl(QK8_1/2, y0->qs);
        const int8x16_t v_y1l = vec_xl(0      , y1->qs);
        const int8x16_t v_y1h = vec_xl(QK8_1/2, y1->qs);

        const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h);
        const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h);

        const float32x4_t v_xy0f = vec_float(v_xy0);
        const float32x4_t v_xy1f = vec_float(v_xy1);

        const float32x4_t v_d0 = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
        const float32x4_t v_d1 = vec_splats(GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d));

        v_sum0 = vec_madd(v_xy0f, v_d0, v_sum0);
        v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
    }

    sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1) + summs0 + summs1;

    #pragma GCC unroll 4
    for (; ib < nb; ++ib) {
        const block_q5_1 * GGML_RESTRICT x0 = &x[ib];
        const block_q8_1 * GGML_RESTRICT y0 = &y[ib];

        float summs = GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s);

        uint32_t qh;
        memcpy(&qh, x0->qh, sizeof(qh));

        uint64_t tmp[4];
        tmp[0] = table_b2b_0[(qh >>  0) & 0xFF];
        tmp[1] = table_b2b_0[(qh >>  8) & 0xFF];
        tmp[2] = table_b2b_0[(qh >> 16) & 0xFF];
        tmp[3] = table_b2b_0[(qh >> 24)       ];

        int8x16_t v_qhl = vec_xl(0, (const int8_t *)(tmp + 0));
        int8x16_t v_qhh = vec_xl(0, (const int8_t *)(tmp + 2));

        // required for fixing the byteorder
        v_qhl = vec_perm(v_qhl, v_qhl, v_kperm);
        v_qhh = vec_perm(v_qhh, v_qhh, v_kperm);

        const uint8x16_t v_x  = vec_xl(0, x0->qs);
        const int8x16_t  v_xl = (int8x16_t)vec_and(v_x, v_m);
        const int8x16_t  v_xh = (int8x16_t)vec_sr(v_x, 4);

        const int8x16_t v_xlf = vec_or(v_xl, v_qhl);
        const int8x16_t v_xhf = vec_or(v_xh, v_qhh);

        const int8x16_t v_yl = vec_xl(0      , y0->qs);
        const int8x16_t v_yh = vec_xl(QK8_1/2, y0->qs);

        const int32x4_t   v_xy  = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh);
        const float32x4_t v_xyf = vec_float(v_xy);

        const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
        const float32x4_t v_acc = vec_madd(v_xyf, v_d, vec_splats(0.0f));

        sumf += vec_hsum_f32x4(v_acc) + summs;
    }

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}

void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q8_0 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    int ib = 0;
    float sumf = 0;

#if defined(__VXE__) || defined(__VXE2__)
    float32x4_t acc = vec_splats(0.0f);

#pragma GCC unroll 8
    for (; ib < nb; ++ib) {
        __builtin_prefetch(x[ib].qs, 0, 1);
        __builtin_prefetch(y[ib].qs, 0, 1);

        const int8x16_t v_xl = vec_xl(0      , x[ib].qs);
        const int8x16_t v_xh = vec_xl(QK8_0/2, x[ib].qs);
        const int8x16_t v_yl = vec_xl(0      , y[ib].qs);
        const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs);

        const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
        const float32x4_t v_xy = vec_float(v_xy_);
        const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));

        acc = vec_madd(v_xy, v_d, acc);
    }

    sumf = vec_hsum_f32x4(acc);

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}

void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const uint32_t kmask1 = 0x03030303;
    const uint32_t kmask2 = 0x0f0f0f0f;

    const block_q3_K * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

#if defined(__VXE__) || defined(__VXE2__)
    uint32_t aux[3];
    uint32_t utmp[4];

    const int32x4_t  v_z  = vec_splat_s32(0);
    const uint8x16_t v_3m = vec_splat_u8(0x03);

    const uint8x16_t v_0c = vec_splat_u8(1);
    const uint8x16_t v_1c = vec_sl(v_0c, 1);
    const uint8x16_t v_2c = vec_sl(v_0c, 2);
    const uint8x16_t v_3c = vec_sl(v_0c, 3);

    uint8x16_t q3h[4];
    uint8x16_t q3b[2];
    int8x16_t  q3bytes[4];
    int8x16_t  q8bytes[8];
    uint8x16_t qhbits[2];

    float sum = 0;

    for (int i = 0; i < nb; ++i) {
        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);

        const uint8_t * GGML_RESTRICT x0l = x[i].qs;
        const uint8_t * GGML_RESTRICT x0h = x[i].hmask;
        const int8_t  * GGML_RESTRICT y0  = y[i].qs;

        qhbits[0] = vec_xl(0 , x0h);
        qhbits[1] = vec_xl(16, x0h);

        int32_t isum = 0;

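        // x[i].scales packs sixteen 6-bit subblock scales into 12 bytes; the
        // kmask shuffles below rebuild them as 16 separate bytes in utmp,
        // and the -32 recenters each scale into signed range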
        memcpy(aux, x[i].scales, 12);
        utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
        utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
        utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
        utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);

        int8_t * scale = (int8_t *)utmp;
        for (int j = 0; j < 16; ++j) scale[j] -= 32;

        for (int j = 0; j < QK_K/128; ++j) {
            int32x4_t isum0, isum1, isum2, isum3;

            q3b[0] = vec_xl(0 , x0l);
            q3b[1] = vec_xl(16, x0l);
            x0l += 32;

            q8bytes[0] = vec_xl(0  , y0);
            q8bytes[1] = vec_xl(16 , y0);
            q8bytes[2] = vec_xl(32 , y0);
            q8bytes[3] = vec_xl(48 , y0);
            q8bytes[4] = vec_xl(64 , y0);
            q8bytes[5] = vec_xl(80 , y0);
            q8bytes[6] = vec_xl(96 , y0);
            q8bytes[7] = vec_xl(112, y0);
            y0 += 128;

            q3h[0] = vec_sl(vec_andc(v_0c, qhbits[0]), 2);
            q3h[1] = vec_sl(vec_andc(v_0c, qhbits[1]), 2);
            q3h[2] = vec_sl(vec_andc(v_1c, qhbits[0]), 1);
            q3h[3] = vec_sl(vec_andc(v_1c, qhbits[1]), 1);

            q3bytes[0] = vec_sub((int8x16_t)vec_and(q3b[0], v_3m), (int8x16_t)q3h[0]);
            q3bytes[1] = vec_sub((int8x16_t)vec_and(q3b[1], v_3m), (int8x16_t)q3h[1]);
            q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 2), v_3m), (int8x16_t)q3h[2]);
            q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 2), v_3m), (int8x16_t)q3h[3]);

            isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[0]);
            isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[1]);
            isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[2]);
            isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[3]);

            isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
            isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
            isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
            isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];

            scale += 4;

            q3h[0] = vec_andc(v_2c, qhbits[0]);
            q3h[1] = vec_andc(v_2c, qhbits[1]);
            q3h[2] = vec_sr(vec_andc(v_3c, qhbits[0]), 1);
            q3h[3] = vec_sr(vec_andc(v_3c, qhbits[1]), 1);

            q3bytes[0] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 4), v_3m), (int8x16_t)q3h[0]);
            q3bytes[1] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 4), v_3m), (int8x16_t)q3h[1]);
            q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 6), v_3m), (int8x16_t)q3h[2]);
            q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 6), v_3m), (int8x16_t)q3h[3]);

            isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[4]);
            isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[5]);
            isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[6]);
            isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[7]);

            isum += vec_hsum_i32x4(isum0) * scale[0];
            isum += vec_hsum_i32x4(isum1) * scale[1];
            isum += vec_hsum_i32x4(isum2) * scale[2];
            isum += vec_hsum_i32x4(isum3) * scale[3];

            scale += 4;

            if (j == 0) {
                qhbits[0] = vec_sr(qhbits[0], 4);
                qhbits[1] = vec_sr(qhbits[1], 4);
            }
        }

        sum += d * isum;
    }

    *s = sum;

#else
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}

void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q4_K * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    static const uint32_t kmask1 = 0x3f3f3f3f;
    static const uint32_t kmask2 = 0x0f0f0f0f;
    static const uint32_t kmask3 = 0x03030303;

    uint32_t utmp[4];

#if defined(__VXE__) || defined(__VXE2__)
    const uint8x16_t v_lm = vec_splat_u8(0x0F);
    const int32x4_t  v_z  = vec_splat_s32(0);

    uint8x16_t v_x[2];
    int8x16_t  v_xl[2];
    int8x16_t  v_y[2];

    float sumf = 0;

    for (int i = 0; i < nb; ++i) {
        const float d    = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

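        // q8_K caches bsums, the per-16-element sums of its quants;
        // multiplying them by the unpacked subblock mins and dmin (below)
        // removes the q4_K minimum contribution up front, so the main loop
        // only has to handle the unsigned 4-bit quants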
        const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
        const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
        const int16x8_t v_ysums  = vec_padd_s16(v_ysumsl, v_ysumsh);

        memcpy(utmp, x[i].scales, 12);

        uint32x4_t v_mins8 = { 0 };
        v_mins8 = vec_insert(utmp[1] & kmask1, v_mins8, 0);
        v_mins8 = vec_insert(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), v_mins8, 1);

        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[0] &= kmask1;

        const int16x8_t v_minsh = (int16x8_t)vec_unpackh((uint8x16_t)v_mins8);

        const int32x4_t v_minso = vec_mulo(v_ysums, v_minsh);
        const int32x4_t v_minse = vec_mule(v_ysums, v_minsh);
        const int32x4_t v_mins  = v_minso + v_minse;
        sumf -= dmin * (v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3]);

        const uint8_t * scales = (const uint8_t *)utmp;
        const uint8_t * GGML_RESTRICT x0 = x[i].qs;
        const int8_t  * GGML_RESTRICT y0 = y[i].qs;

        int32_t sumi1 = 0;
        int32_t sumi2 = 0;

        for (int j = 0; j < QK_K/64; ++j) {
            v_x[0] = vec_xl(0 , x0);
            v_x[1] = vec_xl(16, x0);
            x0 += 32;

            v_y[0] = vec_xl(0 , y0);
            v_y[1] = vec_xl(16, y0);
            y0 += 32;

            v_xl[0] = (int8x16_t)vec_and(v_x[0], v_lm);
            v_xl[1] = (int8x16_t)vec_and(v_x[1], v_lm);

            const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
            sumi1 += vec_hsum_i32x4(p1) * scales[2*j+0];

            v_y[0] = vec_xl(0 , y0);
            v_y[1] = vec_xl(16, y0);
            y0 += 32;

            v_xl[0] = (int8x16_t)vec_sr(v_x[0], 4);
            v_xl[1] = (int8x16_t)vec_sr(v_x[1], 4);

            const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
            sumi2 += vec_hsum_i32x4(p2) * scales[2*j+1];
        }

        sumf += d * (sumi1 + sumi2);
    }

    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(utmp);
    ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}

void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q5_K * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    static const uint32_t kmask1 = 0x3f3f3f3f;
    static const uint32_t kmask2 = 0x0f0f0f0f;
    static const uint32_t kmask3 = 0x03030303;

    uint32_t utmp[4];

#if defined(__VXE__) || defined(__VXE2__)
    const uint8x16_t v_lm = vec_splat_u8(0x0F);
    const uint8x16_t v_1m = vec_splat_u8(0x01);
    const uint8x16_t v_2m = vec_splat_u8(0x02);

    const int32x4_t v_z = vec_splat_s32(0);

    const uchar8x16_t v_minsm = {
        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
    };

    int8x16_t  q5b[4];
    uint8x16_t q5h[4];

    uint8x16_t v_xl[2];
    uint8x16_t v_xh[2];
    int8x16_t  v_y[4];

    float sumf = 0;

    for (int i = 0; i < nb; ++i) {
        const float d    = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

        const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
        const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
        const int16x8_t v_ysums  = vec_padd_s16(v_ysumsl, v_ysumsh);

        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        const uint8x16_t v_mins16 = vec_xl(0, (const uint8_t *)utmp);
        const uint8x16_t v_mins8  = vec_perm(v_mins16, v_mins16, v_minsm);
        const int16x8_t  v_minsh  = (int16x8_t)vec_unpackh(v_mins8);

        const int32x4_t v_minsho = vec_mulo(v_ysums, v_minsh);
        const int32x4_t v_minshe = vec_mule(v_ysums, v_minsh);
        const int32x4_t v_mins   = vec_add(v_minsho, v_minshe);
        const int32_t   mins     = vec_hsum_i32x4(v_mins);
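        // same bsums trick as in q4_K above: the dmin * mins term subtracted
        // at the end of the block accounts for all subblock minimums at once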

        const uint8_t * scales = (const uint8_t *)utmp;
        const uint8_t * GGML_RESTRICT x0l = x[i].qs;
        const uint8_t * GGML_RESTRICT x0h = x[i].qh;
        const int8_t  * GGML_RESTRICT y0  = y[i].qs;

        v_xh[0] = vec_xl(0 , x0h);
        v_xh[1] = vec_xl(16, x0h);

        int32_t sumi = 0;
        for (int j = 0; j < QK_K/64; ++j) {
            v_xl[0] = vec_xl(0 , x0l);
            v_xl[1] = vec_xl(16, x0l);
            x0l += 32;

            v_y[0] = vec_xl(0 , y0);
            v_y[1] = vec_xl(16, y0);
            v_y[2] = vec_xl(32, y0);
            v_y[3] = vec_xl(48, y0);
            y0 += 64;

            q5h[0] = vec_sl(vec_and(v_1m, v_xh[0]), 4);
            q5h[1] = vec_sl(vec_and(v_1m, v_xh[1]), 4);
            q5h[2] = vec_sl(vec_and(v_2m, v_xh[0]), 3);
            q5h[3] = vec_sl(vec_and(v_2m, v_xh[1]), 3);
            v_xh[0] = vec_sr(v_xh[0], 2);
            v_xh[1] = vec_sr(v_xh[1], 2);

            q5b[0] = (int8x16_t)vec_or(vec_and(v_xl[0], v_lm), q5h[0]);
            q5b[1] = (int8x16_t)vec_or(vec_and(v_xl[1], v_lm), q5h[1]);
            q5b[2] = (int8x16_t)vec_or(vec_sr(v_xl[0], 4), q5h[2]);
            q5b[3] = (int8x16_t)vec_or(vec_sr(v_xl[1], 4), q5h[3]);

            int32x4_t sumi0 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[0], v_y[0]), q5b[1], v_y[1]);
            int32x4_t sumi1 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[2], v_y[2]), q5b[3], v_y[3]);

            sumi += vec_hsum_i32x4(sumi0) * *scales++;
            sumi += vec_hsum_i32x4(sumi1) * *scales++;
        }

        sumf += d * sumi - dmin * mins;
    }

    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(utmp);
    ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}

void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q6_K * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

#if defined(__VXE__) || defined(__VXE2__)
    float sum = 0;

    // Lower 4-bit and upper 2-bit masks
    const uint8x16_t v_lm = vec_splat_u8(0x0F);
    const uint8x16_t v_um = vec_splat_u8(0x03);

    const int32x4_t v_z = vec_splat_s32(0);

    int8x16_t  q6b[4];
    uint8x16_t q6h[4];

    uint8x16_t v_xl[4];
    uint8x16_t v_xh[2];
    int8x16_t  v_y[4];

    for (int i = 0; i < nb; ++i) {
        const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d);

        const uint8_t * GGML_RESTRICT x0l = x[i].ql;
        const uint8_t * GGML_RESTRICT x0h = x[i].qh;
        const int8_t  * GGML_RESTRICT y0  = y[i].qs;

        const int8_t * GGML_RESTRICT scale = x[i].scales;

        const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
        const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);

        const int8x16_t  v_scale  = vec_xl(0, scale);
        const int16x8_t  v_scalel = vec_unpackh(v_scale);
        const int16x8_t  v_scaleh = vec_unpackl(v_scale);

        const int32x4_t v_minslo = vec_mulo(v_ysumsl, v_scalel);
        const int32x4_t v_minsle = vec_mule(v_ysumsl, v_scalel);
        const int32x4_t v_minsho = vec_mulo(v_ysumsh, v_scaleh);
        const int32x4_t v_minshe = vec_mule(v_ysumsh, v_scaleh);
        const int32x4_t v_mins   = v_minslo + v_minsle + v_minsho + v_minshe;

        const int32_t mins = vec_hsum_i32x4(v_mins);
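        // q6_K quants are stored biased by 32 (raw 0..63 encodes -32..31);
        // instead of subtracting 32 per lane, the bias is removed once per
        // block as 32 * sum(scale * bsums), i.e. the (isum - 32 * mins) below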

        int32_t isum = 0;
        for (int j = 0; j < QK_K/128; ++j) {
            // Load model upper 2 bits
            v_xh[0] = vec_xl(0 , x0h);
            v_xh[1] = vec_xl(16, x0h);
            x0h += 32;

            // Load model lower 4 bits
            v_xl[0] = vec_xl(0 , x0l);
            v_xl[1] = vec_xl(16, x0l);
            v_xl[2] = vec_xl(32, x0l);
            v_xl[3] = vec_xl(48, x0l);
            x0l += 64;

            // Load activation quants
            v_y[0] = vec_xl(0 , y0);
            v_y[1] = vec_xl(16, y0);
            v_y[2] = vec_xl(32, y0);
            v_y[3] = vec_xl(48, y0);
            y0 += 64;

            q6h[0] = vec_sl(vec_and(v_um, v_xh[0]), 4);
            q6h[1] = vec_sl(vec_and(v_um, v_xh[1]), 4);
            uint8x16_t shifted = vec_sr(v_xh[0], 2);
            q6h[2] = vec_sl(vec_and(v_um, shifted), 4);
            shifted = vec_sr(v_xh[1], 2);
            q6h[3] = vec_sl(vec_and(v_um, shifted), 4);

            q6b[0] = (int8x16_t)(vec_or(vec_and(v_xl[0], v_lm), q6h[0]));
            q6b[1] = (int8x16_t)(vec_or(vec_and(v_xl[1], v_lm), q6h[1]));
            q6b[2] = (int8x16_t)(vec_or(vec_and(v_xl[2], v_lm), q6h[2]));
            q6b[3] = (int8x16_t)(vec_or(vec_and(v_xl[3], v_lm), q6h[3]));

            int32x4_t summs0 = ggml_vec_dot(v_z, q6b[0], v_y[0]);
            int32x4_t summs1 = ggml_vec_dot(v_z, q6b[1], v_y[1]);
            int32x4_t summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
            int32x4_t summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);

            isum += vec_hsum_i32x4(summs0) * scale[0] +
                    vec_hsum_i32x4(summs1) * scale[1] +
                    vec_hsum_i32x4(summs2) * scale[2] +
                    vec_hsum_i32x4(summs3) * scale[3];

            scale += 4;

            // Load activation quants
            v_y[0] = vec_xl(0 , y0);
            v_y[1] = vec_xl(16, y0);
            v_y[2] = vec_xl(32, y0);
            v_y[3] = vec_xl(48, y0);
            y0 += 64;

            shifted = vec_sr(v_xh[0], 4);
            q6h[0] = vec_sl(vec_and(v_um, shifted), 4);
            shifted = vec_sr(v_xh[1], 4);
            q6h[1] = vec_sl(vec_and(v_um, shifted), 4);
            shifted = vec_sr(v_xh[0], 6);
            q6h[2] = vec_sl(vec_and(v_um, shifted), 4);
            shifted = vec_sr(v_xh[1], 6);
            q6h[3] = vec_sl(vec_and(v_um, shifted), 4);

            q6b[0] = (int8x16_t)(vec_or(vec_sr(v_xl[0], 4), q6h[0]));
            q6b[1] = (int8x16_t)(vec_or(vec_sr(v_xl[1], 4), q6h[1]));
            q6b[2] = (int8x16_t)(vec_or(vec_sr(v_xl[2], 4), q6h[2]));
            q6b[3] = (int8x16_t)(vec_or(vec_sr(v_xl[3], 4), q6h[3]));

            summs0 = ggml_vec_dot(v_z, q6b[0], v_y[0]);
            summs1 = ggml_vec_dot(v_z, q6b[1], v_y[1]);
            summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
            summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);

            isum += vec_hsum_i32x4(summs0) * scale[0] +
                    vec_hsum_i32x4(summs1) * scale[1] +
                    vec_hsum_i32x4(summs2) * scale[2] +
                    vec_hsum_i32x4(summs3) * scale[3];

            scale += 4;
        }

        sum += d_all * y[i].d * (isum - 32 * mins);
    }

    *s = sum;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}

// #if defined(__VXE__) || defined(__VXE2__)
// static const int8_t keven_signs_q2xs[1024] = {
//      1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1,  1,
//      1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1,  1,  1, -1, -1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1, -1,
//      1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1, -1,
//      1,  1, -1, -1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1, -1,  1, -1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1,  1,
//      1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1, -1,
//      1,  1, -1,  1, -1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1, -1,  1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1,  1,
//      1,  1,  1, -1, -1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1,  1,
//      1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1,  1,  1, -1, -1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1, -1,
//      1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1,  1, -1,  1, -1,
//      1,  1, -1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1,  1,
//      1,  1,  1, -1,  1, -1,  1,  1, -1,  1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1,  1,
//      1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1,  1,  1, -1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1,
//      1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1,  1,
//      1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1, -1,
//      1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1,  1,  1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1, -1,
//      1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1,  1,
//      1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1, -1, -1,
//      1,  1, -1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1,  1,
//      1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1, -1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1,  1,
//      1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1,  1,  1, -1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1, -1,
//      1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1, -1,  1, -1, -1,  1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1,  1,
//      1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1, -1,
//      1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1,  1,  1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1, -1,
//      1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1, -1,  1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1,  1,
//      1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1, -1, -1, -1,  1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1,  1,
//      1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1, -1,  1,  1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1, -1,
//      1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1, -1,
//      1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1,  1,
//      1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1, -1,  1,  1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1, -1,
//      1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1, -1,  1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1,  1,
//      1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1,  1,
//      1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1,  1,  1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,
// };
// #endif

// void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
//     assert(n % QK_K == 0);
//     assert(nrc == 1);
//     UNUSED(nrc);
//     UNUSED(bx);
//     UNUSED(by);
//     UNUSED(bs);

//     const block_iq2_xxs * GGML_RESTRICT x = vx;
//     const block_q8_K    * GGML_RESTRICT y = vy;

//     const int nb = n / QK_K;

// #if defined(__VXE__) || defined(__VXE2__)
//     const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;

//     uint32_t aux32[4];
//     const uint8_t * aux8 = (const uint8_t *)aux32;

//     float sumf = 0;

//     for (int i = 0; i < nb; ++i) {
//         const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
//         const uint16_t * GGML_RESTRICT q2 = x[i].qs;
//         const int8_t   * GGML_RESTRICT q8 = y[i].qs;

//         float sumf1 = 0, sumf2 = 0;

//         for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
//             int8x16_t q8b0 = vec_xl( 0, q8);
//             int8x16_t q8b1 = vec_xl(16, q8);
//             int8x16_t q8b2 = vec_xl(32, q8);
//             int8x16_t q8b3 = vec_xl(48, q8);
//             q8 += 64;

//             memcpy(aux32, q2, 4 * sizeof(uint32_t));
//             q2 += 8;

//             int8x16_t q2u0 = { *(const int64_t *)(iq2xxs_grid + aux8[ 0]), *(const int64_t *)(iq2xxs_grid + aux8[ 1]) };
//             int8x16_t q2u1 = { *(const int64_t *)(iq2xxs_grid + aux8[ 2]), *(const int64_t *)(iq2xxs_grid + aux8[ 3]) };
//             int8x16_t q2u2 = { *(const int64_t *)(iq2xxs_grid + aux8[ 8]), *(const int64_t *)(iq2xxs_grid + aux8[ 9]) };
//             int8x16_t q2u3 = { *(const int64_t *)(iq2xxs_grid + aux8[10]), *(const int64_t *)(iq2xxs_grid + aux8[11]) };

//             int8x16_t q2s0 = { *(const int64_t *)(signs64 + ((aux32[1] >>  0) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >>  7) & 127)) };
//             int8x16_t q2s1 = { *(const int64_t *)(signs64 + ((aux32[1] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 21) & 127)) };
//             int8x16_t q2s2 = { *(const int64_t *)(signs64 + ((aux32[3] >>  0) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >>  7) & 127)) };
//             int8x16_t q2s3 = { *(const int64_t *)(signs64 + ((aux32[3] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 21) & 127)) };

//             q2u0 = vec_mul(q2u0, q2s0);
//             q2u1 = vec_mul(q2u1, q2s1);
//             q2u2 = vec_mul(q2u2, q2s2);
//             q2u3 = vec_mul(q2u3, q2s3);

//             const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(vec_splat_s32(0), q2u0, q8b0), q2u1, q8b1);
//             const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(vec_splat_s32(0), q2u2, q8b2), q2u3, q8b3);

//             sumf1 += (p1[0] + p1[1] + p1[2] + p1[3]) * (0.5f + (aux32[1] >> 28));
//             sumf2 += (p2[0] + p2[1] + p2[2] + p2[3]) * (0.5f + (aux32[3] >> 28));
//         }

//         sumf += d * (sumf1 + sumf2);
//     }

//     *s = 0.25f * sumf;

// #else

//     uint32_t aux32[2];
//     const uint8_t * aux8 = (const uint8_t *)aux32;

//     float sumf = 0.f;
//     for (int i = 0; i < nb; ++i) {
//         const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
//         const uint16_t * GGML_RESTRICT q2 = x[i].qs;
//         const int8_t   * GGML_RESTRICT q8 = y[i].qs;
//         int32_t bsum = 0;
//         for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
//             memcpy(aux32, q2, 2*sizeof(uint32_t));
//             q2 += 4;
//             const uint32_t ls = 2*(aux32[1] >> 28) + 1;
//             int32_t sumi = 0;
//             for (int l = 0; l < 4; ++l) {
//                 const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
//                 const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
//                 for (int j = 0; j < 8; ++j) {
//                     sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
//                 }
//                 q8 += 8;
//             }
//             bsum += sumi * ls;
//         }
//         sumf += d * bsum;
//     }
//     *s = 0.125f * sumf;
// #endif
// }

void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK4_NL == 0);
    static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");

    const block_iq4_nl * GGML_RESTRICT x = vx;
    const block_q8_0   * GGML_RESTRICT y = vy;

    const int nb = n / QK4_NL;

    int ib = 0;
    float sumf = 0;

#if defined(__VXE__) || defined(__VXE2__)
    const int8x16_t  v_k = vec_xl(0, kvalues_iq4nl);
    const uint8x16_t v_m = vec_splat_u8(0x0F);

    for (; ib < nb; ++ib) {
        const block_iq4_nl * GGML_RESTRICT x0 = &x[ib];
        const block_q8_0   * GGML_RESTRICT y0 = &y[ib];

        const uint8x16_t v_x = vec_xl(0, x0->qs);
        int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
        int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);

        v_xl = vec_perm(v_k, v_k, (uchar8x16_t)v_xl);
        v_xh = vec_perm(v_k, v_k, (uchar8x16_t)v_xh);

        const int8x16_t v_yl = vec_xl(0      , y0->qs);
        const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
        const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);

        sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * vec_hsum_i32x4(v_xy);
    }

    *s = sumf;
#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}

void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK_K == 0);

    const block_iq4_xs * GGML_RESTRICT x = vx;
    const block_q8_K   * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

#if defined(__VXE__) || defined(__VXE2__)
    const int8x16_t  v_k = vec_xl(0, kvalues_iq4nl);
    const uint8x16_t v_m = vec_splat_u8(0x0F);

    float sumf = 0;

    for (int ibl = 0; ibl < nb; ++ibl) {
        const uint8_t * GGML_RESTRICT q4 = x[ibl].qs;
        const int8_t  * GGML_RESTRICT q8 = y[ibl].qs;

        uint16_t h = x[ibl].scales_h;

        int sumi1 = 0, sumi2 = 0;
        for (int ib = 0; ib < QK_K/64; ++ib) {
            const uint8x16_t v_x0 = vec_xl(0       , q4);
            const uint8x16_t v_x1 = vec_xl(QK4_NL/2, q4);
            q4 += 32;

            int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
            int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
            int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
            int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);

            v_x0l = vec_perm(v_k, v_k, (uchar8x16_t)v_x0l);
            v_x0h = vec_perm(v_k, v_k, (uchar8x16_t)v_x0h);
            v_x1l = vec_perm(v_k, v_k, (uchar8x16_t)v_x1l);
            v_x1h = vec_perm(v_k, v_k, (uchar8x16_t)v_x1h);

            const int8x16_t v_y0 = vec_xl( 0, q8);
            const int8x16_t v_y1 = vec_xl(16, q8);
            const int8x16_t v_y2 = vec_xl(32, q8);
            const int8x16_t v_y3 = vec_xl(48, q8);
            q8 += 64;

            int32x4_t vsumi0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0l, v_y0), v_x0h, v_y1);
            int32x4_t vsumi1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1l, v_y2), v_x1h, v_y3);

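            // each subblock scale is 6 bits split across two fields: the low
            // 4 bits come from scales_l, the top 2 bits from scales_h (h is
            // consumed 4 bits per iteration, 2 bits per subblock), and the
            // -32 recenters the result into signed range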
            int ls1 = ((x[ibl].scales_l[ib] & 0xF) | ((h << 4) & 0x30)) - 32;
            int ls2 = ((x[ibl].scales_l[ib] >>  4) | ((h << 2) & 0x30)) - 32;

            h >>= 4;

            sumi1 += vec_hsum_i32x4(vsumi0) * ls1;
            sumi2 += vec_hsum_i32x4(vsumi1) * ls2;
        }

        sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
    }

    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}