1#define GGML_COMMON_IMPL_C
2#include "ggml-common.h"
3#include "ggml-quants.h"
4#include "ggml-impl.h"
5#include "ggml-cpu.h"
6#include "simd-mappings.h"
7
8#include "../../quants.h"
9#include "../../ggml-cpu-impl.h"
10
11#include <math.h>
12#include <string.h>
13#include <assert.h>
14#include <float.h>
15#include <stdlib.h> // for qsort
16#include <stdio.h> // for GGML_ASSERT
17
18#define GROUP_MAX_EPS 1e-15f
19#define GROUP_MAX_EPS_IQ3_XXS 1e-8f
20#define GROUP_MAX_EPS_IQ2_S 1e-8f
21#define GROUP_MAX_EPS_IQ1_M 1e-7f
22#define GROUP_MAX_EPS_IQ1_S 1e-12f
23
24#define UNUSED GGML_UNUSED
25
#if defined(__POWER9_VECTOR__)
// B1..B8: recursive token-pasting helpers that enumerate all 2^8 bit patterns
// of a byte. For every bit, hex digit 'c' is emitted when the bit is clear and
// 's' when it is set, so B8(c,s) expands to 256 comma-separated 64-bit
// initializers — one nibble per source bit. Used below to expand each possible
// byte into 8 nibbles at table-build time.
#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s
#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s)
#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s)
#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s)
#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s)
#define B8(c,s ) B7(c,s, c), B7(c,s, s)

// precomputed tables for expanding 8bits to 8 bytes:
static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4
static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
#endif
40
// Quantize k floats from x into q8_0 blocks at vy: each block of QK8_0 == 32
// values stores an fp16 scale d = amax/127 plus 32 int8 quants round(x/d).
// POWER9 path vectorizes with VSX; other builds defer to the scalar reference.
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    assert(QK8_0 == 32);
    assert(k % QK8_0 == 0);
    const int nb = k / QK8_0;

    block_q8_0 * GGML_RESTRICT y = vy;

#if defined(__POWER9_VECTOR__)
    for (int i = 0; i < nb; i++) {
        vector float srcv [8];
        vector float asrcv[8];
        vector float amaxv[8];
        vector signed int vi[8];

        // load the block's 32 floats as 8x4 lanes, then take element-wise |x|
        for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
        for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);

        // pairwise max-reduction tree: 8 vectors collapse into amaxv[0]
        for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
        for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
        for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);

        // horizontal max across the 4 remaining lanes
        const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
                                   vec_extract(amaxv[0], 1)),
                               MAX(vec_extract(amaxv[0], 2),
                                   vec_extract(amaxv[0], 3)));

        // scale so the largest magnitude maps to 127; id == 0 handles an
        // all-zero block (avoids div-by-zero, quantizes everything to 0)
        const float d = amax / ((1 << 7) - 1);
        const float id = d ? 1.0f/d : 0.0f;
        const vector float vid = vec_splats(id);

        y[i].d = GGML_CPU_FP32_TO_FP16(d);

        // round-to-nearest, then convert float lanes to int32
        for (int j = 0; j < 8; j++) {
            const vector float v = vec_round(vec_mul(srcv[j], vid));
            vi[j] = vec_cts(v, 0);
        }
        // narrow 8x4 int32 -> 32 int8 and store the two 16-byte halves
        vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])), 0, &y[i].qs[0]);
        vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]);
    }
#else
    GGML_UNUSED(nb);
    // scalar
    quantize_row_q8_0_ref(x, y, k);
#endif
}
86
// Quantize k floats from x into q8_1 blocks at vy. Same layout as q8_0
// (fp16 scale d plus int8 quants) with one extra field: s = d * sum(quants),
// precomputed so q*_1 dot products can fold in the min/offset term cheaply.
// NOTE(review): the i*32 addressing relies on QK8_1 == 32 — there is no
// assert here, unlike quantize_row_q8_0; confirm against the block layout.
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    assert(k % QK8_1 == 0);
    const int nb = k / QK8_1;

    block_q8_1 * GGML_RESTRICT y = vy;

#if defined(__POWER9_VECTOR__)
    for (int i = 0; i < nb; i++) {
        vector float srcv [8];
        vector float asrcv[8];
        vector float amaxv[8];
        vector signed int vi[8];

        // load 32 floats as 8x4 lanes and take element-wise |x|
        for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
        for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);

        // pairwise max-reduction tree: 8 vectors collapse into amaxv[0]
        for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
        for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
        for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);

        // horizontal max across the 4 remaining lanes
        const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
                                   vec_extract(amaxv[0], 1)),
                               MAX(vec_extract(amaxv[0], 2),
                                   vec_extract(amaxv[0], 3)));

        // largest magnitude maps to 127; id == 0 guards an all-zero block
        const float d = amax / ((1 << 7) - 1);
        const float id = d ? 1.0f/d : 0.0f;
        const vector float vid = vec_splats(id);

        y[i].d = GGML_CPU_FP32_TO_FP16(d);

        // running int32 sum of the quantized values, for the .s field
        vector int accv = vec_splats(0);

        for (int j = 0; j < 8; j++) {
            const vector float v = vec_round(vec_mul(srcv[j], vid));
            vi[j] = vec_cts(v, 0);

            accv = vec_add(accv, vi[j]);
        }
        // narrow 8x4 int32 -> 32 int8 and store the two 16-byte halves
        vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])), 0, &y[i].qs[0]);
        vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]);

        // horizontal add of accv via rotations, then s = d * sum(quants)
        accv = vec_add(accv, vec_sld(accv, accv, 4));
        accv = vec_add(accv, vec_sld(accv, accv, 8));
        y[i].s = GGML_CPU_FP32_TO_FP16(d * vec_extract(accv, 0));
    }

#else
    GGML_UNUSED(nb);
    // scalar
    quantize_row_q8_1_ref(x, y, k);
#endif
}
140
141
142//===================================== Dot products =================================
143
// Dot product of a q4_0 row (vx) with a q8_0 row (vy), n elements, result in *s.
// Per block: sumf += d_x * d_y * sum_i( (q4_i - 8) * q8_i ), where the 4-bit
// quants are stored two-per-byte (low nibbles = elements 0..15, high nibbles =
// elements 16..31). Only nrc == 1 is supported (asserted).
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q4_0 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    int ib = 0;
    float sumf = 0;

#if defined(__POWER9_VECTOR__)
    const vector signed char lowMask = vec_splats((signed char)0xF);
    const vector signed int v0 = vec_splats((int32_t)0);
    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
    const vector signed char v8 = vec_splats((signed char)0x8);

    vector float vsumf0 = vec_splats(0.0f);

#pragma GCC unroll 8
    for (; ib < nb; ++ib) {
        __builtin_prefetch(x[ib].qs, 0, 1);
        __builtin_prefetch(y[ib].qs, 0, 1);

        // combined per-block scale d_x * d_y
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
        vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
        vector float vd = vec_mul(vxd, vyd);

        vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
        vector signed char q8y0 = vec_xl( 0, y[ib].qs);
        vector signed char q8y1 = vec_xl(16, y[ib].qs);

        // split nibbles: q4x0 = elements 0..15, q4x1 = elements 16..31
        vector signed char q4x0 = vec_and(qxs, lowMask);
        vector signed char q4x1 = vec_sr(qxs, v4);

        // remove the q4_0 zero-point of 8
        q4x0 = vec_sub(q4x0, v8);
        q4x1 = vec_sub(q4x1, v8);

        // even+odd byte products give 16-bit partial sums per half
        vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
        vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));

        vector signed int vsumi0 = v0;

        vsumi0 = vec_sum4s(qv0, vsumi0);
        vsumi0 = vec_sum4s(qv1, vsumi0);

        // accumulate d_x*d_y * (block integer dot) in float lanes
        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
    }

    // horizontal add of the 4 float lanes via rotations
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    sumf = vec_extract(vsumf0, 0);

    *s = sumf;
#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
213
// Dot product of a q4_1 row (vx) with a q8_1 row (vy), n elements, result in *s.
// q4_1 stores unsigned 4-bit quants with per-block scale d and min m, so each
// block contributes d_x*d_y * sum(q4*q8) + m_x * s_y, where s_y = d_y*sum(q8)
// was precomputed by quantize_row_q8_1. Only nrc == 1 is supported (asserted).
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_1;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q4_1 * GGML_RESTRICT x = vx;
    const block_q8_1 * GGML_RESTRICT y = vy;

    int ib = 0;
    float sumf = 0;

#if defined(__POWER9_VECTOR__)
    const vector signed char lowMask = vec_splats((signed char)0xF);
    const vector signed int v0 = vec_splats((int32_t)0);
    const vector unsigned char v4 = vec_splats((unsigned char)0x4);

    vector float vsumf0 = vec_splats(0.0f);

#pragma GCC unroll 4
    for (; ib < nb; ++ib) {
        __builtin_prefetch(x[ib].qs, 0, 1);
        __builtin_prefetch(y[ib].qs, 0, 1);

        // combined per-block scale d_x * d_y
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
        vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
        vector float vd = vec_mul(vxd, vyd);

        // min/offset term: m_x * s_y added into lane 0 only (vys has a
        // single non-zero lane so the horizontal add counts it once)
        vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].m));
        vector float vys = {GGML_CPU_FP16_TO_FP32(y[ib].s), 0.0f, 0.0f, 0.0f};
        vsumf0 = vec_madd(vxmin, vys, vsumf0);

        vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
        vector signed char q8y0 = vec_xl( 0, y[ib].qs);
        vector signed char q8y1 = vec_xl(16, y[ib].qs);

        // split nibbles (kept unsigned — q4_1 has no zero-point subtraction)
        vector unsigned char q4x0 = (vector unsigned char)vec_and(qxs, lowMask);
        vector unsigned char q4x1 = (vector unsigned char)vec_sr(qxs, v4);

        vector signed int vsumi0 = v0;

        // multiply-sum signed q8 by unsigned q4 into int32 accumulators
        vsumi0 = vec_msum(q8y0, q4x0, vsumi0);
        vsumi0 = vec_msum(q8y1, q4x1, vsumi0);

        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
    }

    // horizontal add of the 4 float lanes via rotations
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    sumf = vec_extract(vsumf0, 0);

    *s = sumf;
#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
280
// Dot product of an mxfp4 row (vx) with a q8_0 row (vy), n elements, result in *s.
// Each mxfp4 block has an E8M0 shared exponent e and 4-bit codes that index the
// kvalues_mxfp4 lookup table; the POWER9 path materializes the LUT in a vector
// register and decodes 16 codes at a time with vec_perm. Only nrc == 1 (asserted).
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK_MXFP4 == 0);
    static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");

    const block_mxfp4 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    const int nb = n / QK_MXFP4;

    int ib = 0;
    float sumf = 0;

#if defined(__POWER9_VECTOR__)
    const vector signed char lowMask = vec_splats((signed char)0xF);
    const vector unsigned char vshift4 = vec_splats((unsigned char)4);
    vector float vsumf0 = vec_splats(0.0f);

    // 16-entry mxfp4 dequant LUT, loaded once into a vector register
    vector signed char kv = vec_xl(0, (const signed char *)kvalues_mxfp4);

#pragma GCC unroll 8
    for (; ib < nb; ++ib) {
        __builtin_prefetch(x[ib].qs, 0, 1);
        __builtin_prefetch(y[ib].qs, 0, 1);

        // combined scale: q8 block scale * decoded E8M0 shared exponent
        vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d) *
                                      GGML_E8M0_TO_FP32_HALF(x[ib].e));

        vector signed char q8y0 = vec_xl( 0, y[ib].qs);
        vector signed char q8y1 = vec_xl(16, y[ib].qs);

        vector signed char qxs = (vector signed char)vec_xl(0, x[ib].qs);

        // split the packed codes into low/high nibbles...
        vector unsigned char lo_nibbles = (vector unsigned char)vec_and(qxs, lowMask);
        vector unsigned char hi_nibbles = (vector unsigned char)vec_sr(qxs, vshift4);

        // ...and decode them through the LUT (vec_perm as a 16-way table lookup)
        vector signed char q4x0 = vec_perm(kv, kv, lo_nibbles);
        vector signed char q4x1 = vec_perm(kv, kv, hi_nibbles);

        // even+odd byte products give 16-bit partial sums per half
        vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
        vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));

        vector signed int vsumi0 = vec_splats((int32_t)0);
        vsumi0 = vec_sum4s(qv0, vsumi0);
        vsumi0 = vec_sum4s(qv1, vsumi0);

        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vyd, vsumf0);
    }

    // horizontal add of the 4 float lanes via rotations
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
    sumf = vec_extract(vsumf0, 0);
    *s = sumf;
#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
346
// Dot product of a q5_0 row (vx) with a q8_0 row (vy), n elements, result in *s.
// q5_0 splits each 5-bit quant into a 4-bit nibble (qs) plus a 5th bit packed
// in qh. Using table_b2b_1 (which expands each qh bit b to (!b) << 4), the
// signed value is reconstructed as nibble - (!b)*16, i.e. (nibble | b<<4) - 16.
// Only nrc == 1 is supported (asserted).
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;

    int ib = 0;
    float sumf = 0;

    assert(n % qk == 0);
    assert(qk == QK5_0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q5_0 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

#if defined(__POWER9_VECTOR__)
    const vector signed char lowMask = vec_splats((signed char)0xF);
    const vector unsigned char v4 = vec_splats((unsigned char)4);

    vector float vsumf0 = vec_splats(0.0f);

#pragma GCC unroll 4
    for (; ib < nb; ++ib) {
        __builtin_prefetch(x[ib].qs, 0, 1);
        __builtin_prefetch(y[ib].qs, 0, 1);

        // combined per-block scale d_x * d_y
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
        vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
        vector float vd = vec_mul(vxd, vyd);

        // expand the 4 qh bytes into 32 correction bytes, (!bit) << 4 each
        vector signed long long aux64x2_0 = {(uint64_t)(table_b2b_1[x[ib].qh[0]]), (uint64_t)(table_b2b_1[x[ib].qh[1]])};
        vector signed long long aux64x2_1 = {(uint64_t)(table_b2b_1[x[ib].qh[2]]), (uint64_t)(table_b2b_1[x[ib].qh[3]])};

        vector signed char qh0 = (vector signed char)aux64x2_0;
        vector signed char qh1 = (vector signed char)aux64x2_1;

        vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);

        // q5 = nibble - (!bit)*16 reconstructs the signed 5-bit value
        vector signed char q5x0 = vec_sub(vec_and (qxs, lowMask), qh0);
        vector signed char q5x1 = vec_sub(vec_sr(qxs, v4), qh1);

        vector signed char q8y0 = vec_xl(  0, y[ib].qs);
        vector signed char q8y1 = vec_xl( 16, y[ib].qs);

        // even+odd byte products give 16-bit partial sums per half
        vector signed short qv0 = vec_add(vec_mule(q5x0, q8y0), vec_mulo(q5x0, q8y0));
        vector signed short qv1 = vec_add(vec_mule(q5x1, q8y1), vec_mulo(q5x1, q8y1));

        qv0 = vec_add(qv0, qv1);

        // widen int16 -> int32 and add high/low halves
        vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));

        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
    }

    // horizontal add of the 4 float lanes via rotations
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    sumf = vec_extract(vsumf0, 0);

    *s = sumf;
#else
    UNUSED(ib);
    UNUSED(sumf);
    UNUSED(x);
    UNUSED(y);
    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
418
// Dot product of a q5_1 row (vx) with a q8_1 row (vy), n elements, result in *s.
// q5_1 stores unsigned 5-bit quants (nibble in qs, 5th bit in qh) with a
// per-block scale d and min m. table_b2b_0 expands each qh bit b to (b) << 4,
// so OR-ing it with the nibble rebuilds the full 5-bit value. Each block
// contributes d_x*d_y * sum(q5*q8) + m_x * s_y. Only nrc == 1 (asserted).
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_1;
    const int nb = n / qk;

    int ib = 0;
    float sumf = 0;

    assert(n % qk == 0);
    assert(qk == QK5_1);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q5_1 * GGML_RESTRICT x = vx;
    const block_q8_1 * GGML_RESTRICT y = vy;

#if defined(__POWER9_VECTOR__)
    const vector signed char lowMask = vec_splats((signed char)0xF);
    const vector signed int v0 = vec_splats((int32_t)0);
    const vector unsigned char v4 = vec_splats((unsigned char)0x4);

    vector float vsumf0 = vec_splats(0.0f);

#pragma GCC unroll 4
    for (; ib < nb; ++ib) {
        __builtin_prefetch(x[ib].qs, 0, 1);
        __builtin_prefetch(y[ib].qs, 0, 1);

        // combined per-block scale d_x * d_y
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
        vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
        vector float vd = vec_mul(vxd, vyd);

        // min/offset term m_x * s_y, added into lane 0 only so the final
        // horizontal add counts it exactly once
        vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].m));
        vector float vys = {GGML_CPU_FP16_TO_FP32(y[ib].s), 0.f, 0.f, 0.f};
        vsumf0 = vec_madd(vxmin, vys, vsumf0);

        // expand the 4 qh bytes into 32 high-bit bytes, (bit) << 4 each
        vector unsigned long long aux64x2_0 = {(uint64_t)(table_b2b_0[x[ib].qh[0]]), (uint64_t)(table_b2b_0[x[ib].qh[1]])};
        vector unsigned long long aux64x2_1 = {(uint64_t)(table_b2b_0[x[ib].qh[2]]), (uint64_t)(table_b2b_0[x[ib].qh[3]])};

        vector signed char qh0 = (vector signed char)aux64x2_0;
        vector signed char qh1 = (vector signed char)aux64x2_1;

        vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);

        // q5 = nibble | (bit << 4), kept unsigned (no zero-point in q5_1)
        vector unsigned char q5x0 = (vector unsigned char)vec_or(vec_and(qxs, lowMask), qh0);
        vector unsigned char q5x1 = (vector unsigned char)vec_or(vec_sr(qxs, v4), qh1);

        vector signed char q8y0 = vec_xl(  0, y[ib].qs);
        vector signed char q8y1 = vec_xl( 16, y[ib].qs);

        vector signed int vsumi0 = v0;

        // multiply-sum signed q8 by unsigned q5 into int32 accumulators
        vsumi0 = vec_msum(q8y0, q5x0, vsumi0);
        vsumi0 = vec_msum(q8y1, q5x1, vsumi0);

        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
    }

    // horizontal add of the 4 float lanes via rotations
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    sumf = vec_extract(vsumf0, 0);

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(ib);
    UNUSED(sumf);
    UNUSED(x);
    UNUSED(y);
    ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
494
// Dot product of two q8_0 rows, n elements, result in *s.
// Per block: sumf += d_x * d_y * sum_i(q8x_i * q8y_i) over 32 int8 pairs.
// Only nrc == 1 is supported (asserted).
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q8_0 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    int ib = 0;
    float sumf = 0;

#if defined(__POWER9_VECTOR__)
    const vector signed int v0 = vec_splats((int32_t)0);
    vector float vsumf0 = vec_splats(0.0f);

#pragma GCC unroll 8
    for (; ib < nb; ++ib) {
        __builtin_prefetch(x[ib].qs, 0, 1);
        __builtin_prefetch(y[ib].qs, 0, 1);

        // combined per-block scale d_x * d_y
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
        vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
        vector float vd = vec_mul(vxd, vyd);

        // each block is 32 int8 values = two 16-byte vectors per operand
        vector signed char q8x0 = vec_xl( 0, x[ib].qs);
        vector signed char q8x1 = vec_xl(16, x[ib].qs);
        vector signed char q8y0 = vec_xl( 0, y[ib].qs);
        vector signed char q8y1 = vec_xl(16, y[ib].qs);

        // even/odd byte products as int16 (kept separate to avoid overflow)
        vector signed short qv0 = vec_mule(q8x0, q8y0);
        vector signed short qv1 = vec_mulo(q8x0, q8y0);
        vector signed short qv2 = vec_mule(q8x1, q8y1);
        vector signed short qv3 = vec_mulo(q8x1, q8y1);

        vector signed int vsumi0 = v0;
        vector signed int vsumi1 = v0;

        // widen-and-accumulate the int16 partials into two int32 sums
        vsumi0 = vec_sum4s(qv0, vsumi0);
        vsumi1 = vec_sum4s(qv1, vsumi1);
        vsumi0 = vec_sum4s(qv2, vsumi0);
        vsumi1 = vec_sum4s(qv3, vsumi1);

        vsumi0 = vec_add(vsumi0, vsumi1);

        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
    }

    // horizontal add of the 4 float lanes via rotations
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    sumf = vec_extract(vsumf0, 0);

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
563
// Dot product of a q2_K super-block row (vx) with a q8_K row (vy), result in *s.
// Each q2_K super-block of QK_K values carries 16 sub-block entries in
// x[i].scales — the low nibble is the 4-bit scale, the high nibble the 4-bit
// min. The min contribution is folded in up front using y[i].bsums (per-16
// sums of the q8 quants) via vec_nmsub, so the inner loop only needs the
// scale * (q2*q8) products. Only nrc == 1 is supported (asserted).
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q2_K * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

#if defined(__POWER9_VECTOR__)
    const vector signed char lowMask = vec_splats((signed char)0x3);
    const vector signed char lowScaleMask = vec_splats((signed char)0xF);
    const vector int v0 = vec_splats((int32_t)0);
    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
    const vector unsigned char v6 = vec_splats((unsigned char)0x6);
    const vector unsigned char v4 = vec_splats((unsigned char)0x4);

    vector float vsumf0 = vec_splats(0.0f);
    vector float vsumf1 = vec_splats(0.0f);
    vector float vsumf2 = vec_splats(0.0f);
    vector float vsumf3 = vec_splats(0.0f);

    for (int i = 0; i < nb; ++i) {
        // per-super-block scales: d for the quants, dmin for the mins
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
        vector float vyd = vec_splats(y[i].d);
        vector float vd = vec_mul(vxd, vyd);

        vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].dmin));
        vector float vdmin = vec_mul(vxmin, vyd);

        // 16 precomputed per-16-element q8 sums
        vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
        vector signed short q8ysums1 = vec_xl(16, y[i].bsums);

        // low nibble = sub-block scale, high nibble = sub-block min
        vector signed char q2xmins = (vector signed char)vec_xl( 0, x[i].scales);
        vector signed char vscales = vec_and(q2xmins, lowScaleMask);

        q2xmins = vec_sr(q2xmins, v4);
        vector signed short q2xmins0 = vec_unpackh(q2xmins);
        vector signed short q2xmins1 = vec_unpackl(q2xmins);

        // min term: subtract dmin * sum(min_j * bsum_j) via negated fma
        vector signed int prod0 = vec_mule(q2xmins0, q8ysums0);
        vector signed int prod1 = vec_mulo(q2xmins0, q8ysums0);
        vector signed int prod2 = vec_mule(q2xmins1, q8ysums1);
        vector signed int prod3 = vec_mulo(q2xmins1, q8ysums1);

        vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
        vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
        vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
        vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);

        vector signed int vsumi0 = v0;
        vector signed int vsumi1 = v0;
        vector signed int vsumi2 = v0;
        vector signed int vsumi3 = v0;
        vector signed int vsumi4 = v0;
        vector signed int vsumi5 = v0;
        vector signed int vsumi6 = v0;
        vector signed int vsumi7 = v0;

        const uint8_t * GGML_RESTRICT q2 = x[i].qs;
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;

        // process the super-block in 128-value chunks (32 packed q2 bytes each)
        for (int j = 0; j < QK_K/128; ++j) {
            __builtin_prefetch(q2, 0, 1);
            __builtin_prefetch(q8, 0, 1);

            vector signed char qxs0 = (vector signed char)vec_xl( 0, q2);
            vector signed char qxs1 = (vector signed char)vec_xl(16, q2);
            q2 += 32;

            // unpack 4 quants per byte: shift by 0/2/4/6 and mask to 2 bits
            vector unsigned char q2x00 = (vector unsigned char)vec_and(qxs0, lowMask);
            vector unsigned char q2x01 = (vector unsigned char)vec_and(vec_sr(qxs0, v2), lowMask);
            vector unsigned char q2x02 = (vector unsigned char)vec_and(vec_sr(qxs0, v4), lowMask);
            vector unsigned char q2x03 = (vector unsigned char)vec_and(vec_sr(qxs0, v6), lowMask);
            vector unsigned char q2x10 = (vector unsigned char)vec_and(qxs1, lowMask);
            vector unsigned char q2x11 = (vector unsigned char)vec_and(vec_sr(qxs1, v2), lowMask);
            vector unsigned char q2x12 = (vector unsigned char)vec_and(vec_sr(qxs1, v4), lowMask);
            vector unsigned char q2x13 = (vector unsigned char)vec_and(vec_sr(qxs1, v6), lowMask);

            vector signed char q8y00 = vec_xl(  0, q8);
            vector signed char q8y10 = vec_xl( 16, q8);
            vector signed char q8y01 = vec_xl( 32, q8);
            vector signed char q8y11 = vec_xl( 48, q8);
            vector signed char q8y02 = vec_xl( 64, q8);
            vector signed char q8y12 = vec_xl( 80, q8);
            vector signed char q8y03 = vec_xl( 96, q8);
            vector signed char q8y13 = vec_xl(112, q8);
            q8 += 128;

            // int32 dot products of each 16-element group
            vector signed int qv0 = vec_msum(q8y00, q2x00, v0);
            vector signed int qv1 = vec_msum(q8y01, q2x01, v0);
            vector signed int qv2 = vec_msum(q8y02, q2x02, v0);
            vector signed int qv3 = vec_msum(q8y03, q2x03, v0);
            vector signed int qv4 = vec_msum(q8y10, q2x10, v0);
            vector signed int qv5 = vec_msum(q8y11, q2x11, v0);
            vector signed int qv6 = vec_msum(q8y12, q2x12, v0);
            vector signed int qv7 = vec_msum(q8y13, q2x13, v0);

            // broadcast the 8 sub-block scales used by this chunk; the
            // even/odd split below matches the q2x*/q8y* group interleaving
            vector signed short vscales_07 = vec_unpackh(vscales);
            vector signed int vscales_03 = vec_unpackh(vscales_07);
            vector signed int vscales_47 = vec_unpackl(vscales_07);
            vector signed int vs0 = vec_splat(vscales_03, 0);
            vector signed int vs1 = vec_splat(vscales_03, 1);
            vector signed int vs2 = vec_splat(vscales_03, 2);
            vector signed int vs3 = vec_splat(vscales_03, 3);
            vector signed int vs4 = vec_splat(vscales_47, 0);
            vector signed int vs5 = vec_splat(vscales_47, 1);
            vector signed int vs6 = vec_splat(vscales_47, 2);
            vector signed int vs7 = vec_splat(vscales_47, 3);
            // rotate so the next chunk consumes the next 8 scales
            vscales = vec_sld(vscales, vscales, 8);

            vsumi0 = vec_add(vec_mul(qv0, vs0), vsumi0);
            vsumi1 = vec_add(vec_mul(qv1, vs2), vsumi1);
            vsumi2 = vec_add(vec_mul(qv2, vs4), vsumi2);
            vsumi3 = vec_add(vec_mul(qv3, vs6), vsumi3);
            vsumi4 = vec_add(vec_mul(qv4, vs1), vsumi4);
            vsumi5 = vec_add(vec_mul(qv5, vs3), vsumi5);
            vsumi6 = vec_add(vec_mul(qv6, vs5), vsumi6);
            vsumi7 = vec_add(vec_mul(qv7, vs7), vsumi7);
        }

        vsumi0 = vec_add(vsumi0, vsumi4);
        vsumi1 = vec_add(vsumi1, vsumi5);
        vsumi2 = vec_add(vsumi2, vsumi6);
        vsumi3 = vec_add(vsumi3, vsumi7);

        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
    }

    // reduce the 4 float accumulators, then horizontal-add the lanes
    vsumf0 = vec_add(vsumf0, vsumf2);
    vsumf1 = vec_add(vsumf1, vsumf3);

    vsumf0 = vec_add(vsumf0, vsumf1);

    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
716
// Dot product of a q3_K super-block row (vx) with a q8_K row (vy), result in *s.
// q3_K stores the low 2 bits of each quant in qs, the 3rd (high) bit in hmask,
// and sixteen 6-bit sub-block scales packed into 12 bytes (x[i].scales). The
// POWER9 path unpacks those scales into a byte vector once per super-block,
// then processes 128 values per inner iteration. Only nrc == 1 (asserted).
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    // masks used by the scalar/generic scale unpacking; the vector path
    // below does the equivalent with byte masks (lowMask1/lowMask2)
    const uint32_t kmask1 = 0x03030303;
    const uint32_t kmask2 = 0x0f0f0f0f;

    const block_q3_K * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

#if defined(__POWER9_VECTOR__)
    const vector signed char lowMask = vec_splats((signed char)0x3);
    const vector signed char lowMask1 = vec_splats((int8_t)0xf);
    const vector signed char lowMask2 = vec_splats((int8_t)0x30);
    const vector int v0 = vec_splats((int32_t)0);
    const vector signed char v1 = vec_splats((signed char)0x1);
    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
    const vector unsigned char v3 = vec_splats((unsigned char)0x3);
    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
    const vector unsigned char v6 = vec_splats((unsigned char)0x6);
    // q3_K scales are stored with a bias of 32
    const vector signed char off = vec_splats((signed char)0x20);

    vector float vsumf0 = vec_splats(0.0f);
    vector float vsumf1 = vec_splats(0.0f);
    vector float vsumf2 = vec_splats(0.0f);
    vector float vsumf3 = vec_splats(0.0f);

    for (int i = 0; i < nb; ++i) {
        // per-super-block scale d_x * d_y
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
        vector float vyd = vec_splats(y[i].d);
        vector float vd = vec_mul(vxd, vyd);

        UNUSED(kmask1);
        UNUSED(kmask2);

        // unpack the sixteen 6-bit scales: low 4 bits come from the first
        // 8 bytes (u0), high 2 bits from the last 4 bytes (u2), then the
        // two halves are merged and the bias of 32 removed below
        vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
        vector signed char u1 = vec_and(u0, lowMask1);
        vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
        vector signed char u3 = (vector signed char)vec_mergeh((vector signed int)u2, (vector signed int)vec_sr(u2, v2));
        vector signed char u30 = vec_sl(vec_and(u3, lowMask), v4);
        vector signed char u31 = vec_and(u3, lowMask2);

        u1 = vec_or(u1, u30);
        u2 = vec_or(vec_sr(u0, v4), u31);

        vector signed char vscales = (vector signed char)vec_mergeh((vector signed long long)u1, (vector signed long long)u2);
        vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].hmask);
        vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].hmask);

        vscales = vec_sub(vscales, off);

        vector signed int vsumi0 = v0;
        vector signed int vsumi1 = v0;
        vector signed int vsumi2 = v0;
        vector signed int vsumi3 = v0;
        vector signed int vsumi4 = v0;
        vector signed int vsumi5 = v0;
        vector signed int vsumi6 = v0;
        vector signed int vsumi7 = v0;

        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;

        // process the super-block in 128-value chunks (32 packed q3 bytes each)
        for (int j = 0; j < QK_K/128; ++j) {
            __builtin_prefetch(q3, 0, 1);
            __builtin_prefetch(q8, 0, 1);

            vector signed char qxs0 = (vector signed char)vec_xl( 0, q3);
            vector signed char qxs1 = (vector signed char)vec_xl(16, q3);
            q3 += 32;

            //the low 2 bits
            vector signed char qxs00 = vec_and(qxs0, lowMask);
            vector signed char qxs01 = vec_and(vec_sr(qxs0, v2), lowMask);
            vector signed char qxs02 = vec_and(vec_sr(qxs0, v4), lowMask);
            vector signed char qxs03 = vec_and(vec_sr(qxs0, v6), lowMask);
            vector signed char qxs10 = vec_and(qxs1, lowMask);
            vector signed char qxs11 = vec_and(vec_sr(qxs1, v2), lowMask);
            vector signed char qxs12 = vec_and(vec_sr(qxs1, v4), lowMask);
            vector signed char qxs13 = vec_and(vec_sr(qxs1, v6), lowMask);

            //the 3rd bit: ((~h) & 1) << 2, i.e. 4 where the high bit is
            //CLEAR — subtracted below so q3 = low2 - (hbit ? 0 : 4)
            vector signed char qxh00 = vec_sl(vec_andc(v1, qxhs0), v2);
            vector signed char qxh01 = vec_sl(vec_andc(v1, vec_sr(qxhs0, (vector unsigned char)v1)), v2);
            vector signed char qxh02 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v2)), v2);
            vector signed char qxh03 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v3)), v2);
            vector signed char qxh10 = vec_sl(vec_andc(v1, qxhs1), v2);
            vector signed char qxh11 = vec_sl(vec_andc(v1, vec_sr(qxhs1, (vector unsigned char)v1)), v2);
            vector signed char qxh12 = vec_sl(vec_andc(v1, vec_sr(qxhs1, v2)), v2);
            vector signed char qxh13 = vec_sl(vec_andc(v1, vec_sr(qxhs1, v3)), v2);
            qxhs0 = vec_sr(qxhs0, v4);
            qxhs1 = vec_sr(qxhs1, v4);

            vector signed char q3x00 = vec_sub(qxs00, qxh00);
            vector signed char q3x01 = vec_sub(qxs01, qxh01);
            vector signed char q3x02 = vec_sub(qxs02, qxh02);
            vector signed char q3x03 = vec_sub(qxs03, qxh03);
            vector signed char q3x10 = vec_sub(qxs10, qxh10);
            vector signed char q3x11 = vec_sub(qxs11, qxh11);
            vector signed char q3x12 = vec_sub(qxs12, qxh12);
            vector signed char q3x13 = vec_sub(qxs13, qxh13);

            vector signed char q8y00 = vec_xl(  0, q8);
            vector signed char q8y10 = vec_xl( 16, q8);
            vector signed char q8y01 = vec_xl( 32, q8);
            vector signed char q8y11 = vec_xl( 48, q8);
            vector signed char q8y02 = vec_xl( 64, q8);
            vector signed char q8y12 = vec_xl( 80, q8);
            vector signed char q8y03 = vec_xl( 96, q8);
            vector signed char q8y13 = vec_xl(112, q8);
            q8 += 128;

            // broadcast the 8 sub-block scales consumed by this chunk
            vector signed short vscales_h = vec_unpackh(vscales);
            vector signed short vs0 = vec_splat(vscales_h, 0);
            vector signed short vs1 = vec_splat(vscales_h, 1);
            vector signed short vs2 = vec_splat(vscales_h, 2);
            vector signed short vs3 = vec_splat(vscales_h, 3);
            vector signed short vs4 = vec_splat(vscales_h, 4);
            vector signed short vs5 = vec_splat(vscales_h, 5);
            vector signed short vs6 = vec_splat(vscales_h, 6);
            vector signed short vs7 = vec_splat(vscales_h, 7);
            // rotate so the next chunk consumes the next 8 scales
            vscales = vec_sld(vscales, vscales, 8);

            // even+odd byte products give int16 partial sums per group
            vector signed short qv00 = vec_add(vec_mule(q3x00, q8y00), vec_mulo(q3x00, q8y00));
            vector signed short qv01 = vec_add(vec_mule(q3x01, q8y01), vec_mulo(q3x01, q8y01));
            vector signed short qv02 = vec_add(vec_mule(q3x02, q8y02), vec_mulo(q3x02, q8y02));
            vector signed short qv03 = vec_add(vec_mule(q3x03, q8y03), vec_mulo(q3x03, q8y03));
            vector signed short qv10 = vec_add(vec_mule(q3x10, q8y10), vec_mulo(q3x10, q8y10));
            vector signed short qv11 = vec_add(vec_mule(q3x11, q8y11), vec_mulo(q3x11, q8y11));
            vector signed short qv12 = vec_add(vec_mule(q3x12, q8y12), vec_mulo(q3x12, q8y12));
            vector signed short qv13 = vec_add(vec_mule(q3x13, q8y13), vec_mulo(q3x13, q8y13));

            // scale and widen into int32 accumulators; the even/odd scale
            // pairing matches the q3x*/q8y* group interleaving
            vsumi0 = vec_msum(qv00, vs0, vsumi0);
            vsumi1 = vec_msum(qv01, vs2, vsumi1);
            vsumi2 = vec_msum(qv02, vs4, vsumi2);
            vsumi3 = vec_msum(qv03, vs6, vsumi3);
            vsumi4 = vec_msum(qv10, vs1, vsumi4);
            vsumi5 = vec_msum(qv11, vs3, vsumi5);
            vsumi6 = vec_msum(qv12, vs5, vsumi6);
            vsumi7 = vec_msum(qv13, vs7, vsumi7);
        }

        vsumi0 = vec_add(vsumi0, vsumi4);
        vsumi1 = vec_add(vsumi1, vsumi5);
        vsumi2 = vec_add(vsumi2, vsumi6);
        vsumi3 = vec_add(vsumi3, vsumi7);

        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
    }

    // reduce the 4 float accumulators, then horizontal-add the lanes
    vsumf0 = vec_add(vsumf0, vsumf2);
    vsumf1 = vec_add(vsumf1, vsumf3);

    vsumf0 = vec_add(vsumf0, vsumf1);

    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
895
// Dot product of a row of q4_K-quantized weights (vx) with q8_K-quantized
// activations (vy).  Per QK_K super-block the accumulators gather
//     d*dy  * sum_j(scale_j * (q4 . q8)_j)  -  dmin*dy * sum_j(min_j * bsum_j)
// (the vec_madd / vec_nmsub pairs below); the second term compensates for the
// q4 nibbles being stored unsigned.  n must be a multiple of QK_K and nrc
// must be 1; bs/bx/by are unused.  POWER9 builds take the VSX path, all
// other targets fall through to the generic scalar kernel.
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q4_K * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    // Masks used by the generic (scalar) unpacking of the packed scales/mins;
    // the vector path unpacks with VSX ops instead and only marks these unused.
    static const uint32_t kmask1 = 0x3f3f3f3f;
    static const uint32_t kmask2 = 0x0f0f0f0f;
    static const uint32_t kmask3 = 0x03030303;

    uint32_t utmp[4];

#if defined(__POWER9_VECTOR__)
    const vector signed char lowMask = vec_splats((signed char)0xF);
    const vector signed char lowMask1 = vec_splats((int8_t)0x3f);
    const vector signed char lowMask2 = vec_splats((int8_t)0x30);
    const vector int v0 = vec_splats((int32_t)0);
    const vector unsigned char v2 = vec_splats((uint8_t)2);
    const vector unsigned char v4 = vec_splats((unsigned char)0x4);

    // Four partial float accumulators, reduced horizontally after the loop.
    vector float vsumf0 = vec_splats(0.0f);
    vector float vsumf1 = vec_splats(0.0f);
    vector float vsumf2 = vec_splats(0.0f);
    vector float vsumf3 = vec_splats(0.0f);

    for (int i = 0; i < nb; ++i) {
        // Combined scale d*dy and min-correction scale dmin*dy for block i.
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
        vector float vyd = vec_splats(y[i].d);
        vector float vd = vec_mul(vxd, vyd);

        vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].dmin));
        vector float vdmin = vec_mul(vxmin, vyd);

        // Precomputed per-group sums of the q8 side, consumed by the min correction.
        vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
        vector signed short q8ysums1 = vec_xl(16, y[i].bsums);

        UNUSED(kmask1);
        UNUSED(kmask2);
        UNUSED(kmask3);
        UNUSED(utmp);

        // Unpack the 12 bytes of x[i].scales, which pack 8 scales and 8 mins
        // as 6-bit fields (note the 0x3f/0x30 masks), into utmps: scales land
        // in the first 8 bytes (-> vec_unpackh), mins in the last 8 (-> vec_unpackl).
        vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
        vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2);
        vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
        vector signed char u3 = vec_sr(u2, v4);

        vector signed char u30 = u1;
        vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3);

        u1 = vec_and(u0, lowMask1);
        u2 = vec_or(u30, u31);

        vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2);

        vector signed short vscales = vec_unpackh(utmps);
        vector signed short q4xmins = vec_unpackl(utmps);
        vector signed short q4xmins0 = vec_mergeh(q4xmins, q4xmins);
        vector signed short q4xmins1 = vec_mergel(q4xmins, q4xmins);

        // min_j * bsum_j products; subtracted below scaled by dmin*dy.
        vector signed int prod0 = vec_mule(q4xmins0, q8ysums0);
        vector signed int prod1 = vec_mule(q4xmins1, q8ysums1);
        vector signed int prod2 = vec_mulo(q4xmins0, q8ysums0);
        vector signed int prod3 = vec_mulo(q4xmins1, q8ysums1);

        vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
        vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
        vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
        vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);

        vector signed int vsumi0 = v0;
        vector signed int vsumi1 = v0;
        vector signed int vsumi2 = v0;
        vector signed int vsumi3 = v0;

        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;

        // Each iteration consumes 64 q4 bytes (= 128 nibbles) and the matching
        // 128 q8 bytes, i.e. two 64-value sub-blocks -- hence j advances by 2.
        for (int j = 0; j < QK_K/64; j+=2) {
            __builtin_prefetch(q4, 0, 1);
            __builtin_prefetch(q8, 0, 1);

            vector signed char qxs0 = (vector signed char)vec_xl( 0, q4);
            vector signed char qxs1 = (vector signed char)vec_xl(16, q4);
            vector signed char qxs2 = (vector signed char)vec_xl(32, q4);
            vector signed char qxs3 = (vector signed char)vec_xl(48, q4);
            q4 += 64;

            // Split every byte into its low and high nibble.
            vector unsigned char q4x00 = (vector unsigned char)vec_and(qxs0, lowMask);
            vector unsigned char q4x01 = (vector unsigned char)vec_sr(qxs0, v4);
            vector unsigned char q4x10 = (vector unsigned char)vec_and(qxs1, lowMask);
            vector unsigned char q4x11 = (vector unsigned char)vec_sr(qxs1, v4);
            vector unsigned char q4x20 = (vector unsigned char)vec_and(qxs2, lowMask);
            vector unsigned char q4x21 = (vector unsigned char)vec_sr(qxs2, v4);
            vector unsigned char q4x30 = (vector unsigned char)vec_and(qxs3, lowMask);
            vector unsigned char q4x31 = (vector unsigned char)vec_sr(qxs3, v4);

            // q8 loads are ordered to match the nibble layout above
            // (low nibbles pair with the first 64 q8 bytes, high with the rest).
            vector signed char q8y00 = vec_xl(  0, q8);
            vector signed char q8y10 = vec_xl( 16, q8);
            vector signed char q8y01 = vec_xl( 32, q8);
            vector signed char q8y11 = vec_xl( 48, q8);
            vector signed char q8y20 = vec_xl( 64, q8);
            vector signed char q8y30 = vec_xl( 80, q8);
            vector signed char q8y21 = vec_xl( 96, q8);
            vector signed char q8y31 = vec_xl(112, q8);
            q8 += 128;

            // vec_msum: per 32-bit lane, the sum of four adjacent q8*q4 byte products.
            vector signed int qv00 = vec_msum(q8y00, q4x00, v0);
            vector signed int qv01 = vec_msum(q8y01, q4x01, v0);
            vector signed int qv10 = vec_msum(q8y10, q4x10, v0);
            vector signed int qv11 = vec_msum(q8y11, q4x11, v0);
            vector signed int qv20 = vec_msum(q8y20, q4x20, v0);
            vector signed int qv21 = vec_msum(q8y21, q4x21, v0);
            vector signed int qv30 = vec_msum(q8y30, q4x30, v0);
            vector signed int qv31 = vec_msum(q8y31, q4x31, v0);

            // Broadcast the four scales for these sub-blocks, then rotate
            // vscales so the next iteration sees the following four.
            vector signed int vscales_h = vec_unpackh(vscales);
            vector signed int vs0 = vec_splat(vscales_h, 0);
            vector signed int vs1 = vec_splat(vscales_h, 1);
            vector signed int vs2 = vec_splat(vscales_h, 2);
            vector signed int vs3 = vec_splat(vscales_h, 3);
            vscales = vec_sld(vscales, vscales, 8);

            vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0);
            vsumi1 = vec_add(vec_mul(qv01, vs1), vsumi1);
            vsumi2 = vec_add(vec_mul(qv20, vs2), vsumi2);
            vsumi3 = vec_add(vec_mul(qv30, vs2), vsumi3);

            vsumi0 = vec_add(vec_mul(qv10, vs0), vsumi0);
            vsumi1 = vec_add(vec_mul(qv11, vs1), vsumi1);
            vsumi2 = vec_add(vec_mul(qv30, vs2), vsumi2);
            vsumi3 = vec_add(vec_mul(qv31, vs3), vsumi3);
        }

        // Fold the integer partial sums into the float accumulators, scaled by d*dy.
        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
    }

    // Pairwise then horizontal reduction of the four accumulators into lane 0.
    vsumf0 = vec_add(vsumf0, vsumf2);
    vsumf1 = vec_add(vsumf1, vsumf3);

    vsumf0 = vec_add(vsumf0, vsumf1);

    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(utmp);
    ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
1063
// Dot product of a row of q5_K-quantized weights (vx) with q8_K-quantized
// activations (vy).  Structure mirrors the q4_K kernel above: per super-block
//     d*dy * sum_j(scale_j * (q5 . q8)_j)  -  dmin*dy * sum_j(min_j * bsum_j),
// except each 5-bit value is rebuilt from a low nibble in x[i].qs plus one
// high bit taken from x[i].qh.  n must be a multiple of QK_K; nrc must be 1;
// bs/bx/by are unused.  Non-POWER9 targets call the generic kernel.
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q5_K * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    // Masks used only by the generic scalar scale/min unpacking.
    static const uint32_t kmask1 = 0x3f3f3f3f;
    static const uint32_t kmask2 = 0x0f0f0f0f;
    static const uint32_t kmask3 = 0x03030303;

    uint32_t utmp[4];

#if defined(__POWER9_VECTOR__)
    const vector signed char lowMask = vec_splats((signed char)0xF);
    const vector signed char lowMask1 = vec_splats((int8_t)0x3f);
    const vector signed char lowMask2 = vec_splats((int8_t)0x30);
    const vector int v0 = vec_splats((int32_t)0);
    const vector unsigned char v1 = vec_splats((unsigned char)0x1);
    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
    const vector unsigned char v3 = vec_splats((unsigned char)0x3);
    const vector unsigned char v4 = vec_splats((unsigned char)0x4);

    // Four partial float accumulators, reduced horizontally at the end.
    vector float vsumf0 = vec_splats(0.0f);
    vector float vsumf1 = vec_splats(0.0f);
    vector float vsumf2 = vec_splats(0.0f);
    vector float vsumf3 = vec_splats(0.0f);

    for (int i = 0; i < nb; ++i) {
        // Combined scale d*dy and min-correction scale dmin*dy for block i.
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
        vector float vyd = vec_splats(y[i].d);
        vector float vd = vec_mul(vxd, vyd);

        vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].dmin));
        vector float vdmin = vec_mul(vxmin, vyd);

        UNUSED(kmask1);
        UNUSED(kmask2);
        UNUSED(kmask3);
        UNUSED(utmp);

        // Unpack the 12 bytes of x[i].scales (6-bit scales and mins; see the
        // 0x3f/0x30 masks) into utmps: scales in the first 8 bytes, mins in
        // the last 8 -- same scheme as the q4_K kernel.
        vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
        vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2);
        vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
        vector signed char u3 = vec_sr(u2, v4);

        vector signed char u30 = u1;
        vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3);

        u1 = vec_and(u0, lowMask1);
        u2 = vec_or(u30, u31);

        vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2);

        // Per-group q8 sums for the min correction.
        vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
        vector signed short q8ysums1 = vec_xl(16, y[i].bsums);

        vector signed short vscales = vec_unpackh(utmps);

        vector signed short q5xmins = vec_unpackl(utmps);
        vector signed short q5xmins0 = vec_mergeh(q5xmins, q5xmins);
        vector signed short q5xmins1 = vec_mergel(q5xmins, q5xmins);

        // min_j * bsum_j products, subtracted scaled by dmin*dy (the 5-bit
        // quants are stored unsigned).
        vector signed int prod0 = vec_mule(q5xmins0, q8ysums0);
        vector signed int prod1 = vec_mule(q5xmins1, q8ysums1);
        vector signed int prod2 = vec_mulo(q5xmins0, q8ysums0);
        vector signed int prod3 = vec_mulo(q5xmins1, q8ysums1);

        vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
        vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
        vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
        vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);

        // High-bit planes; two bits are consumed per iteration (vec_sr by 2 below).
        vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].qh);
        vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].qh);

        vector signed int vsumi0 = v0;
        vector signed int vsumi1 = v0;
        vector signed int vsumi2 = v0;
        vector signed int vsumi3 = v0;

        const uint8_t * GGML_RESTRICT q5 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;

        // Each iteration consumes 32 q5 bytes (64 nibbles) and 64 q8 bytes.
        for (int j = 0; j < QK_K/64; ++j) {
            __builtin_prefetch(q5, 0, 1);
            __builtin_prefetch(q8, 0, 1);

            vector signed char qxs0 = (vector signed char)vec_xl( 0, q5);
            vector signed char qxs1 = (vector signed char)vec_xl(16, q5);
            q5 += 32;

            // Low 4 bits of each 5-bit value.
            vector signed char qxs00 = vec_and(qxs0, lowMask);
            vector signed char qxs01 = vec_sr(qxs0, v4);
            vector signed char qxs10 = vec_and(qxs1, lowMask);
            vector signed char qxs11 = vec_sr(qxs1, v4);

            // Fifth bit: bit 0 of qh for the low nibbles (shifted into bit 4),
            // bit 1 of qh for the high nibbles (shifted by 3 into bit 4).
            vector signed char q5h00 = vec_sl(vec_and((vector signed char)v1, qxhs0), v4);
            vector signed char q5h01 = vec_sl(vec_and((vector signed char)v2, qxhs0), v3);
            vector signed char q5h10 = vec_sl(vec_and((vector signed char)v1, qxhs1), v4);
            vector signed char q5h11 = vec_sl(vec_and((vector signed char)v2, qxhs1), v3);
            qxhs0 = vec_sr(qxhs0, v2);
            qxhs1 = vec_sr(qxhs1, v2);

            // Recombine into full unsigned 5-bit values.
            vector unsigned char q5x00 = (vector unsigned char)vec_or(q5h00, qxs00);
            vector unsigned char q5x01 = (vector unsigned char)vec_or(q5h01, qxs01);
            vector unsigned char q5x10 = (vector unsigned char)vec_or(q5h10, qxs10);
            vector unsigned char q5x11 = (vector unsigned char)vec_or(q5h11, qxs11);

            vector signed char q8y00 = vec_xl( 0, q8);
            vector signed char q8y10 = vec_xl(16, q8);
            vector signed char q8y01 = vec_xl(32, q8);
            vector signed char q8y11 = vec_xl(48, q8);
            q8 += 64;

            // vec_msum: per 32-bit lane, the sum of four q8*q5 byte products.
            vector signed int qv00 = vec_msum(q8y00, q5x00, v0);
            vector signed int qv01 = vec_msum(q8y01, q5x01, v0);
            vector signed int qv10 = vec_msum(q8y10, q5x10, v0);
            vector signed int qv11 = vec_msum(q8y11, q5x11, v0);

            // Broadcast the two scales used this iteration, then rotate
            // vscales to expose the next pair.
            vector signed int vscales_h = vec_unpackh(vscales);
            vector signed int vs0 = vec_splat(vscales_h, 0);
            vector signed int vs1 = vec_splat(vscales_h, 1);
            vscales = vec_sld(vscales, vscales, 12);

            vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0);
            vsumi1 = vec_add(vec_mul(qv10, vs0), vsumi1);
            vsumi2 = vec_add(vec_mul(qv01, vs1), vsumi2);
            vsumi3 = vec_add(vec_mul(qv11, vs1), vsumi3);
        }

        // Fold integer partial sums into the float accumulators, scaled by d*dy.
        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
    }

    // Pairwise then horizontal reduction into lane 0.
    vsumf0 = vec_add(vsumf0, vsumf2);
    vsumf1 = vec_add(vsumf1, vsumf3);

    vsumf0 = vec_add(vsumf0, vsumf1);

    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(utmp);
    ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
1228
// Dot product of a row of q6_K-quantized weights (vx) with q8_K-quantized
// activations (vy).  Each 6-bit value is rebuilt from a low nibble in
// x[i].ql plus a 2-bit field from x[i].qh shifted into bits 4-5, then
// recentered by subtracting the 0x20 bias ('off').  Scales are plain int8
// in x[i].scales (no min term, unlike q4_K/q5_K).  n must be a multiple of
// QK_K; nrc must be 1; bs/bx/by are unused.  Non-POWER9 targets call the
// generic kernel.
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q6_K * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

#if defined(__POWER9_VECTOR__)
    const vector signed char lowMask = vec_splats((signed char)0xF);
    const vector int v0 = vec_splats((int32_t)0);
    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
    const vector unsigned char v3 = vec_splats((unsigned char)0x3);
    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
    const vector unsigned char v6 = vec_splats((unsigned char)0x6);
    // 6-bit quants are stored with a +32 bias; subtracted after reassembly.
    const vector signed char off = vec_splats((signed char)0x20);

    // Four partial float accumulators, reduced horizontally at the end.
    vector float vsumf0 = vec_splats(0.0f);
    vector float vsumf1 = vec_splats(0.0f);
    vector float vsumf2 = vec_splats(0.0f);
    vector float vsumf3 = vec_splats(0.0f);

    for (int i = 0; i < nb; ++i) {
        // Combined scale d*dy for block i.
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
        vector float vyd = vec_splats(y[i].d);
        vector float vd = vec_mul(vxd, vyd);

        // Eight integer accumulators; pairs are merged after the inner loop.
        vector signed int vsumi0 = v0;
        vector signed int vsumi1 = v0;
        vector signed int vsumi2 = v0;
        vector signed int vsumi3 = v0;
        vector signed int vsumi4 = v0;
        vector signed int vsumi5 = v0;
        vector signed int vsumi6 = v0;
        vector signed int vsumi7 = v0;

        const uint8_t * GGML_RESTRICT q6 = x[i].ql;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const int8_t  * GGML_RESTRICT qs = x[i].scales;
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;

        // Each iteration covers 128 values: 64 ql bytes (128 nibbles),
        // 32 qh bytes (four 2-bit fields each), 128 q8 bytes, 8 scales.
        for (int j = 0; j < QK_K/128; ++j) {
            __builtin_prefetch(q6, 0, 0);
            __builtin_prefetch(qh, 0, 0);
            __builtin_prefetch(q8, 0, 0);

            vector signed char qxs0 = (vector signed char)vec_xl( 0, q6);
            vector signed char qxs1 = (vector signed char)vec_xl(16, q6);
            vector signed char qxs2 = (vector signed char)vec_xl(32, q6);
            vector signed char qxs3 = (vector signed char)vec_xl(48, q6);
            q6 += 64;

            // Low 4 bits of each 6-bit value.
            vector signed char qxs00 = vec_and(qxs0, lowMask);
            vector signed char qxs01 = vec_sr(qxs0, v4);
            vector signed char qxs10 = vec_and(qxs1, lowMask);
            vector signed char qxs11 = vec_sr(qxs1, v4);
            vector signed char qxs20 = vec_and(qxs2, lowMask);
            vector signed char qxs21 = vec_sr(qxs2, v4);
            vector signed char qxs30 = vec_and(qxs3, lowMask);
            vector signed char qxs31 = vec_sr(qxs3, v4);

            vector signed char qxhs0 = (vector signed char)vec_xl( 0, qh);
            vector signed char qxhs1 = (vector signed char)vec_xl(16, qh);
            qh += 32;

            // Extract the four 2-bit high fields per qh byte (at bit offsets
            // 0, 4, 2, 6) and move them into bits 4-5.
            vector signed char qxh00 = vec_sl(vec_and((vector signed char)v3, qxhs0), v4);
            vector signed char qxh01 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v4)), v4);
            vector signed char qxh10 = vec_sl(vec_and((vector signed char)v3, qxhs1), v4);
            vector signed char qxh11 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v4)), v4);
            vector signed char qxh20 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v2)), v4);
            vector signed char qxh21 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v6)), v4);
            vector signed char qxh30 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v2)), v4);
            vector signed char qxh31 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v6)), v4);

            // Reassemble the 6-bit value and remove the +32 bias.
            vector signed char q6x00 = vec_sub(vec_or(qxh00, qxs00), off);
            vector signed char q6x01 = vec_sub(vec_or(qxh01, qxs01), off);
            vector signed char q6x10 = vec_sub(vec_or(qxh10, qxs10), off);
            vector signed char q6x11 = vec_sub(vec_or(qxh11, qxs11), off);
            vector signed char q6x20 = vec_sub(vec_or(qxh20, qxs20), off);
            vector signed char q6x21 = vec_sub(vec_or(qxh21, qxs21), off);
            vector signed char q6x30 = vec_sub(vec_or(qxh30, qxs30), off);
            vector signed char q6x31 = vec_sub(vec_or(qxh31, qxs31), off);

            vector signed char q8y00 = vec_xl(  0, q8);
            vector signed char q8y10 = vec_xl( 16, q8);
            vector signed char q8y20 = vec_xl( 32, q8);
            vector signed char q8y30 = vec_xl( 48, q8);
            vector signed char q8y01 = vec_xl( 64, q8);
            vector signed char q8y11 = vec_xl( 80, q8);
            vector signed char q8y21 = vec_xl( 96, q8);
            vector signed char q8y31 = vec_xl(112, q8);
            q8 += 128;

            // mule+mulo: even and odd int8 products summed pairwise into int16 lanes.
            vector signed short qv00 = vec_add(vec_mule(q6x00, q8y00), vec_mulo(q6x00, q8y00));
            vector signed short qv10 = vec_add(vec_mule(q6x10, q8y10), vec_mulo(q6x10, q8y10));
            vector signed short qv20 = vec_add(vec_mule(q6x20, q8y20), vec_mulo(q6x20, q8y20));
            vector signed short qv30 = vec_add(vec_mule(q6x30, q8y30), vec_mulo(q6x30, q8y30));
            vector signed short qv01 = vec_add(vec_mule(q6x01, q8y01), vec_mulo(q6x01, q8y01));
            vector signed short qv11 = vec_add(vec_mule(q6x11, q8y11), vec_mulo(q6x11, q8y11));
            vector signed short qv21 = vec_add(vec_mule(q6x21, q8y21), vec_mulo(q6x21, q8y21));
            vector signed short qv31 = vec_add(vec_mule(q6x31, q8y31), vec_mulo(q6x31, q8y31));

            // Sign-extend the 8 int8 scales for this 128-value chunk to int16.
            vector signed short vscales = vec_unpackh(vec_xl_len(qs, 8));
            qs += 8;

            vector signed short vs0 = vec_splat(vscales, 0);
            vector signed short vs1 = vec_splat(vscales, 1);
            vector signed short vs2 = vec_splat(vscales, 2);
            vector signed short vs3 = vec_splat(vscales, 3);
            vector signed short vs4 = vec_splat(vscales, 4);
            vector signed short vs5 = vec_splat(vscales, 5);
            vector signed short vs6 = vec_splat(vscales, 6);
            vector signed short vs7 = vec_splat(vscales, 7);

            // vec_msum: widen int16 products to int32 while applying the scale.
            vsumi0 = vec_msum(qv00, vs0, vsumi0);
            vsumi1 = vec_msum(qv01, vs4, vsumi1);
            vsumi2 = vec_msum(qv10, vs1, vsumi2);
            vsumi3 = vec_msum(qv11, vs5, vsumi3);
            vsumi4 = vec_msum(qv20, vs2, vsumi4);
            vsumi5 = vec_msum(qv21, vs6, vsumi5);
            vsumi6 = vec_msum(qv30, vs3, vsumi6);
            vsumi7 = vec_msum(qv31, vs7, vsumi7);
        }

        // Merge the eight integer accumulators down to four.
        vsumi0 = vec_add(vsumi0, vsumi4);
        vsumi1 = vec_add(vsumi1, vsumi5);
        vsumi2 = vec_add(vsumi2, vsumi6);
        vsumi3 = vec_add(vsumi3, vsumi7);

        // Fold into the float accumulators, scaled by d*dy.
        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
    }

    // Pairwise then horizontal reduction into lane 0.
    vsumf0 = vec_add(vsumf0, vsumf2);
    vsumf1 = vec_add(vsumf1, vsumf3);

    vsumf0 = vec_add(vsumf0, vsumf1);

    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
1386
#if defined (__POWER9_VECTOR__)
// Precomputed sign table for the iq2 family of kernels below.  1024 int8
// entries = 128 groups of 8, each entry +1 or -1.  The kernels reinterpret
// the table as uint64_t[128] ("signs64") and index it with a 7-bit sign
// pattern; each byte of the selected 64-bit word supplies the sign for one
// of 8 grid values (applied via vec_mul).  NOTE(review): the eighth sign of
// each group appears to be derived from the parity of the seven index bits
// (hence "k-even signs") -- confirm against the iq2 quantizer if relied on.
static const int8_t keven_signs_q2xs[1024] = {
     1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1,  1,
     1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1,  1,  1, -1, -1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1, -1,
     1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1, -1,
     1,  1, -1, -1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1, -1,  1, -1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1,  1,
     1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1, -1,
     1,  1, -1,  1, -1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1, -1,  1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1,  1,
     1,  1,  1, -1, -1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1,  1,
     1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1,  1,  1, -1, -1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1, -1,
     1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1,  1, -1,  1, -1,
     1,  1, -1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1,  1,
     1,  1,  1, -1,  1, -1,  1,  1, -1,  1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1,  1,
     1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1,  1,  1, -1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1,
     1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1,  1,
     1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1, -1,
     1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1,  1,  1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1, -1,
     1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1,  1,
     1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1, -1, -1,
     1,  1, -1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1,  1,
     1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1, -1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1,  1,
     1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1,  1,  1, -1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1, -1,
     1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1, -1,  1, -1, -1,  1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1,  1,
     1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1, -1,
     1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1,  1,  1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1, -1,
     1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1, -1,  1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1,  1,
     1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1, -1, -1, -1,  1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1,  1,
     1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1, -1,  1,  1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1, -1,
     1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1, -1,
     1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1,  1,
     1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1, -1,  1,  1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1, -1,
     1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1, -1,  1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1,  1,
     1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1,  1,
     1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1,  1,  1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,
};
#endif
1423
// Dot product of a row of iq2_xxs-quantized weights (vx) with q8_K-quantized
// activations (vy).  Each 32-value chunk of x[i].qs holds four uint32 words:
// words 0 and 2 contain 8 byte indices into iq2xxs_grid (8 values each),
// words 1 and 3 pack four 7-bit sign patterns (looked up in keven_signs_q2xs)
// plus a 4-bit scale in the top nibble.  n must be a multiple of QK_K; nrc
// must be 1; bs/bx/by are unused.  Non-POWER9 targets call the generic kernel.
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq2_xxs * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

#if defined(__POWER9_VECTOR__)
    const vector int v0 = vec_splats((int32_t)0);
    // Four partial float accumulators, reduced horizontally at the end.
    vector float vsumf0 = vec_splats(0.0f);
    vector float vsumf1 = vec_splats(0.0f);
    vector float vsumf2 = vec_splats(0.0f);
    vector float vsumf3 = vec_splats(0.0f);

    // View the +/-1 sign table as 128 packed 64-bit words (8 signs each).
    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;

    for (int i = 0; i < nb; ++i) {
        // Combined scale d*dy for block i.
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
        vector float vyd = vec_splats(y[i].d);
        vector float vd = vec_mul(vxd, vyd);

        vector signed int vsumi0 = v0;
        vector signed int vsumi1 = v0;
        vector signed int vsumi2 = v0;
        vector signed int vsumi3 = v0;

        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
        const int8_t   * GGML_RESTRICT q8 = y[i].qs;

        // Each iteration decodes 64 weights (two 32-value chunks), hence j += 2.
        for (int j = 0; j < QK_K/32; j += 2) {
            __builtin_prefetch(q2, 0, 1);
            __builtin_prefetch(q8, 0, 1);

            uint32_t aux32[4];
            const uint8_t * aux8 = (const uint8_t *)aux32;

            // Pull 16 bytes of metadata for both chunks (q2 is uint16*, so += 8).
            memcpy(aux32, q2, 4*sizeof(uint32_t));
            q2 += 8;

            // Grid lookups: bytes of words 0 and 2 index 8-value grid entries.
            vector signed long long aux64x2_0 = {*(const int64_t *)(iq2xxs_grid + aux8[ 0]), *(const int64_t *)(iq2xxs_grid + aux8[ 1])};
            vector signed long long aux64x2_1 = {*(const int64_t *)(iq2xxs_grid + aux8[ 2]), *(const int64_t *)(iq2xxs_grid + aux8[ 3])};
            vector signed long long aux64x2_2 = {*(const int64_t *)(iq2xxs_grid + aux8[ 8]), *(const int64_t *)(iq2xxs_grid + aux8[ 9])};
            vector signed long long aux64x2_3 = {*(const int64_t *)(iq2xxs_grid + aux8[10]), *(const int64_t *)(iq2xxs_grid + aux8[11])};

            // Sign lookups: words 1 and 3 hold four 7-bit patterns each.
            vector signed long long vsigns0 = {*(const int64_t *)(signs64 + ((aux32[1] >>  0) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >>  7) & 127))};
            vector signed long long vsigns1 = {*(const int64_t *)(signs64 + ((aux32[1] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 21) & 127))};
            vector signed long long vsigns2 = {*(const int64_t *)(signs64 + ((aux32[3] >>  0) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >>  7) & 127))};
            vector signed long long vsigns3 = {*(const int64_t *)(signs64 + ((aux32[3] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 21) & 127))};

            // Apply the +/-1 signs to the grid values by bytewise multiplication.
            vector signed char q2x0 = (vector signed char)vec_mul((vector signed char)vsigns0, (vector signed char)aux64x2_0);
            vector signed char q2x1 = (vector signed char)vec_mul((vector signed char)vsigns1, (vector signed char)aux64x2_1);
            vector signed char q2x2 = (vector signed char)vec_mul((vector signed char)vsigns2, (vector signed char)aux64x2_2);
            vector signed char q2x3 = (vector signed char)vec_mul((vector signed char)vsigns3, (vector signed char)aux64x2_3);

            vector signed char q8y0 = vec_xl( 0, q8);
            vector signed char q8y1 = vec_xl(16, q8);
            vector signed char q8y2 = vec_xl(32, q8);
            vector signed char q8y3 = vec_xl(48, q8);
            q8 += 64;

            // mule+mulo: even and odd int8 products summed pairwise into int16 lanes.
            vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0));
            vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1));
            vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2));
            vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3));

            // 4-bit chunk scales from the top nibble of the sign words.
            const uint16_t ls0 = aux32[1] >> 28;
            const uint16_t ls1 = aux32[3] >> 28;

            // Scales are encoded as 2*ls+1 (half-units); see 0.125f below.
            vector signed short vscales01 = vec_splats((int16_t)(2*ls0+1));
            vector signed short vscales23 = vec_splats((int16_t)(2*ls1+1));

            vsumi0 = vec_msum(qv0, vscales01, vsumi0);
            vsumi1 = vec_msum(qv1, vscales01, vsumi1);
            vsumi2 = vec_msum(qv2, vscales23, vsumi2);
            vsumi3 = vec_msum(qv3, vscales23, vsumi3);
        }

        // Fold into the float accumulators, scaled by d*dy.
        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
    }

    // Pairwise then horizontal reduction into lane 0.
    vsumf0 = vec_add(vsumf0, vsumf2);
    vsumf1 = vec_add(vsumf1, vsumf3);

    vsumf0 = vec_add(vsumf0, vsumf1);

    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    // The 0.125f compensates for the integer scales being carried at 8x
    // (the 2*ls+1 encoding above works in eighths of the block scale).
    *s = 0.125f * vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
1530
// Dot product of a row of iq2_xs-quantized weights (vx) with q8_K-quantized
// activations (vy).  Each uint16 of x[i].qs encodes one group of 8 weights:
// the low 9 bits index iq2xs_grid, the high 7 bits select a sign pattern in
// keven_signs_q2xs.  x[i].scales carries one 4-bit scale per 16 weights.
// n must be a multiple of QK_K; nrc must be 1; bs/bx/by are unused.
// Non-POWER9 targets call the generic kernel.
void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq2_xs * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

#if defined(__POWER9_VECTOR__)
    const vector int v0 = vec_splats((int32_t)0);
    // Four partial float accumulators, reduced horizontally at the end.
    vector float vsumf0 = vec_splats(0.0f);
    vector float vsumf1 = vec_splats(0.0f);
    vector float vsumf2 = vec_splats(0.0f);
    vector float vsumf3 = vec_splats(0.0f);

    // View the +/-1 sign table as 128 packed 64-bit words (8 signs each).
    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;

    for (int i = 0; i < nb; ++i) {
        // Combined scale d*dy for block i.
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
        vector float vyd = vec_splats(y[i].d);
        vector float vd = vec_mul(vxd, vyd);

        vector signed int vsumi0 = v0;
        vector signed int vsumi1 = v0;
        vector signed int vsumi2 = v0;
        vector signed int vsumi3 = v0;

        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
        const uint8_t  * GGML_RESTRICT sc = x[i].scales;
        const int8_t   * GGML_RESTRICT q8 = y[i].qs;

        // Each iteration decodes 64 weights: 8 uint16 codes * 8 values each.
        for (int j = 0; j < QK_K/64; ++j) {
            __builtin_prefetch(q2, 0, 1);
            __builtin_prefetch(q8, 0, 1);

            // Grid lookups: low 9 bits of each code select an 8-value entry.
            vector signed long long aux64x2_0 = {*(const int64_t *)(iq2xs_grid + (q2[0] & 511)), *(const int64_t *)(iq2xs_grid + (q2[1] & 511))};
            vector signed long long aux64x2_1 = {*(const int64_t *)(iq2xs_grid + (q2[2] & 511)), *(const int64_t *)(iq2xs_grid + (q2[3] & 511))};
            vector signed long long aux64x2_2 = {*(const int64_t *)(iq2xs_grid + (q2[4] & 511)), *(const int64_t *)(iq2xs_grid + (q2[5] & 511))};
            vector signed long long aux64x2_3 = {*(const int64_t *)(iq2xs_grid + (q2[6] & 511)), *(const int64_t *)(iq2xs_grid + (q2[7] & 511))};

            // Sign lookups: high 7 bits of each code select a +/-1 pattern.
            vector signed long long vsigns0 = {*(const int64_t *)(signs64 + ((q2[0] >> 9))), *(const int64_t *)(signs64 + ((q2[1] >> 9)))};
            vector signed long long vsigns1 = {*(const int64_t *)(signs64 + ((q2[2] >> 9))), *(const int64_t *)(signs64 + ((q2[3] >> 9)))};
            vector signed long long vsigns2 = {*(const int64_t *)(signs64 + ((q2[4] >> 9))), *(const int64_t *)(signs64 + ((q2[5] >> 9)))};
            vector signed long long vsigns3 = {*(const int64_t *)(signs64 + ((q2[6] >> 9))), *(const int64_t *)(signs64 + ((q2[7] >> 9)))};
            q2 += 8;

            // Apply the signs to the grid values by bytewise multiplication.
            vector signed char q2x0 = (vector signed char)vec_mul((vector signed char)vsigns0, (vector signed char)aux64x2_0);
            vector signed char q2x1 = (vector signed char)vec_mul((vector signed char)vsigns1, (vector signed char)aux64x2_1);
            vector signed char q2x2 = (vector signed char)vec_mul((vector signed char)vsigns2, (vector signed char)aux64x2_2);
            vector signed char q2x3 = (vector signed char)vec_mul((vector signed char)vsigns3, (vector signed char)aux64x2_3);

            vector signed char q8y0 = vec_xl( 0, q8);
            vector signed char q8y1 = vec_xl(16, q8);
            vector signed char q8y2 = vec_xl(32, q8);
            vector signed char q8y3 = vec_xl(48, q8);
            q8 += 64;

            // mule+mulo: even and odd int8 products summed pairwise into int16 lanes.
            vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0));
            vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1));
            vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2));
            vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3));

            // Two scale bytes yield four 4-bit scales (one per 16 weights).
            const uint16_t ls0 = (uint16_t)(sc[0] & 0xf);
            const uint16_t ls1 = (uint16_t)(sc[0] >>  4);
            const uint16_t ls2 = (uint16_t)(sc[1] & 0xf);
            const uint16_t ls3 = (uint16_t)(sc[1] >>  4);
            sc += 2;

            // Scales are encoded as 2*ls+1 (half-units); see 0.125f below.
            vector signed short vscales0 = vec_splats((int16_t)(2*ls0+1));
            vector signed short vscales1 = vec_splats((int16_t)(2*ls1+1));
            vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
            vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));

            vsumi0 = vec_msum(qv0, vscales0, vsumi0);
            vsumi1 = vec_msum(qv1, vscales1, vsumi1);
            vsumi2 = vec_msum(qv2, vscales2, vsumi2);
            vsumi3 = vec_msum(qv3, vscales3, vsumi3);
        }

        // Fold into the float accumulators, scaled by d*dy.
        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
    }

    // Pairwise then horizontal reduction into lane 0.
    vsumf0 = vec_add(vsumf0, vsumf2);
    vsumf1 = vec_add(vsumf1, vsumf3);

    vsumf0 = vec_add(vsumf0, vsumf1);

    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    // The 0.125f compensates for the integer scales being carried at 8x
    // (the 2*ls+1 encoding above works in eighths of the block scale).
    *s = 0.125f * vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
1638
// Dot product of one row of IQ2_S-quantized values (vx) with one row of
// Q8_K-quantized values (vy); the scalar result is written to *s.
// n must be a multiple of QK_K; only the single-row case (nrc == 1) is handled.
// POWER9 VSX implementation; other targets dispatch to the generic kernel.
void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq2_s * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    // number of QK_K-sized super-blocks in the row
    const int nb = n / QK_K;

#if defined(__POWER9_VECTOR__)
    // Permute controls that broadcast byte 0..3 of a splatted 32-bit sign word
    // across groups of 8 lanes (one sign byte covers 8 quant values).
    static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                                        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
    };

    // Per-lane bit selector: lane i tests bit (i % 8) of its replicated sign byte.
    static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};

    const vector int v0 = vec_splats((int32_t)0);

    // Four independent float accumulators, horizontally reduced at the end.
    vector float vsumf0 = vec_splats(0.0f);
    vector float vsumf1 = vec_splats(0.0f);
    vector float vsumf2 = vec_splats(0.0f);
    vector float vsumf3 = vec_splats(0.0f);

    const vector unsigned char mask0 = vec_xl( 0, k_mask1);
    const vector unsigned char mask1 = vec_xl(16, k_mask1);
    const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2);

    for (int i = 0; i < nb; ++i) {
        // Combined super-block scale: x[i].d (fp16) * y[i].d (fp32).
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
        vector float vyd = vec_splats(y[i].d);
        vector float vd = vec_mul(vxd, vyd);

        // Per-super-block integer accumulators.
        vector signed int vsumi0 = v0;
        vector signed int vsumi1 = v0;
        vector signed int vsumi2 = v0;
        vector signed int vsumi3 = v0;

        const uint8_t * GGML_RESTRICT q2 = x[i].qs;     // low 8 bits of the grid indices
        const uint8_t * GGML_RESTRICT qh = x[i].qh;     // 2 extra high bits per index, packed
        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);  // packed sign bits, stored after the indices
        const uint8_t * GGML_RESTRICT sc = x[i].scales; // 4-bit block scales, two per byte
        const int8_t * GGML_RESTRICT q8 = y[i].qs;

        // Each iteration consumes two 32-value blocks (64 quants).
        for (int j = 0; j < QK_K/32; j += 2) {
            __builtin_prefetch(q2, 0, 1);
            __builtin_prefetch(q8, 0, 1);

            // Expand 8 codebook indices into 64 quant magnitudes via iq2s_grid;
            // each 9-10 bit index = q2 byte | two bits taken from qh.
            vector signed long long aux64x2_0 = {*(const int64_t *)(iq2s_grid + (q2[0] | ((qh[0] << 8) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[1] | ((qh[0] << 6) & 0x300)))};
            vector signed long long aux64x2_1 = {*(const int64_t *)(iq2s_grid + (q2[2] | ((qh[0] << 4) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[3] | ((qh[0] << 2) & 0x300)))};
            vector signed long long aux64x2_2 = {*(const int64_t *)(iq2s_grid + (q2[4] | ((qh[1] << 8) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[5] | ((qh[1] << 6) & 0x300)))};
            vector signed long long aux64x2_3 = {*(const int64_t *)(iq2s_grid + (q2[6] | ((qh[1] << 4) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[7] | ((qh[1] << 2) & 0x300)))};
            q2 += 8;
            qh += 2;

            // Splat 32 sign bits per half and fan each byte out to its 8 lanes.
            vector signed char vsigns01 = (vector signed char)vec_splats(*(const uint32_t *)&signs[0]);
            vector signed char vsigns23 = (vector signed char)vec_splats(*(const uint32_t *)&signs[2]);
            signs += 4;

            vector signed char vsigns0 = vec_perm(vsigns01, vsigns01, mask0);
            vector signed char vsigns1 = vec_perm(vsigns01, vsigns01, mask1);
            vector signed char vsigns2 = vec_perm(vsigns23, vsigns23, mask0);
            vector signed char vsigns3 = vec_perm(vsigns23, vsigns23, mask1);

            // Turn each selected bit into a full-byte mask: 0xFF if set, 0x00 otherwise.
            vsigns0 = (vector signed char)vec_cmpeq(vec_and(vsigns0, mask2), mask2);
            vsigns1 = (vector signed char)vec_cmpeq(vec_and(vsigns1, mask2), mask2);
            vsigns2 = (vector signed char)vec_cmpeq(vec_and(vsigns2, mask2), mask2);
            vsigns3 = (vector signed char)vec_cmpeq(vec_and(vsigns3, mask2), mask2);

            // Conditional negation: (v ^ m) - m negates bytes where m == 0xFF.
            vector signed char q2x0 = vec_sub(vec_xor(vsigns0, (vector signed char)aux64x2_0), vsigns0);
            vector signed char q2x1 = vec_sub(vec_xor(vsigns1, (vector signed char)aux64x2_1), vsigns1);
            vector signed char q2x2 = vec_sub(vec_xor(vsigns2, (vector signed char)aux64x2_2), vsigns2);
            vector signed char q2x3 = vec_sub(vec_xor(vsigns3, (vector signed char)aux64x2_3), vsigns3);

            vector signed char q8y0 = vec_xl( 0, q8);
            vector signed char q8y1 = vec_xl(16, q8);
            vector signed char q8y2 = vec_xl(32, q8);
            vector signed char q8y3 = vec_xl(48, q8);
            q8 += 64;

            // Widening byte multiply: even + odd products give 16-bit pair sums.
            vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0));
            vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1));
            vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2));
            vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3));

            // Unpack the four 4-bit block scales for these 64 quants.
            const uint16_t ls0 = (uint16_t)(sc[0] & 0xf);
            const uint16_t ls1 = (uint16_t)(sc[0] >> 4);
            const uint16_t ls2 = (uint16_t)(sc[1] & 0xf);
            const uint16_t ls3 = (uint16_t)(sc[1] >> 4);
            sc += 2;

            // Effective integer scale per block is 2*ls + 1.
            vector signed short vscales0 = vec_splats((int16_t)(2*ls0+1));
            vector signed short vscales1 = vec_splats((int16_t)(2*ls1+1));
            vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
            vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));

            // Multiply-sum 16-bit products with the scales into 32-bit accumulators.
            vsumi0 = vec_msum(qv0, vscales0, vsumi0);
            vsumi1 = vec_msum(qv1, vscales1, vsumi1);
            vsumi2 = vec_msum(qv2, vscales2, vsumi2);
            vsumi3 = vec_msum(qv3, vscales3, vsumi3);
        }

        // Fold this super-block's integer sums into the float accumulators.
        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
    }

    // Horizontal reduction of the four accumulators down to a single lane.
    vsumf0 = vec_add(vsumf0, vsumf2);
    vsumf1 = vec_add(vsumf1, vsumf3);

    vsumf0 = vec_add(vsumf0, vsumf1);

    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    // Constant 1/8 factor of the IQ2_S scheme (scales are stored as 2*ls+1).
    *s = 0.125f * vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
1767
// Dot product of one row of IQ3_XXS-quantized values (vx) with one row of
// Q8_K-quantized values (vy); the scalar result is written to *s.
// n must be a multiple of QK_K; only the single-row case (nrc == 1) is handled.
// POWER9 VSX implementation; other targets dispatch to the generic kernel.
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq3_xxs * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    // number of QK_K-sized super-blocks in the row
    const int nb = n / QK_K;

#if defined(__POWER9_VECTOR__)
    // Precomputed table of 8-byte sign patterns, indexed by a 7-bit code.
    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;

    const vector int v0 = vec_splats((int32_t)0);

    // Four independent float accumulators, horizontally reduced at the end.
    vector float vsumf0 = vec_splats(0.0f);
    vector float vsumf1 = vec_splats(0.0f);
    vector float vsumf2 = vec_splats(0.0f);
    vector float vsumf3 = vec_splats(0.0f);

    for (int i = 0; i < nb; ++i) {
        // Combined super-block scale: x[i].d (fp16) * y[i].d (fp32).
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
        vector float vyd = vec_splats(y[i].d);
        vector float vd = vec_mul(vxd, vyd);

        // Per-super-block integer accumulators.
        vector signed int vsumi0 = v0;
        vector signed int vsumi1 = v0;
        vector signed int vsumi2 = v0;
        vector signed int vsumi3 = v0;

        const uint8_t * GGML_RESTRICT q3 = x[i].qs;  // one byte-sized grid index per 4 quants
        const uint32_t * GGML_RESTRICT signs = (const uint32_t *)(x[i].qs + QK_K/4);  // packed sign codes + scales
        const int8_t * GGML_RESTRICT q8 = y[i].qs;

#pragma GCC unroll 1
        // Each iteration consumes two 32-value blocks (64 quants).
        for (int j = 0; j < QK_K/32; j += 2) {
            __builtin_prefetch(q3, 0, 1);
            __builtin_prefetch(q8, 0, 1);

            // Expand 16 byte indices into 64 quant magnitudes (4 bytes each)
            // via the iq3xxs_grid lookup table.
            vector unsigned int aux32x4_0 = {iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]};
            vector unsigned int aux32x4_1 = {iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]};
            vector unsigned int aux32x4_2 = {iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]};
            vector unsigned int aux32x4_3 = {iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]};
            q3 += 16;

            // Expand the 7-bit sign codes (4 per 32-bit word, bits 0..27)
            // into 8-byte ±1 sign patterns from the signs64 table.
            vector unsigned long long aux64x2_0 = {(uint64_t)(signs64[(signs[0] >>  0) & 127]), (uint64_t)(signs64[(signs[0] >>  7) & 127])};
            vector unsigned long long aux64x2_1 = {(uint64_t)(signs64[(signs[0] >> 14) & 127]), (uint64_t)(signs64[(signs[0] >> 21) & 127])};
            vector unsigned long long aux64x2_2 = {(uint64_t)(signs64[(signs[1] >>  0) & 127]), (uint64_t)(signs64[(signs[1] >>  7) & 127])};
            vector unsigned long long aux64x2_3 = {(uint64_t)(signs64[(signs[1] >> 14) & 127]), (uint64_t)(signs64[(signs[1] >> 21) & 127])};

            // Apply signs by bytewise multiplication with the sign pattern.
            vector signed char q3x0 = vec_mul((vector signed char)aux64x2_0, (vector signed char)aux32x4_0);
            vector signed char q3x1 = vec_mul((vector signed char)aux64x2_1, (vector signed char)aux32x4_1);
            vector signed char q3x2 = vec_mul((vector signed char)aux64x2_2, (vector signed char)aux32x4_2);
            vector signed char q3x3 = vec_mul((vector signed char)aux64x2_3, (vector signed char)aux32x4_3);

            vector signed char q8y0 = vec_xl( 0, q8);
            vector signed char q8y1 = vec_xl(16, q8);
            vector signed char q8y2 = vec_xl(32, q8);
            vector signed char q8y3 = vec_xl(48, q8);
            q8 += 64;

            // Widening byte multiply: even + odd products give 16-bit pair sums.
            vector signed short qv0 = vec_add(vec_mule(q3x0, q8y0), vec_mulo(q3x0, q8y0));
            vector signed short qv1 = vec_add(vec_mule(q3x1, q8y1), vec_mulo(q3x1, q8y1));
            vector signed short qv2 = vec_add(vec_mule(q3x2, q8y2), vec_mulo(q3x2, q8y2));
            vector signed short qv3 = vec_add(vec_mule(q3x3, q8y3), vec_mulo(q3x3, q8y3));

            // Block scales live in the top 4 bits of each 32-bit signs word.
            const uint16_t ls0 = (uint16_t)(signs[0] >> 28);
            const uint16_t ls1 = (uint16_t)(signs[1] >> 28);
            signs += 2;

            // Effective integer scale per block is 2*ls + 1.
            vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
            vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));

            vsumi0 = vec_msum(qv0, vscales01, vsumi0);
            vsumi1 = vec_msum(qv1, vscales01, vsumi1);
            vsumi2 = vec_msum(qv2, vscales23, vsumi2);
            vsumi3 = vec_msum(qv3, vscales23, vsumi3);
        }

        // Fold this super-block's integer sums into the float accumulators.
        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
    }

    // Horizontal reduction of the four accumulators down to a single lane.
    vsumf0 = vec_add(vsumf0, vsumf2);
    vsumf1 = vec_add(vsumf1, vsumf3);

    vsumf0 = vec_add(vsumf0, vsumf1);

    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    // Constant 1/4 factor of the IQ3_XXS scheme (scales are stored as 2*ls+1).
    *s = 0.25f * vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
1873
// Dot product of one row of IQ3_S-quantized values (vx) with one row of
// Q8_K-quantized values (vy); the scalar result is written to *s.
// n must be a multiple of QK_K; only the single-row case (nrc == 1) is handled.
// POWER9 VSX implementation; other targets dispatch to the generic kernel.
void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq3_s * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    // number of QK_K-sized super-blocks in the row
    const int nb = n / QK_K;

#if defined(__POWER9_VECTOR__)
    // Permute controls that broadcast byte 0..3 of a splatted 32-bit sign word
    // across groups of 8 lanes (one sign byte covers 8 quant values).
    static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                                        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
    };

    // Per-lane bit selector: lane i tests bit (i % 8) of its replicated sign byte.
    static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};

    const vector int v0 = vec_splats((int32_t)0);

    // Four independent float accumulators, horizontally reduced at the end.
    vector float vsumf0 = vec_splats(0.0f);
    vector float vsumf1 = vec_splats(0.0f);
    vector float vsumf2 = vec_splats(0.0f);
    vector float vsumf3 = vec_splats(0.0f);

    const vector unsigned char mask0 = vec_xl( 0, k_mask1);
    const vector unsigned char mask1 = vec_xl(16, k_mask1);
    const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2);

    for (int i = 0; i < nb; ++i) {
        // Combined super-block scale: x[i].d (fp16) * y[i].d (fp32).
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
        vector float vyd = vec_splats(y[i].d);
        vector float vd = vec_mul(vxd, vyd);

        const uint8_t * GGML_RESTRICT q3 = x[i].qs;      // low 8 bits of the grid indices
        const uint8_t * GGML_RESTRICT qh = x[i].qh;      // 1 extra high bit per index, packed
        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].signs);  // packed sign bits
        const uint8_t * GGML_RESTRICT sc = x[i].scales;  // 4-bit block scales, two per byte
        const int8_t * GGML_RESTRICT q8 = y[i].qs;

        // Per-super-block integer accumulators.
        vector signed int vsumi0 = v0;
        vector signed int vsumi1 = v0;
        vector signed int vsumi2 = v0;
        vector signed int vsumi3 = v0;

        // Each iteration consumes two 32-value blocks (64 quants).
        for (int j = 0; j < QK_K/32; j += 2) {
            __builtin_prefetch(q3, 0, 1);
            __builtin_prefetch(q8, 0, 1);

            // Expand 16 indices (8 low bits from q3, 9th bit from qh) into
            // 64 quant magnitudes via the iq3s_grid lookup table.
            vector unsigned int aux32x4_0 = {iq3s_grid[q3[ 0] | ((qh[0] << 8) & 256)], iq3s_grid[q3[ 1] | ((qh[0] << 7) & 256)],
                                             iq3s_grid[q3[ 2] | ((qh[0] << 6) & 256)], iq3s_grid[q3[ 3] | ((qh[0] << 5) & 256)]};
            vector unsigned int aux32x4_1 = {iq3s_grid[q3[ 4] | ((qh[0] << 4) & 256)], iq3s_grid[q3[ 5] | ((qh[0] << 3) & 256)],
                                             iq3s_grid[q3[ 6] | ((qh[0] << 2) & 256)], iq3s_grid[q3[ 7] | ((qh[0] << 1) & 256)]};
            vector unsigned int aux32x4_2 = {iq3s_grid[q3[ 8] | ((qh[1] << 8) & 256)], iq3s_grid[q3[ 9] | ((qh[1] << 7) & 256)],
                                             iq3s_grid[q3[10] | ((qh[1] << 6) & 256)], iq3s_grid[q3[11] | ((qh[1] << 5) & 256)]};
            vector unsigned int aux32x4_3 = {iq3s_grid[q3[12] | ((qh[1] << 4) & 256)], iq3s_grid[q3[13] | ((qh[1] << 3) & 256)],
                                             iq3s_grid[q3[14] | ((qh[1] << 2) & 256)], iq3s_grid[q3[15] | ((qh[1] << 1) & 256)]};
            q3 += 16;
            qh += 2;

            // Splat 32 sign bits per half and fan each byte out to its 8 lanes.
            vector signed char vsigns01 = (vector signed char)vec_splats(*(const uint32_t *)&signs[0]);
            vector signed char vsigns02 = (vector signed char)vec_splats(*(const uint32_t *)&signs[2]);
            signs += 4;

            vector signed char vsigns0 = vec_perm(vsigns01, vsigns01, mask0);
            vector signed char vsigns1 = vec_perm(vsigns01, vsigns01, mask1);
            vector signed char vsigns2 = vec_perm(vsigns02, vsigns02, mask0);
            vector signed char vsigns3 = vec_perm(vsigns02, vsigns02, mask1);

            // Turn each selected bit into a full-byte mask: 0xFF if set, 0x00 otherwise.
            vsigns0 = (vector signed char)vec_cmpeq(vec_and(vsigns0, mask2), mask2);
            vsigns1 = (vector signed char)vec_cmpeq(vec_and(vsigns1, mask2), mask2);
            vsigns2 = (vector signed char)vec_cmpeq(vec_and(vsigns2, mask2), mask2);
            vsigns3 = (vector signed char)vec_cmpeq(vec_and(vsigns3, mask2), mask2);

            // Conditional negation: (v ^ m) - m negates bytes where m == 0xFF.
            vector signed char q3x0 = vec_sub(vec_xor(vsigns0, (vector signed char)aux32x4_0), vsigns0);
            vector signed char q3x1 = vec_sub(vec_xor(vsigns1, (vector signed char)aux32x4_1), vsigns1);
            vector signed char q3x2 = vec_sub(vec_xor(vsigns2, (vector signed char)aux32x4_2), vsigns2);
            vector signed char q3x3 = vec_sub(vec_xor(vsigns3, (vector signed char)aux32x4_3), vsigns3);

            vector signed char q8y0 = vec_xl( 0, q8);
            vector signed char q8y1 = vec_xl(16, q8);
            vector signed char q8y2 = vec_xl(32, q8);
            vector signed char q8y3 = vec_xl(48, q8);
            q8 += 64;

            // Widening byte multiply: even + odd products give 16-bit pair sums.
            vector signed short qv0 = vec_add(vec_mule(q3x0, q8y0), vec_mulo(q3x0, q8y0));
            vector signed short qv1 = vec_add(vec_mule(q3x1, q8y1), vec_mulo(q3x1, q8y1));
            vector signed short qv2 = vec_add(vec_mule(q3x2, q8y2), vec_mulo(q3x2, q8y2));
            vector signed short qv3 = vec_add(vec_mule(q3x3, q8y3), vec_mulo(q3x3, q8y3));

            // One scale byte covers both 32-value blocks of this iteration.
            const uint16_t ls0 = (uint16_t)(sc[0] & 0xf);
            const uint16_t ls1 = (uint16_t)(sc[0] >> 4);
            sc ++;

            // Effective integer scale per block is 2*ls + 1.
            vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
            vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));

            vsumi0 = vec_msum(qv0, vscales01, vsumi0);
            vsumi1 = vec_msum(qv1, vscales01, vsumi1);
            vsumi2 = vec_msum(qv2, vscales23, vsumi2);
            vsumi3 = vec_msum(qv3, vscales23, vsumi3);
        }

        // Fold this super-block's integer sums into the float accumulators.
        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
    }

    // Horizontal reduction of the four accumulators down to a single lane.
    vsumf0 = vec_add(vsumf0, vsumf2);
    vsumf1 = vec_add(vsumf1, vsumf3);

    vsumf0 = vec_add(vsumf0, vsumf1);

    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    // Unlike iq2_s / iq3_xxs there is no extra constant factor here.
    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
2002
// Dot product of one row of IQ1_S-quantized values (vx) with one row of
// Q8_K-quantized values (vy); the scalar result is written to *s.
// n must be a multiple of QK_K; only the single-row case (nrc == 1) is handled.
// POWER9 VSX implementation; other targets dispatch to the generic kernel.
void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq1_s * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    // number of QK_K-sized super-blocks in the row
    const int nb = n / QK_K;

#if defined(__POWER9_VECTOR__)
    const vector unsigned char v0 = vec_splats((unsigned char)0x0);
    // Sign-bit mask used to flip the top bit of 16-bit bsums below.
    const vector unsigned short vsign = vec_splats((unsigned short)0x8000);

    // Four independent float accumulators, horizontally reduced at the end.
    vector float vsumf0 = vec_splats(0.0f);
    vector float vsumf1 = vec_splats(0.0f);
    vector float vsumf2 = vec_splats(0.0f);
    vector float vsumf3 = vec_splats(0.0f);

    for (int i = 0; i < nb; ++i) {
        // Combined super-block scale: x[i].d (fp16) * y[i].d (fp32).
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
        vector float vyd = vec_splats(y[i].d);
        vector float vd = vec_mul(vxd, vyd);

        // vsumi0..3 accumulate the grid*q8 products; vsumi8 accumulates the
        // separate bsums-based correction term scaled by IQ1S_DELTA below.
        vector signed int vsumi0 = vec_splats((int32_t)0);
        vector signed int vsumi1 = vec_splats((int32_t)0);
        vector signed int vsumi2 = vec_splats((int32_t)0);
        vector signed int vsumi3 = vec_splats((int32_t)0);
        vector signed int vsumi8 = vec_splats((int32_t)0);

        const uint8_t * GGML_RESTRICT q1 = x[i].qs;   // low 8 bits of the grid indices
        const uint16_t * GGML_RESTRICT qh = x[i].qh;  // high index bits, scales and delta sign
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        const int16_t * GGML_RESTRICT qs = y[i].bsums;

        // Each iteration consumes two 32-value blocks (64 quants).
        for (int j = 0; j < QK_K/32; j += 2) {
            __builtin_prefetch(q1, 0, 1);
            __builtin_prefetch(qh, 0, 1);
            __builtin_prefetch(q8, 0, 1);

            // Expand 8 indices (8 low bits from q1, 3 high bits from qh)
            // into 64 quant values via the iq1s_grid lookup table.
            vector signed long long aux64x2_0 = {*(const int64_t *)(iq1s_grid + (q1[0] | ((qh[0] << 8) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[1] | ((qh[0] << 5) & 0x700)))};
            vector signed long long aux64x2_1 = {*(const int64_t *)(iq1s_grid + (q1[2] | ((qh[0] << 2) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[3] | ((qh[0] >> 1) & 0x700)))};
            vector signed long long aux64x2_2 = {*(const int64_t *)(iq1s_grid + (q1[4] | ((qh[1] << 8) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[5] | ((qh[1] << 5) & 0x700)))};
            vector signed long long aux64x2_3 = {*(const int64_t *)(iq1s_grid + (q1[6] | ((qh[1] << 2) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[7] | ((qh[1] >> 1) & 0x700)))};
            q1 += 8;

            vector signed char q1x0 = (vector signed char)aux64x2_0;
            vector signed char q1x1 = (vector signed char)aux64x2_1;
            vector signed char q1x2 = (vector signed char)aux64x2_2;
            vector signed char q1x3 = (vector signed char)aux64x2_3;

            vector signed char q8y0 = vec_xl( 0, q8);
            vector signed char q8y1 = vec_xl(16, q8);
            vector signed char q8y2 = vec_xl(32, q8);
            vector signed char q8y3 = vec_xl(48, q8);
            q8 += 64;

            // Widening byte multiply: even + odd products give 16-bit pair sums.
            vector signed short qv0 = vec_add(vec_mule(q1x0, q8y0), vec_mulo(q1x0, q8y0));
            vector signed short qv1 = vec_add(vec_mule(q1x1, q8y1), vec_mulo(q1x1, q8y1));
            vector signed short qv2 = vec_add(vec_mule(q1x2, q8y2), vec_mulo(q1x2, q8y2));
            vector signed short qv3 = vec_add(vec_mule(q1x3, q8y3), vec_mulo(q1x3, q8y3));

            // 3-bit block scales stored in bits 12..14 of the qh words.
            const uint16_t ls0 = (uint16_t)((qh[0] >> 12) & 7);
            const uint16_t ls1 = (uint16_t)((qh[1] >> 12) & 7);

            // Effective integer scale per block is 2*ls + 1; vscales holds
            // both block scales side by side for the bsums correction term.
            vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
            vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
            vector signed short vscales = vec_sld(vscales23, vscales01, 8);

            vsumi0 = vec_msum(qv0, vscales01, vsumi0);
            vsumi1 = vec_msum(qv1, vscales01, vsumi1);
            vsumi2 = vec_msum(qv2, vscales23, vsumi2);
            vsumi3 = vec_msum(qv3, vscales23, vsumi3);

            // Load two 16-bit bsums pairs and interleave with zeros so the
            // bsums land in the even 16-bit lanes for vec_mule below.
            vector signed short q8ysums = vec_xl_len(qs, 8);
            qs += 4;
            q8ysums = vec_mergeh(q8ysums, (vector signed short)v0);

            // The top bit of each qh word selects the sign of the delta
            // correction: flip the bsums' sign bit where qh is negative.
            // NOTE(review): the xor-with-0x8000 trick encodes the +/-IQ1S_DELTA
            // direction per block — confirm against the generic kernel.
            vector signed short qxh = (vector signed short)vec_sld(vec_splats(qh[1]), vec_splats(qh[0]), 8);
            qh += 2;
            vector __bool short vsel = vec_cmpge(qxh, (vector signed short)v0);

            vector signed short q8ysum = vec_sel((vector signed short)vec_xor((vector unsigned short)q8ysums, vsign), q8ysums, vsel);

            vsumi8 = vec_add(vec_mule(q8ysum, vscales), vsumi8);
        }

        // Fold this super-block's integer sums into the float accumulators.
        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);

        // Add the bsums correction term, scaled by the IQ1_S delta constant.
        vsumf0 = vec_madd(vec_ctf(vsumi8, 0), vec_mul(vd, vec_splats(IQ1S_DELTA)), vsumf0);
    }

    // Horizontal reduction of the four accumulators down to a single lane.
    vsumf0 = vec_add(vsumf0, vsumf2);
    vsumf1 = vec_add(vsumf1, vsumf3);

    vsumf0 = vec_add(vsumf0, vsumf1);

    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
2118
// Dot product of one row of IQ4_NL-quantized values (vx) with one row of
// Q8_0-quantized values (vy); the scalar result is written to *s.
// n must be a multiple of QK4_NL; only the single-row case (nrc == 1) is handled.
// POWER9 VSX implementation; other targets dispatch to the generic kernel.
void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK4_NL == 0);
    static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");

    const block_iq4_nl * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    // number of QK4_NL-sized blocks in the row
    const int nb = n / QK4_NL;

    int ib = 0;
    float sumf = 0;

#if defined(__POWER9_VECTOR__)
    const vector signed char lowMask = vec_splats((signed char)0xF);
    const vector signed int v0 = vec_splats((int32_t)0);
    const vector unsigned char v4 = vec_splats((unsigned char)0x4);

    // Two independent float accumulators, horizontally reduced at the end.
    vector float vsumf0 = vec_splats(0.0f);
    vector float vsumf1 = vec_splats(0.0f);

    // 16-entry non-linear codebook mapping each 4-bit code to an int8 value.
    const vector signed char values = vec_xl( 0, kvalues_iq4nl);

#pragma GCC unroll 4
    for (; ib < nb; ++ib) {
        __builtin_prefetch(x[ib].qs, 0, 1);
        __builtin_prefetch(y[ib].qs, 0, 1);


        // Combined block scale: x[ib].d (fp16) * y[ib].d (fp16).
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
        vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
        vector float vd = vec_mul(vxd, vyd);

        // Split the 32 packed nibbles into low and high halves.
        vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
        vector signed char q4x0 = vec_and(qxs, lowMask);
        vector signed char q4x1 = vec_sr(qxs, v4);

        // Map each 4-bit code through the codebook via a table-lookup permute.
        q4x0 = vec_perm(values, values, (vector unsigned char)q4x0);
        q4x1 = vec_perm(values, values, (vector unsigned char)q4x1);

        vector signed char q8y0 = vec_xl( 0, y[ib].qs);
        vector signed char q8y1 = vec_xl(16, y[ib].qs);

        // Widening byte multiply: even + odd products give 16-bit pair sums.
        vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
        vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));

        vector signed int vsumi0 = v0;
        vector signed int vsumi1 = v0;

        // Sum each group of four 16-bit products into 32-bit lanes.
        vsumi0 = vec_sum4s(qv0, vsumi0);
        vsumi1 = vec_sum4s(qv1, vsumi1);

        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
    }

    // Horizontal reduction down to a single lane.
    vsumf0 = vec_add(vsumf0, vsumf1);

    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    sumf = vec_extract(vsumf0, 0);

    *s = sumf;
#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
2196
// Dot product of one row of IQ4_XS-quantized values (vx) with one row of
// Q8_K-quantized values (vy); the scalar result is written to *s.
// n must be a multiple of QK_K; only the single-row case (nrc == 1) is handled.
// POWER9 VSX implementation; other targets dispatch to the generic kernel.
void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK_K == 0);

    const block_iq4_xs * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    // number of QK_K-sized super-blocks in the row
    const int nb = n / QK_K;

#if defined(__POWER9_VECTOR__)
    const vector signed char lowMask = vec_splats((signed char)0xF);
    const vector int v0 = vec_splats((int32_t)0);
    const vector unsigned char v4 = vec_splats((unsigned char)0x4);

    // Four independent float accumulators, horizontally reduced at the end.
    vector float vsumf0 = vec_splats(0.0f);
    vector float vsumf1 = vec_splats(0.0f);
    vector float vsumf2 = vec_splats(0.0f);
    vector float vsumf3 = vec_splats(0.0f);

    // 16-entry non-linear codebook (shared with IQ4_NL) mapping 4-bit codes to int8.
    const vector signed char values = vec_xl( 0, kvalues_iq4nl);

    for (int ibl = 0; ibl < nb; ++ibl) {

        // Combined super-block scale: x[ibl].d (fp16) * y[ibl].d (fp32).
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ibl].d));
        vector float vyd = vec_splats(y[ibl].d);
        vector float vd = vec_mul(vxd, vyd);

        // Per-super-block integer accumulators.
        vector signed int vsumi0 = v0;
        vector signed int vsumi1 = v0;
        vector signed int vsumi2 = v0;
        vector signed int vsumi3 = v0;

        // High 2 bits of each 6-bit block scale; consumed 4 bits per ib iteration.
        uint16_t h = x[ibl].scales_h;

        const uint8_t * GGML_RESTRICT q4 = x[ibl].qs;        // packed 4-bit codes
        const uint8_t * GGML_RESTRICT sc = x[ibl].scales_l;  // low 4 bits of the block scales
        const int8_t * GGML_RESTRICT q8 = y[ibl].qs;

        // Each iteration consumes 64 quants (32 bytes of packed nibbles).
        for (int ib = 0; ib < QK_K/64; ib ++ ) {
            __builtin_prefetch(q4, 0, 1);
            __builtin_prefetch(q8, 0, 1);

            vector signed char qxs0 = (vector signed char)vec_xl( 0, q4);
            vector signed char qxs1 = (vector signed char)vec_xl(16, q4);
            q4 += 32;

            // Split nibbles into low/high halves of each byte.
            vector signed char q4x00 = (vector signed char)vec_and(qxs0, lowMask);
            vector signed char q4x01 = (vector signed char)vec_sr(qxs0, v4);
            vector signed char q4x10 = (vector signed char)vec_and(qxs1, lowMask);
            vector signed char q4x11 = (vector signed char)vec_sr(qxs1, v4);

            // Map each 4-bit code through the codebook via a table-lookup permute.
            q4x00 = vec_perm(values, values, (vector unsigned char)q4x00);
            q4x01 = vec_perm(values, values, (vector unsigned char)q4x01);
            q4x10 = vec_perm(values, values, (vector unsigned char)q4x10);
            q4x11 = vec_perm(values, values, (vector unsigned char)q4x11);

            vector signed char q8y0 = vec_xl( 0, q8);
            vector signed char q8y1 = vec_xl(16, q8);
            vector signed char q8y2 = vec_xl(32, q8);
            vector signed char q8y3 = vec_xl(48, q8);
            q8 += 64;

            // Widening byte multiply: even + odd products give 16-bit pair sums.
            vector signed short qv0 = vec_add(vec_mule(q4x00, q8y0), vec_mulo(q4x00, q8y0));
            vector signed short qv1 = vec_add(vec_mule(q4x01, q8y1), vec_mulo(q4x01, q8y1));
            vector signed short qv2 = vec_add(vec_mule(q4x10, q8y2), vec_mulo(q4x10, q8y2));
            vector signed short qv3 = vec_add(vec_mule(q4x11, q8y3), vec_mulo(q4x11, q8y3));

            // Reassemble the two 6-bit block scales (low nibble from sc,
            // top 2 bits from h) and re-center by subtracting 32.
            const uint16_t ls0 = (uint16_t)(((sc[0] & 0xf) | ((h << 4) & 0x30)) - 32);
            const uint16_t ls1 = (uint16_t)(((sc[0] >> 4) | ((h << 2) & 0x30)) - 32);
            h >>= 4;
            sc ++;

            vector signed short vscales01 = vec_splats((int16_t)ls0);
            vector signed short vscales23 = vec_splats((int16_t)ls1);

            vsumi0 = vec_msum(qv0, vscales01, vsumi0);
            vsumi1 = vec_msum(qv1, vscales01, vsumi1);
            vsumi2 = vec_msum(qv2, vscales23, vsumi2);
            vsumi3 = vec_msum(qv3, vscales23, vsumi3);
        }

        // Fold this super-block's integer sums into the float accumulators.
        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
    }

    // Horizontal reduction of the four accumulators down to a single lane.
    vsumf0 = vec_add(vsumf0, vsumf2);
    vsumf1 = vec_add(vsumf1, vsumf3);

    vsumf0 = vec_add(vsumf0, vsumf1);

    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
2305