1#define GGML_COMMON_IMPL_C
   2#include "ggml-common.h"
   3#include "ggml-quants.h"
   4#include "ggml-impl.h"
   5#include "ggml-cpu.h"
   6#include "simd-mappings.h"
   7
   8#include "../../quants.h"
   9#include "../../ggml-cpu-impl.h"
  10
  11#include <math.h>
  12#include <string.h>
  13#include <assert.h>
  14#include <float.h>
  15#include <stdlib.h> // for qsort
  16#include <stdio.h>  // for GGML_ASSERT
  17
  18#define GROUP_MAX_EPS 1e-15f
  19#define GROUP_MAX_EPS_IQ3_XXS 1e-8f
  20#define GROUP_MAX_EPS_IQ2_S 1e-8f
  21#define GROUP_MAX_EPS_IQ1_M 1e-7f
  22#define GROUP_MAX_EPS_IQ1_S 1e-12f
  23
  24#define UNUSED GGML_UNUSED
  25
#if defined(__POWER9_VECTOR__)
// B1..B8 recursively token-paste the 256 uint64_t initializers of an
// 8-bit -> 8-byte expansion table: each of the 8 bits of the table index
// selects byte digit-pair 'c' (bit clear) or 's' (bit set) in the output.
#define B1(c,s,n)  0x ## n ## c ,  0x ## n ## s
#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s)
#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s)
#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s)
#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s)
#define B8(c,s  ) B7(c,s,     c), B7(c,s,     s)

// precomputed tables for expanding 8bits to 8 bytes:
// used by the q5_0/q5_1 dot products below to splat the per-value 5th bit
static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4
static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
#endif
  40
// Quantize a row of k floats into q8_0 blocks: for every 32 values, one fp16
// scale d = max|x| / 127 and 32 int8 quants (x / d, rounded).
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    assert(QK8_0 == 32); // the vector path below hard-codes the 32-element block size
    assert(k % QK8_0 == 0);
    const int nb = k / QK8_0;

    block_q8_0 * GGML_RESTRICT y = vy;

#if defined(__POWER9_VECTOR__)
    for (int i = 0; i < nb; i++) {
        vector float srcv [8];   // 8 x 4 floats = one 32-element block
        vector float asrcv[8];   // |srcv|
        vector float amaxv[8];   // workspace for the max-reduction tree
        vector signed int vi[8];

        for (int j = 0; j < 8; j++) srcv[j]  = vec_xl(0, x + i*32 + 4*j);
        for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);

        // tree-reduce the 8 abs vectors into amaxv[0]
        for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
        for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
        for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);

        // horizontal max over the remaining 4 lanes
        const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
                                   vec_extract(amaxv[0], 1)),
                               MAX(vec_extract(amaxv[0], 2),
                                   vec_extract(amaxv[0], 3)));

        const float d = amax / ((1 << 7) - 1);
        const float id = d ? 1.0f/d : 0.0f; // all-zero block -> quants stay 0

        const vector float vid = vec_splats(id);

        y[i].d = GGML_CPU_FP32_TO_FP16(d);

        for (int j = 0; j < 8; j++) {
            const vector float v  = vec_round(vec_mul(srcv[j], vid));
            vi[j] = vec_cts(v, 0);
        }
        // narrow 8 x int32x4 to 32 int8 and store as two 16-byte chunks
        vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])),  0, &y[i].qs[0]);
        vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]);
    }
#else
    GGML_UNUSED(nb);
    // scalar
    quantize_row_q8_0_ref(x, y, k);
#endif
}
  86
  87void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  88    assert(k % QK8_1 == 0);
  89    const int nb = k / QK8_1;
  90
  91    block_q8_1 * GGML_RESTRICT y = vy;
  92
  93#if defined(__POWER9_VECTOR__)
  94    for (int i = 0; i < nb; i++) {
  95        vector float srcv [8];
  96        vector float asrcv[8];
  97        vector float amaxv[8];
  98        vector signed int vi[8];
  99
 100        for (int j = 0; j < 8; j++) srcv[j]  = vec_xl(0, x + i*32 + 4*j);
 101        for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
 102
 103        for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
 104        for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
 105        for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);
 106
 107        const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
 108                                   vec_extract(amaxv[0], 1)),
 109                               MAX(vec_extract(amaxv[0], 2),
 110                                   vec_extract(amaxv[0], 3)));
 111
 112        const float d = amax / ((1 << 7) - 1);
 113        const float id = d ? 1.0f/d : 0.0f;
 114        const vector float vid = vec_splats(id);
 115
 116        y[i].d = GGML_CPU_FP32_TO_FP16(d);
 117
 118        vector int accv = vec_splats(0);
 119
 120        for (int j = 0; j < 8; j++) {
 121            const vector float v  = vec_round(vec_mul(srcv[j], vid));
 122            vi[j] = vec_cts(v, 0);
 123
 124            accv = vec_add(accv, vi[j]);
 125        }
 126        vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])),  0, &y[i].qs[0]);
 127        vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]);
 128
 129        accv = vec_add(accv, vec_sld(accv, accv, 4));
 130        accv = vec_add(accv, vec_sld(accv, accv, 8));
 131        y[i].s = GGML_CPU_FP32_TO_FP16(d * vec_extract(accv, 0));
 132    }
 133
 134#else
 135    GGML_UNUSED(nb);
 136    // scalar
 137    quantize_row_q8_1_ref(x, y, k);
 138#endif
 139}
 140
 141
 142//===================================== Dot products =================================
 143
 144void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
 145    const int qk = QK8_0;
 146    const int nb = n / qk;
 147
 148    assert(n % qk == 0);
 149    assert(nrc == 1);
 150    UNUSED(nrc);
 151    UNUSED(bx);
 152    UNUSED(by);
 153    UNUSED(bs);
 154
 155    const block_q4_0 * GGML_RESTRICT x = vx;
 156    const block_q8_0 * GGML_RESTRICT y = vy;
 157
 158    int ib = 0;
 159    float sumf = 0;
 160
 161#if defined(__POWER9_VECTOR__)
 162    const vector signed char lowMask = vec_splats((signed char)0xF);
 163    const vector signed int v0 = vec_splats((int32_t)0);
 164    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
 165    const vector signed char v8 = vec_splats((signed char)0x8);
 166
 167    vector float vsumf0 = vec_splats(0.0f);
 168
 169#pragma GCC unroll 8
 170    for (; ib < nb; ++ib) {
 171        __builtin_prefetch(x[ib].qs, 0, 1);
 172        __builtin_prefetch(y[ib].qs, 0, 1);
 173
 174        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
 175        vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
 176        vector float vd = vec_mul(vxd, vyd);
 177
 178        vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
 179        vector signed char q8y0 = vec_xl( 0, y[ib].qs);
 180        vector signed char q8y1 = vec_xl(16, y[ib].qs);
 181
 182        vector signed char q4x0 = vec_and(qxs, lowMask);
 183        vector signed char q4x1 = vec_sr(qxs, v4);
 184
 185        q4x0 = vec_sub(q4x0, v8);
 186        q4x1 = vec_sub(q4x1, v8);
 187
 188        vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
 189        vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
 190
 191        vector signed int vsumi0 = v0;
 192
 193        vsumi0 = vec_sum4s(qv0, vsumi0);
 194        vsumi0 = vec_sum4s(qv1, vsumi0);
 195
 196        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
 197    }
 198
 199    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
 200    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
 201
 202    sumf = vec_extract(vsumf0, 0);
 203
 204    *s = sumf;
 205#else
 206    UNUSED(x);
 207    UNUSED(y);
 208    UNUSED(ib);
 209    UNUSED(sumf);
 210    ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
 211#endif
 212}
 213
 214void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
 215    const int qk = QK8_1;
 216    const int nb = n / qk;
 217
 218    assert(n % qk == 0);
 219    assert(nrc == 1);
 220    UNUSED(nrc);
 221    UNUSED(bx);
 222    UNUSED(by);
 223    UNUSED(bs);
 224
 225    const block_q4_1 * GGML_RESTRICT x = vx;
 226    const block_q8_1 * GGML_RESTRICT y = vy;
 227
 228    int ib = 0;
 229    float sumf = 0;
 230
 231#if defined(__POWER9_VECTOR__)
 232    const vector signed char lowMask = vec_splats((signed char)0xF);
 233    const vector signed int v0 = vec_splats((int32_t)0);
 234    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
 235
 236    vector float vsumf0 = vec_splats(0.0f);
 237
 238#pragma GCC unroll 4
 239    for (; ib < nb; ++ib) {
 240        __builtin_prefetch(x[ib].qs, 0, 1);
 241        __builtin_prefetch(y[ib].qs, 0, 1);
 242
 243        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
 244        vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
 245        vector float vd = vec_mul(vxd, vyd);
 246
 247        vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].m));
 248        vector float vys = {GGML_CPU_FP16_TO_FP32(y[ib].s), 0.0f, 0.0f, 0.0f};
 249        vsumf0 = vec_madd(vxmin, vys, vsumf0);
 250
 251        vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
 252        vector signed char q8y0 = vec_xl( 0, y[ib].qs);
 253        vector signed char q8y1 = vec_xl(16, y[ib].qs);
 254
 255        vector unsigned char q4x0 = (vector unsigned char)vec_and(qxs, lowMask);
 256        vector unsigned char q4x1 = (vector unsigned char)vec_sr(qxs, v4);
 257
 258        vector signed int vsumi0 = v0;
 259
 260        vsumi0 = vec_msum(q8y0, q4x0, vsumi0);
 261        vsumi0 = vec_msum(q8y1, q4x1, vsumi0);
 262
 263        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
 264    }
 265
 266    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
 267    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
 268
 269    sumf = vec_extract(vsumf0, 0);
 270
 271    *s = sumf;
 272#else
 273    UNUSED(x);
 274    UNUSED(y);
 275    UNUSED(ib);
 276    UNUSED(sumf);
 277    ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
 278#endif
 279}
 280
 281void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
 282    assert(nrc == 1);
 283    UNUSED(nrc);
 284    UNUSED(bx);
 285    UNUSED(by);
 286    UNUSED(bs);
 287    assert(n % QK_MXFP4 == 0);
 288    static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
 289
 290    const block_mxfp4 * GGML_RESTRICT x = vx;
 291    const block_q8_0 * GGML_RESTRICT y = vy;
 292
 293    const int nb = n / QK_MXFP4;
 294
 295    int ib = 0;
 296    float sumf = 0;
 297
 298#if defined(__POWER9_VECTOR__)
 299    const vector signed char lowMask = vec_splats((signed char)0xF);
 300    const vector unsigned char vshift4 = vec_splats((unsigned char)4);
 301    vector float vsumf0 = vec_splats(0.0f);
 302
 303    vector signed char kv = vec_xl(0, (const signed char *)kvalues_mxfp4);
 304
 305#pragma GCC unroll 8
 306    for (; ib < nb; ++ib) {
 307        __builtin_prefetch(x[ib].qs, 0, 1);
 308        __builtin_prefetch(y[ib].qs, 0, 1);
 309
 310        vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d) *
 311                                      GGML_E8M0_TO_FP32_HALF(x[ib].e));
 312
 313        vector signed char q8y0 = vec_xl( 0, y[ib].qs);
 314        vector signed char q8y1 = vec_xl(16, y[ib].qs);
 315
 316        vector signed char qxs = (vector signed char)vec_xl(0, x[ib].qs);
 317
 318        vector unsigned char lo_nibbles = (vector unsigned char)vec_and(qxs, lowMask);
 319        vector unsigned char hi_nibbles = (vector unsigned char)vec_sr(qxs, vshift4);
 320
 321        vector signed char q4x0 = vec_perm(kv, kv, lo_nibbles);
 322        vector signed char q4x1 = vec_perm(kv, kv, hi_nibbles);
 323
 324        vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
 325        vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
 326
 327        vector signed int vsumi0 = vec_splats((int32_t)0);
 328        vsumi0 = vec_sum4s(qv0, vsumi0);
 329        vsumi0 = vec_sum4s(qv1, vsumi0);
 330
 331        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vyd, vsumf0);
 332    }
 333
 334    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
 335    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
 336    sumf = vec_extract(vsumf0, 0);
 337    *s = sumf;
 338#else
 339    UNUSED(x);
 340    UNUSED(y);
 341    UNUSED(ib);
 342    UNUSED(sumf);
 343    ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
 344#endif
 345}
 346
 347void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
 348    const int qk = QK8_0;
 349    const int nb = n / qk;
 350
 351    int ib = 0;
 352    float sumf = 0;
 353
 354    assert(n % qk == 0);
 355    assert(qk == QK5_0);
 356    assert(nrc == 1);
 357    UNUSED(nrc);
 358    UNUSED(bx);
 359    UNUSED(by);
 360    UNUSED(bs);
 361
 362    const block_q5_0 * GGML_RESTRICT x = vx;
 363    const block_q8_0 * GGML_RESTRICT y = vy;
 364
 365#if defined(__POWER9_VECTOR__)
 366    const vector signed char lowMask = vec_splats((signed char)0xF);
 367    const vector unsigned char v4 = vec_splats((unsigned char)4);
 368
 369    vector float vsumf0 = vec_splats(0.0f);
 370
 371#pragma GCC unroll 4
 372    for (; ib < nb; ++ib) {
 373        __builtin_prefetch(x[ib].qs, 0, 1);
 374        __builtin_prefetch(y[ib].qs, 0, 1);
 375
 376        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
 377        vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
 378        vector float vd = vec_mul(vxd, vyd);
 379
 380        vector signed long long aux64x2_0 = {(uint64_t)(table_b2b_1[x[ib].qh[0]]), (uint64_t)(table_b2b_1[x[ib].qh[1]])};
 381        vector signed long long aux64x2_1 = {(uint64_t)(table_b2b_1[x[ib].qh[2]]), (uint64_t)(table_b2b_1[x[ib].qh[3]])};
 382
 383        vector signed char qh0 = (vector signed char)aux64x2_0;
 384        vector signed char qh1 = (vector signed char)aux64x2_1;
 385
 386        vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
 387
 388        vector signed char q5x0 = vec_sub(vec_and (qxs, lowMask), qh0);
 389        vector signed char q5x1 = vec_sub(vec_sr(qxs, v4), qh1);
 390
 391        vector signed char q8y0 = vec_xl(  0, y[ib].qs);
 392        vector signed char q8y1 = vec_xl( 16, y[ib].qs);
 393
 394        vector signed short qv0 = vec_add(vec_mule(q5x0, q8y0), vec_mulo(q5x0, q8y0));
 395        vector signed short qv1 = vec_add(vec_mule(q5x1, q8y1), vec_mulo(q5x1, q8y1));
 396
 397        qv0 = vec_add(qv0, qv1);
 398
 399        vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
 400
 401        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
 402    }
 403
 404    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
 405    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
 406
 407    sumf = vec_extract(vsumf0, 0);
 408
 409    *s = sumf;
 410#else
 411    UNUSED(ib);
 412    UNUSED(sumf);
 413    UNUSED(x);
 414    UNUSED(y);
 415    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
 416#endif
 417}
 418
// Dot product of a q5_1 row (vx) with a q8_1 row (vy) over n values, result
// stored to *s. The 5th bit of each q5_1 value comes from x[ib].qh, expanded
// via table_b2b_0, and the per-block min is folded in via x[ib].m * y[ib].s.
// bs/bx/by are part of the common vec_dot signature but unused; nrc must be 1.
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_1;
    const int nb = n / qk;

    int ib = 0;
    float sumf = 0;

    assert(n % qk == 0);
    assert(qk == QK5_1);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q5_1 * GGML_RESTRICT x = vx;
    const block_q8_1 * GGML_RESTRICT y = vy;

#if defined(__POWER9_VECTOR__)
    const vector signed char lowMask = vec_splats((signed char)0xF);
    const vector signed int v0 = vec_splats((int32_t)0);
    const vector unsigned char v4 = vec_splats((unsigned char)0x4);

    vector float vsumf0 = vec_splats(0.0f);

#pragma GCC unroll 4
    for (; ib < nb; ++ib) {
        __builtin_prefetch(x[ib].qs, 0, 1);
        __builtin_prefetch(y[ib].qs, 0, 1);

        // combined per-block scale: d(x) * d(y)
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
        vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
        vector float vd = vec_mul(vxd, vyd);

        // add m(x) * s(y) once per block (only lane 0 is non-zero)
        vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].m));
        vector float vys = {GGML_CPU_FP16_TO_FP32(y[ib].s), 0.f, 0.f, 0.f};
        vsumf0 = vec_madd(vxmin, vys, vsumf0);

        // expand the 32 high bits to bytes: 0x10 where the bit is SET
        // (table_b2b_0), OR'd into the nibbles to form unsigned 5-bit values
        vector unsigned long long aux64x2_0 = {(uint64_t)(table_b2b_0[x[ib].qh[0]]), (uint64_t)(table_b2b_0[x[ib].qh[1]])};
        vector unsigned long long aux64x2_1 = {(uint64_t)(table_b2b_0[x[ib].qh[2]]), (uint64_t)(table_b2b_0[x[ib].qh[3]])};

        vector signed char qh0 = (vector signed char)aux64x2_0;
        vector signed char qh1 = (vector signed char)aux64x2_1;

        vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);

        vector unsigned char q5x0 = (vector unsigned char)vec_or(vec_and(qxs, lowMask), qh0);
        vector unsigned char q5x1 = (vector unsigned char)vec_or(vec_sr(qxs, v4), qh1);

        vector signed char q8y0 = vec_xl(  0, y[ib].qs);
        vector signed char q8y1 = vec_xl( 16, y[ib].qs);

        vector signed int vsumi0 = v0;

        vsumi0 = vec_msum(q8y0, q5x0, vsumi0);
        vsumi0 = vec_msum(q8y1, q5x1, vsumi0);

        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
    }

    // horizontal reduction of the 4 float lanes
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    sumf = vec_extract(vsumf0, 0);

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(ib);
    UNUSED(sumf);
    UNUSED(x);
    UNUSED(y);
    ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
 494
// Dot product of two q8_0 rows over n values, result stored to *s.
// bs/bx/by are part of the common vec_dot signature but unused; nrc must be 1.
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q8_0 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    int ib = 0;
    float sumf = 0;

#if defined(__POWER9_VECTOR__)
    const vector signed int v0 = vec_splats((int32_t)0);
    vector float vsumf0 = vec_splats(0.0f);

#pragma GCC unroll 8
    for (; ib < nb; ++ib) {
        __builtin_prefetch(x[ib].qs, 0, 1);
        __builtin_prefetch(y[ib].qs, 0, 1);

        // combined per-block scale: d(x) * d(y)
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
        vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
        vector float vd = vec_mul(vxd, vyd);

        vector signed char q8x0 = vec_xl( 0, x[ib].qs);
        vector signed char q8x1 = vec_xl(16, x[ib].qs);
        vector signed char q8y0 = vec_xl( 0, y[ib].qs);
        vector signed char q8y1 = vec_xl(16, y[ib].qs);

        // widening even/odd multiplies into int16 lanes
        vector signed short qv0 = vec_mule(q8x0, q8y0);
        vector signed short qv1 = vec_mulo(q8x0, q8y0);
        vector signed short qv2 = vec_mule(q8x1, q8y1);
        vector signed short qv3 = vec_mulo(q8x1, q8y1);

        vector signed int vsumi0 = v0;
        vector signed int vsumi1 = v0;

        vsumi0 = vec_sum4s(qv0, vsumi0);
        vsumi1 = vec_sum4s(qv1, vsumi1);
        vsumi0 = vec_sum4s(qv2, vsumi0);
        vsumi1 = vec_sum4s(qv3, vsumi1);

        vsumi0 = vec_add(vsumi0, vsumi1);

        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
    }

    // horizontal reduction of the 4 float lanes
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    sumf = vec_extract(vsumf0, 0);

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
 563
// Dot product of a q2_K row (vx) with a q8_K row (vy) over n values, result
// stored to *s. Each q2_K super-block carries packed 4-bit sub-block scales
// and mins (x[i].scales); the mins are combined with y[i].bsums and then
// subtracted (vec_nmsub) from the accumulator, so only scale*quant products
// remain in the inner loop. bs/bx/by unused; nrc must be 1.
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q2_K * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

#if defined(__POWER9_VECTOR__)
    const vector signed char lowMask = vec_splats((signed char)0x3);
    const vector signed char lowScaleMask = vec_splats((signed char)0xF);
    const vector int v0 = vec_splats((int32_t)0);
    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
    const vector unsigned char v6 = vec_splats((unsigned char)0x6);
    const vector unsigned char v4 = vec_splats((unsigned char)0x4);

    // four partial float accumulators, merged after the block loop
    vector float vsumf0 = vec_splats(0.0f);
    vector float vsumf1 = vec_splats(0.0f);
    vector float vsumf2 = vec_splats(0.0f);
    vector float vsumf3 = vec_splats(0.0f);

    for (int i = 0; i < nb; ++i) {
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
        vector float vyd = vec_splats(y[i].d);
        vector float vd = vec_mul(vxd, vyd);

        vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].dmin));
        vector float vdmin = vec_mul(vxmin, vyd);

        vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
        vector signed short q8ysums1 = vec_xl(16, y[i].bsums);

        // low nibble of each scales byte = sub-block scale, high nibble = min
        vector signed char q2xmins = (vector signed char)vec_xl( 0, x[i].scales);
        vector signed char vscales = vec_and(q2xmins, lowScaleMask);

        q2xmins = vec_sr(q2xmins, v4);
        vector signed short q2xmins0 = vec_unpackh(q2xmins);
        vector signed short q2xmins1 = vec_unpackl(q2xmins);

        // min_j * bsum_j products, subtracted below via vec_nmsub
        vector signed int prod0 = vec_mule(q2xmins0, q8ysums0);
        vector signed int prod1 = vec_mulo(q2xmins0, q8ysums0);
        vector signed int prod2 = vec_mule(q2xmins1, q8ysums1);
        vector signed int prod3 = vec_mulo(q2xmins1, q8ysums1);

        vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
        vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
        vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
        vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);

        vector signed int vsumi0 = v0;
        vector signed int vsumi1 = v0;
        vector signed int vsumi2 = v0;
        vector signed int vsumi3 = v0;
        vector signed int vsumi4 = v0;
        vector signed int vsumi5 = v0;
        vector signed int vsumi6 = v0;
        vector signed int vsumi7 = v0;

        const uint8_t * GGML_RESTRICT q2 = x[i].qs;
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;

        for (int j = 0; j < QK_K/128; ++j) {
            __builtin_prefetch(q2, 0, 1);
            __builtin_prefetch(q8, 0, 1);

            vector signed char qxs0 = (vector signed char)vec_xl( 0, q2);
            vector signed char qxs1 = (vector signed char)vec_xl(16, q2);
            q2 += 32;

            // extract the four 2-bit planes of each 32-byte chunk
            vector unsigned char q2x00 = (vector unsigned char)vec_and(qxs0, lowMask);
            vector unsigned char q2x01 = (vector unsigned char)vec_and(vec_sr(qxs0, v2), lowMask);
            vector unsigned char q2x02 = (vector unsigned char)vec_and(vec_sr(qxs0, v4), lowMask);
            vector unsigned char q2x03 = (vector unsigned char)vec_and(vec_sr(qxs0, v6), lowMask);
            vector unsigned char q2x10 = (vector unsigned char)vec_and(qxs1, lowMask);
            vector unsigned char q2x11 = (vector unsigned char)vec_and(vec_sr(qxs1, v2), lowMask);
            vector unsigned char q2x12 = (vector unsigned char)vec_and(vec_sr(qxs1, v4), lowMask);
            vector unsigned char q2x13 = (vector unsigned char)vec_and(vec_sr(qxs1, v6), lowMask);

            vector signed char q8y00 = vec_xl(  0, q8);
            vector signed char q8y10 = vec_xl( 16, q8);
            vector signed char q8y01 = vec_xl( 32, q8);
            vector signed char q8y11 = vec_xl( 48, q8);
            vector signed char q8y02 = vec_xl( 64, q8);
            vector signed char q8y12 = vec_xl( 80, q8);
            vector signed char q8y03 = vec_xl( 96, q8);
            vector signed char q8y13 = vec_xl(112, q8);
            q8 += 128;

            vector signed int qv0 = vec_msum(q8y00, q2x00, v0);
            vector signed int qv1 = vec_msum(q8y01, q2x01, v0);
            vector signed int qv2 = vec_msum(q8y02, q2x02, v0);
            vector signed int qv3 = vec_msum(q8y03, q2x03, v0);
            vector signed int qv4 = vec_msum(q8y10, q2x10, v0);
            vector signed int qv5 = vec_msum(q8y11, q2x11, v0);
            vector signed int qv6 = vec_msum(q8y12, q2x12, v0);
            vector signed int qv7 = vec_msum(q8y13, q2x13, v0);

            // widen the first 8 sub-block scales to int32 and broadcast each
            vector signed short vscales_07 = vec_unpackh(vscales);
            vector signed int vscales_03 = vec_unpackh(vscales_07);
            vector signed int vscales_47 = vec_unpackl(vscales_07);
            vector signed int vs0 = vec_splat(vscales_03, 0);
            vector signed int vs1 = vec_splat(vscales_03, 1);
            vector signed int vs2 = vec_splat(vscales_03, 2);
            vector signed int vs3 = vec_splat(vscales_03, 3);
            vector signed int vs4 = vec_splat(vscales_47, 0);
            vector signed int vs5 = vec_splat(vscales_47, 1);
            vector signed int vs6 = vec_splat(vscales_47, 2);
            vector signed int vs7 = vec_splat(vscales_47, 3);
            // rotate so the next outer-loop pass consumes the next 8 scales
            vscales = vec_sld(vscales, vscales, 8);

            // NOTE: the scale/product pairing (vs0,vs2,vs4,vs6 / vs1,vs3,...)
            // follows the interleaved q2/q8 sub-block order established above
            vsumi0 = vec_add(vec_mul(qv0, vs0), vsumi0);
            vsumi1 = vec_add(vec_mul(qv1, vs2), vsumi1);
            vsumi2 = vec_add(vec_mul(qv2, vs4), vsumi2);
            vsumi3 = vec_add(vec_mul(qv3, vs6), vsumi3);
            vsumi4 = vec_add(vec_mul(qv4, vs1), vsumi4);
            vsumi5 = vec_add(vec_mul(qv5, vs3), vsumi5);
            vsumi6 = vec_add(vec_mul(qv6, vs5), vsumi6);
            vsumi7 = vec_add(vec_mul(qv7, vs7), vsumi7);
        }

        vsumi0 = vec_add(vsumi0, vsumi4);
        vsumi1 = vec_add(vsumi1, vsumi5);
        vsumi2 = vec_add(vsumi2, vsumi6);
        vsumi3 = vec_add(vsumi3, vsumi7);

        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
    }

    // merge the four accumulators, then reduce the 4 float lanes
    vsumf0 = vec_add(vsumf0, vsumf2);
    vsumf1 = vec_add(vsumf1, vsumf3);

    vsumf0 = vec_add(vsumf0, vsumf1);

    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
 716
 717void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
 718    assert(n % QK_K == 0);
 719    assert(nrc == 1);
 720    UNUSED(nrc);
 721    UNUSED(bx);
 722    UNUSED(by);
 723    UNUSED(bs);
 724
 725    const uint32_t kmask1 = 0x03030303;
 726    const uint32_t kmask2 = 0x0f0f0f0f;
 727
 728    const block_q3_K * GGML_RESTRICT x = vx;
 729    const block_q8_K * GGML_RESTRICT y = vy;
 730
 731    const int nb = n / QK_K;
 732
 733#if defined(__POWER9_VECTOR__)
 734    const vector signed char lowMask = vec_splats((signed char)0x3);
 735    const vector signed char lowMask1 = vec_splats((int8_t)0xf);
 736    const vector signed char lowMask2 = vec_splats((int8_t)0x30);
 737    const vector int v0 = vec_splats((int32_t)0);
 738    const vector signed char v1 = vec_splats((signed char)0x1);
 739    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
 740    const vector unsigned char v3 = vec_splats((unsigned char)0x3);
 741    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
 742    const vector unsigned char v6 = vec_splats((unsigned char)0x6);
 743    const vector signed char off = vec_splats((signed char)0x20);
 744
 745    vector float vsumf0 = vec_splats(0.0f);
 746    vector float vsumf1 = vec_splats(0.0f);
 747    vector float vsumf2 = vec_splats(0.0f);
 748    vector float vsumf3 = vec_splats(0.0f);
 749
 750    for (int i = 0; i < nb; ++i) {
 751        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
 752        vector float vyd = vec_splats(y[i].d);
 753        vector float vd = vec_mul(vxd, vyd);
 754
 755        UNUSED(kmask1);
 756        UNUSED(kmask2);
 757
 758        vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
 759        vector signed char u1 = vec_and(u0, lowMask1);
 760        vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
 761        vector signed char u3 = (vector signed char)vec_mergeh((vector signed int)u2, (vector signed int)vec_sr(u2, v2));
 762        vector signed char u30 = vec_sl(vec_and(u3, lowMask), v4);
 763        vector signed char u31 = vec_and(u3, lowMask2);
 764
 765        u1 = vec_or(u1, u30);
 766        u2 = vec_or(vec_sr(u0, v4), u31);
 767
 768        vector signed char vscales = (vector signed char)vec_mergeh((vector signed long long)u1, (vector signed long long)u2);
 769        vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].hmask);
 770        vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].hmask);
 771
 772        vscales = vec_sub(vscales, off);
 773
 774        vector signed int vsumi0 = v0;
 775        vector signed int vsumi1 = v0;
 776        vector signed int vsumi2 = v0;
 777        vector signed int vsumi3 = v0;
 778        vector signed int vsumi4 = v0;
 779        vector signed int vsumi5 = v0;
 780        vector signed int vsumi6 = v0;
 781        vector signed int vsumi7 = v0;
 782
 783        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
 784        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
 785
 786        for (int j = 0; j < QK_K/128; ++j) {
 787            __builtin_prefetch(q3, 0, 1);
 788            __builtin_prefetch(q8, 0, 1);
 789
 790            vector signed char qxs0 = (vector signed char)vec_xl( 0, q3);
 791            vector signed char qxs1 = (vector signed char)vec_xl(16, q3);
 792            q3 += 32;
 793
 794            //the low 2 bits
 795            vector signed char qxs00 = vec_and(qxs0, lowMask);
 796            vector signed char qxs01 = vec_and(vec_sr(qxs0, v2), lowMask);
 797            vector signed char qxs02 = vec_and(vec_sr(qxs0, v4), lowMask);
 798            vector signed char qxs03 = vec_and(vec_sr(qxs0, v6), lowMask);
 799            vector signed char qxs10 = vec_and(qxs1, lowMask);
 800            vector signed char qxs11 = vec_and(vec_sr(qxs1, v2), lowMask);
 801            vector signed char qxs12 = vec_and(vec_sr(qxs1, v4), lowMask);
 802            vector signed char qxs13 = vec_and(vec_sr(qxs1, v6), lowMask);
 803
 804            //the 3rd bit
 805            vector signed char qxh00 = vec_sl(vec_andc(v1, qxhs0), v2);
 806            vector signed char qxh01 = vec_sl(vec_andc(v1, vec_sr(qxhs0, (vector unsigned char)v1)), v2);
 807            vector signed char qxh02 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v2)), v2);
 808            vector signed char qxh03 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v3)), v2);
 809            vector signed char qxh10 = vec_sl(vec_andc(v1, qxhs1), v2);
 810            vector signed char qxh11 = vec_sl(vec_andc(v1, vec_sr(qxhs1, (vector unsigned char)v1)), v2);
 811            vector signed char qxh12 = vec_sl(vec_andc(v1, vec_sr(qxhs1, v2)), v2);
 812            vector signed char qxh13 = vec_sl(vec_andc(v1, vec_sr(qxhs1, v3)), v2);
 813            qxhs0 = vec_sr(qxhs0, v4);
 814            qxhs1 = vec_sr(qxhs1, v4);
 815
 816            vector signed char q3x00 = vec_sub(qxs00, qxh00);
 817            vector signed char q3x01 = vec_sub(qxs01, qxh01);
 818            vector signed char q3x02 = vec_sub(qxs02, qxh02);
 819            vector signed char q3x03 = vec_sub(qxs03, qxh03);
 820            vector signed char q3x10 = vec_sub(qxs10, qxh10);
 821            vector signed char q3x11 = vec_sub(qxs11, qxh11);
 822            vector signed char q3x12 = vec_sub(qxs12, qxh12);
 823            vector signed char q3x13 = vec_sub(qxs13, qxh13);
 824
 825            vector signed char q8y00 = vec_xl(  0, q8);
 826            vector signed char q8y10 = vec_xl( 16, q8);
 827            vector signed char q8y01 = vec_xl( 32, q8);
 828            vector signed char q8y11 = vec_xl( 48, q8);
 829            vector signed char q8y02 = vec_xl( 64, q8);
 830            vector signed char q8y12 = vec_xl( 80, q8);
 831            vector signed char q8y03 = vec_xl( 96, q8);
 832            vector signed char q8y13 = vec_xl(112, q8);
 833            q8 += 128;
 834
 835            vector signed short vscales_h = vec_unpackh(vscales);
 836            vector signed short vs0 = vec_splat(vscales_h, 0);
 837            vector signed short vs1 = vec_splat(vscales_h, 1);
 838            vector signed short vs2 = vec_splat(vscales_h, 2);
 839            vector signed short vs3 = vec_splat(vscales_h, 3);
 840            vector signed short vs4 = vec_splat(vscales_h, 4);
 841            vector signed short vs5 = vec_splat(vscales_h, 5);
 842            vector signed short vs6 = vec_splat(vscales_h, 6);
 843            vector signed short vs7 = vec_splat(vscales_h, 7);
 844            vscales = vec_sld(vscales, vscales, 8);
 845
 846            vector signed short qv00 = vec_add(vec_mule(q3x00, q8y00), vec_mulo(q3x00, q8y00));
 847            vector signed short qv01 = vec_add(vec_mule(q3x01, q8y01), vec_mulo(q3x01, q8y01));
 848            vector signed short qv02 = vec_add(vec_mule(q3x02, q8y02), vec_mulo(q3x02, q8y02));
 849            vector signed short qv03 = vec_add(vec_mule(q3x03, q8y03), vec_mulo(q3x03, q8y03));
 850            vector signed short qv10 = vec_add(vec_mule(q3x10, q8y10), vec_mulo(q3x10, q8y10));
 851            vector signed short qv11 = vec_add(vec_mule(q3x11, q8y11), vec_mulo(q3x11, q8y11));
 852            vector signed short qv12 = vec_add(vec_mule(q3x12, q8y12), vec_mulo(q3x12, q8y12));
 853            vector signed short qv13 = vec_add(vec_mule(q3x13, q8y13), vec_mulo(q3x13, q8y13));
 854
 855            vsumi0 = vec_msum(qv00, vs0, vsumi0);
 856            vsumi1 = vec_msum(qv01, vs2, vsumi1);
 857            vsumi2 = vec_msum(qv02, vs4, vsumi2);
 858            vsumi3 = vec_msum(qv03, vs6, vsumi3);
 859            vsumi4 = vec_msum(qv10, vs1, vsumi4);
 860            vsumi5 = vec_msum(qv11, vs3, vsumi5);
 861            vsumi6 = vec_msum(qv12, vs5, vsumi6);
 862            vsumi7 = vec_msum(qv13, vs7, vsumi7);
 863        }
 864
 865        vsumi0 = vec_add(vsumi0, vsumi4);
 866        vsumi1 = vec_add(vsumi1, vsumi5);
 867        vsumi2 = vec_add(vsumi2, vsumi6);
 868        vsumi3 = vec_add(vsumi3, vsumi7);
 869
 870        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
 871        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
 872        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
 873        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
 874    }
 875
 876    vsumf0 = vec_add(vsumf0, vsumf2);
 877    vsumf1 = vec_add(vsumf1, vsumf3);
 878
 879    vsumf0 = vec_add(vsumf0, vsumf1);
 880
 881    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
 882    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
 883
 884    *s = vec_extract(vsumf0, 0);
 885
 886#else
 887    UNUSED(kmask1);
 888    UNUSED(kmask2);
 889    UNUSED(x);
 890    UNUSED(y);
 891    UNUSED(nb);
 892    ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
 893#endif
 894}
 895
// Dot product of n q4_K-quantized values (vx) with n q8_K-quantized values
// (vy); the scalar result is written to *s.  n must be a multiple of QK_K and
// nrc must be 1 (bs/bx/by/nrc exist only for the multi-row call interface and
// are unused here).  POWER9 VSX path; all other targets fall through to the
// generic kernel at the bottom.
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q4_K * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K; // number of superblocks

    // Masks/scratch for the scalar 6-bit scale-unpacking scheme.  The VSX
    // path below unpacks the scales with vector ops instead, so these are
    // only referenced via UNUSED().
    static const uint32_t kmask1 = 0x3f3f3f3f;
    static const uint32_t kmask2 = 0x0f0f0f0f;
    static const uint32_t kmask3 = 0x03030303;

    uint32_t utmp[4];

#if defined(__POWER9_VECTOR__)
    const vector signed char lowMask = vec_splats((signed char)0xF);
    const vector signed char lowMask1 = vec_splats((int8_t)0x3f);
    const vector signed char lowMask2 = vec_splats((int8_t)0x30);
    const vector int v0 = vec_splats((int32_t)0);
    const vector unsigned char v2 = vec_splats((uint8_t)2);
    const vector unsigned char v4 = vec_splats((unsigned char)0x4);

    // Four partial float accumulators, reduced to one scalar at the end.
    vector float vsumf0 = vec_splats(0.0f);
    vector float vsumf1 = vec_splats(0.0f);
    vector float vsumf2 = vec_splats(0.0f);
    vector float vsumf3 = vec_splats(0.0f);

    for (int i = 0; i < nb; ++i) {
        // Per-superblock combined scale d(x) * d(y).
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
        vector float vyd = vec_splats(y[i].d);
        vector float vd = vec_mul(vxd, vyd);

        // dmin(x) * d(y), used for the mins-correction term below.
        vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].dmin));
        vector float vdmin = vec_mul(vxmin, vyd);

        // Per-16-element sums of y, paired with the q4_K mins.
        vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
        vector signed short q8ysums1 = vec_xl(16, y[i].bsums);

        UNUSED(kmask1);
        UNUSED(kmask2);
        UNUSED(kmask3);
        UNUSED(utmp);

        // Unpack the 12 packed bytes of x[i].scales (6-bit scales and mins)
        // into utmps: 8 scale bytes in the low half, 8 min bytes in the high
        // half (vector analogue of the kmask1/2/3 scalar unpacking).
        vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
        vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2);
        vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
        vector signed char u3 = vec_sr(u2, v4);

        vector signed char u30 = u1;
        vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3);

        u1 = vec_and(u0, lowMask1);
        u2 = vec_or(u30, u31);

        vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2);

        vector signed short vscales = vec_unpackh(utmps); // 8 sub-block scales
        vector signed short q4xmins = vec_unpackl(utmps); // 8 sub-block mins
        vector signed short q4xmins0 = vec_mergeh(q4xmins, q4xmins);
        vector signed short q4xmins1 = vec_mergel(q4xmins, q4xmins);

        vector signed int prod0 = vec_mule(q4xmins0, q8ysums0);
        vector signed int prod1 = vec_mule(q4xmins1, q8ysums1);
        vector signed int prod2 = vec_mulo(q4xmins0, q8ysums0);
        vector signed int prod3 = vec_mulo(q4xmins1, q8ysums1);

        // Mins correction: acc -= dmin * sum_j(min_j * bsum_j).
        // vec_nmsub(a, b, c) computes c - a*b.
        vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
        vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
        vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
        vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);

        vector signed int vsumi0 = v0;
        vector signed int vsumi1 = v0;
        vector signed int vsumi2 = v0;
        vector signed int vsumi3 = v0;

        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;

        // Each iteration consumes 64 q4 bytes (128 nibbles) and 128 q8 bytes.
        for (int j = 0; j < QK_K/64; j+=2) {
            __builtin_prefetch(q4, 0, 1);
            __builtin_prefetch(q8, 0, 1);

            vector signed char qxs0 = (vector signed char)vec_xl( 0, q4);
            vector signed char qxs1 = (vector signed char)vec_xl(16, q4);
            vector signed char qxs2 = (vector signed char)vec_xl(32, q4);
            vector signed char qxs3 = (vector signed char)vec_xl(48, q4);
            q4 += 64;

            // Split each byte into its low and high unsigned 4-bit values.
            vector unsigned char q4x00 = (vector unsigned char)vec_and(qxs0, lowMask);
            vector unsigned char q4x01 = (vector unsigned char)vec_sr(qxs0, v4);
            vector unsigned char q4x10 = (vector unsigned char)vec_and(qxs1, lowMask);
            vector unsigned char q4x11 = (vector unsigned char)vec_sr(qxs1, v4);
            vector unsigned char q4x20 = (vector unsigned char)vec_and(qxs2, lowMask);
            vector unsigned char q4x21 = (vector unsigned char)vec_sr(qxs2, v4);
            vector unsigned char q4x30 = (vector unsigned char)vec_and(qxs3, lowMask);
            vector unsigned char q4x31 = (vector unsigned char)vec_sr(qxs3, v4);

            vector signed char q8y00 = vec_xl(  0, q8);
            vector signed char q8y10 = vec_xl( 16, q8);
            vector signed char q8y01 = vec_xl( 32, q8);
            vector signed char q8y11 = vec_xl( 48, q8);
            vector signed char q8y20 = vec_xl( 64, q8);
            vector signed char q8y30 = vec_xl( 80, q8);
            vector signed char q8y21 = vec_xl( 96, q8);
            vector signed char q8y31 = vec_xl(112, q8);
            q8 += 128;

            // vec_msum(signed, unsigned, acc): sums of 4 adjacent byte
            // products per 32-bit lane.
            vector signed int qv00 = vec_msum(q8y00, q4x00, v0);
            vector signed int qv01 = vec_msum(q8y01, q4x01, v0);
            vector signed int qv10 = vec_msum(q8y10, q4x10, v0);
            vector signed int qv11 = vec_msum(q8y11, q4x11, v0);
            vector signed int qv20 = vec_msum(q8y20, q4x20, v0);
            vector signed int qv21 = vec_msum(q8y21, q4x21, v0);
            vector signed int qv30 = vec_msum(q8y30, q4x30, v0);
            vector signed int qv31 = vec_msum(q8y31, q4x31, v0);

            // Broadcast one 6-bit scale per 32-value sub-block, then rotate
            // vscales so the next iteration sees the following scales.
            vector signed int vscales_h = vec_unpackh(vscales);
            vector signed int vs0 = vec_splat(vscales_h, 0);
            vector signed int vs1 = vec_splat(vscales_h, 1);
            vector signed int vs2 = vec_splat(vscales_h, 2);
            vector signed int vs3 = vec_splat(vscales_h, 3);
            vscales = vec_sld(vscales, vscales, 8);

            vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0);
            vsumi1 = vec_add(vec_mul(qv01, vs1), vsumi1);
            vsumi2 = vec_add(vec_mul(qv20, vs2), vsumi2);
            vsumi3 = vec_add(vec_mul(qv21, vs3), vsumi3);

            vsumi0 = vec_add(vec_mul(qv10, vs0), vsumi0);
            vsumi1 = vec_add(vec_mul(qv11, vs1), vsumi1);
            vsumi2 = vec_add(vec_mul(qv30, vs2), vsumi2);
            vsumi3 = vec_add(vec_mul(qv31, vs3), vsumi3);
        }

        // Fold this superblock's integer sums into the float accumulators.
        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
    }

    // Horizontal reduction of the four accumulators to a single scalar.
    vsumf0 = vec_add(vsumf0, vsumf2);
    vsumf1 = vec_add(vsumf1, vsumf3);

    vsumf0 = vec_add(vsumf0, vsumf1);

    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(utmp);
    ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
1063
// Dot product of n q5_K-quantized values (vx) with n q8_K-quantized values
// (vy); the scalar result is written to *s.  n must be a multiple of QK_K and
// nrc must be 1 (bs/bx/by/nrc are unused in this single-row path).  Same
// structure as the q4_K kernel above, plus the fifth bit taken from x[i].qh.
// POWER9 VSX path; other targets use the generic kernel.
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy,  size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q5_K * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K; // number of superblocks

    // Scalar scale-unpacking masks/scratch; unused in the vector path below.
    static const uint32_t kmask1 = 0x3f3f3f3f;
    static const uint32_t kmask2 = 0x0f0f0f0f;
    static const uint32_t kmask3 = 0x03030303;

    uint32_t utmp[4];

#if defined(__POWER9_VECTOR__)
    const vector signed char lowMask = vec_splats((signed char)0xF);
    const vector signed char lowMask1 = vec_splats((int8_t)0x3f);
    const vector signed char lowMask2 = vec_splats((int8_t)0x30);
    const vector int v0 = vec_splats((int32_t)0);
    const vector unsigned char v1 = vec_splats((unsigned char)0x1);
    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
    const vector unsigned char v3 = vec_splats((unsigned char)0x3);
    const vector unsigned char v4 = vec_splats((unsigned char)0x4);

    // Four partial float accumulators, reduced to one scalar at the end.
    vector float vsumf0 = vec_splats(0.0f);
    vector float vsumf1 = vec_splats(0.0f);
    vector float vsumf2 = vec_splats(0.0f);
    vector float vsumf3 = vec_splats(0.0f);

    for (int i = 0; i < nb; ++i) {
        // Per-superblock combined scale d(x) * d(y).
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
        vector float vyd = vec_splats(y[i].d);
        vector float vd = vec_mul(vxd, vyd);

        // dmin(x) * d(y), used for the mins-correction term below.
        vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].dmin));
        vector float vdmin = vec_mul(vxmin, vyd);

        UNUSED(kmask1);
        UNUSED(kmask2);
        UNUSED(kmask3);
        UNUSED(utmp);

        // Unpack the 12 packed bytes of x[i].scales (6-bit scales and mins)
        // into utmps: 8 scale bytes in the low half, 8 min bytes in the high
        // half (vector analogue of the kmask1/2/3 scalar unpacking).
        vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
        vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2);
        vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
        vector signed char u3 = vec_sr(u2, v4);

        vector signed char u30 = u1;
        vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3);

        u1 = vec_and(u0, lowMask1);
        u2 = vec_or(u30, u31);

        vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2);

        // Per-16-element sums of y, paired with the q5_K mins.
        vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
        vector signed short q8ysums1 = vec_xl(16, y[i].bsums);

        vector signed short vscales = vec_unpackh(utmps); // 8 sub-block scales

        vector signed short q5xmins = vec_unpackl(utmps); // 8 sub-block mins
        vector signed short q5xmins0 = vec_mergeh(q5xmins, q5xmins);
        vector signed short q5xmins1 = vec_mergel(q5xmins, q5xmins);

        vector signed int prod0 = vec_mule(q5xmins0, q8ysums0);
        vector signed int prod1 = vec_mule(q5xmins1, q8ysums1);
        vector signed int prod2 = vec_mulo(q5xmins0, q8ysums0);
        vector signed int prod3 = vec_mulo(q5xmins1, q8ysums1);

        // Mins correction: acc -= dmin * sum_j(min_j * bsum_j).
        // vec_nmsub(a, b, c) computes c - a*b.
        vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
        vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
        vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
        vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);

        // High (fifth) bits for the whole superblock; shifted right by 2
        // each loop iteration as bits are consumed.
        vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].qh);
        vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].qh);

        vector signed int vsumi0 = v0;
        vector signed int vsumi1 = v0;
        vector signed int vsumi2 = v0;
        vector signed int vsumi3 = v0;

        const uint8_t * GGML_RESTRICT q5 = x[i].qs;
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;

        // Each iteration consumes 32 q5 low-nibble bytes and 64 q8 bytes.
        for (int j = 0; j < QK_K/64; ++j) {
            __builtin_prefetch(q5, 0, 1);
            __builtin_prefetch(q8, 0, 1);

            vector signed char qxs0 = (vector signed char)vec_xl( 0, q5);
            vector signed char qxs1 = (vector signed char)vec_xl(16, q5);
            q5 += 32;

            // Low and high 4-bit halves of each byte.
            vector signed char qxs00 = vec_and(qxs0, lowMask);
            vector signed char qxs01 = vec_sr(qxs0, v4);
            vector signed char qxs10 = vec_and(qxs1, lowMask);
            vector signed char qxs11 = vec_sr(qxs1, v4);

            // Extract the per-sub-block high bit and move it to bit 4.
            vector signed char q5h00 = vec_sl(vec_and((vector signed char)v1, qxhs0), v4);
            vector signed char q5h01 = vec_sl(vec_and((vector signed char)v2, qxhs0), v3);
            vector signed char q5h10 = vec_sl(vec_and((vector signed char)v1, qxhs1), v4);
            vector signed char q5h11 = vec_sl(vec_and((vector signed char)v2, qxhs1), v3);
            qxhs0 = vec_sr(qxhs0, v2);
            qxhs1 = vec_sr(qxhs1, v2);

            // Recombine into unsigned 5-bit values.
            vector unsigned char q5x00 = (vector unsigned char)vec_or(q5h00, qxs00);
            vector unsigned char q5x01 = (vector unsigned char)vec_or(q5h01, qxs01);
            vector unsigned char q5x10 = (vector unsigned char)vec_or(q5h10, qxs10);
            vector unsigned char q5x11 = (vector unsigned char)vec_or(q5h11, qxs11);

            vector signed char q8y00 = vec_xl( 0, q8);
            vector signed char q8y10 = vec_xl(16, q8);
            vector signed char q8y01 = vec_xl(32, q8);
            vector signed char q8y11 = vec_xl(48, q8);
            q8 += 64;

            // vec_msum(signed, unsigned, acc): sums of 4 adjacent byte
            // products per 32-bit lane.
            vector signed int qv00 = vec_msum(q8y00, q5x00, v0);
            vector signed int qv01 = vec_msum(q8y01, q5x01, v0);
            vector signed int qv10 = vec_msum(q8y10, q5x10, v0);
            vector signed int qv11 = vec_msum(q8y11, q5x11, v0);

            // Two scales are consumed per iteration; rotate vscales so the
            // next iteration sees the following pair.
            vector signed int vscales_h = vec_unpackh(vscales);
            vector signed int vs0 = vec_splat(vscales_h, 0);
            vector signed int vs1 = vec_splat(vscales_h, 1);
            vscales = vec_sld(vscales, vscales, 12);

            vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0);
            vsumi1 = vec_add(vec_mul(qv10, vs0), vsumi1);
            vsumi2 = vec_add(vec_mul(qv01, vs1), vsumi2);
            vsumi3 = vec_add(vec_mul(qv11, vs1), vsumi3);
        }

        // Fold this superblock's integer sums into the float accumulators.
        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
    }

    // Horizontal reduction of the four accumulators to a single scalar.
    vsumf0 = vec_add(vsumf0, vsumf2);
    vsumf1 = vec_add(vsumf1, vsumf3);

    vsumf0 = vec_add(vsumf0, vsumf1);

    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(utmp);
    ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
1228
// Dot product of n q6_K-quantized values (vx) with n q8_K-quantized values
// (vy); the scalar result is written to *s.  n must be a multiple of QK_K and
// nrc must be 1 (bs/bx/by/nrc are unused in this single-row path).  q6_K has
// no mins term: each 6-bit value is rebuilt from 4 low bits (ql) plus 2 high
// bits (qh) and re-centered by subtracting 32.  POWER9 VSX path; other
// targets use the generic kernel.
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q6_K * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K; // number of superblocks

#if defined(__POWER9_VECTOR__)
    const vector signed char lowMask = vec_splats((signed char)0xF);
    const vector int v0 = vec_splats((int32_t)0);
    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
    const vector unsigned char v3 = vec_splats((unsigned char)0x3);
    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
    const vector unsigned char v6 = vec_splats((unsigned char)0x6);
    const vector signed char off = vec_splats((signed char)0x20); // q6_K bias of 32

    // Four partial float accumulators, reduced to one scalar at the end.
    vector float vsumf0 = vec_splats(0.0f);
    vector float vsumf1 = vec_splats(0.0f);
    vector float vsumf2 = vec_splats(0.0f);
    vector float vsumf3 = vec_splats(0.0f);

    for (int i = 0; i < nb; ++i) {
        // Per-superblock combined scale d(x) * d(y).
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
        vector float vyd = vec_splats(y[i].d);
        vector float vd = vec_mul(vxd, vyd);

        // Eight integer accumulators; pairs are merged before the float fold.
        vector signed int vsumi0 = v0;
        vector signed int vsumi1 = v0;
        vector signed int vsumi2 = v0;
        vector signed int vsumi3 = v0;
        vector signed int vsumi4 = v0;
        vector signed int vsumi5 = v0;
        vector signed int vsumi6 = v0;
        vector signed int vsumi7 = v0;

        const uint8_t * GGML_RESTRICT q6 = x[i].ql;     // low 4 bits
        const uint8_t * GGML_RESTRICT qh = x[i].qh;     // high 2 bits
        const int8_t  * GGML_RESTRICT qs = x[i].scales; // int8 sub-block scales
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;

        // Each iteration consumes 64 ql bytes, 32 qh bytes, 8 scales and
        // 128 q8 bytes (i.e. 128 quantized values).
        for (int j = 0; j < QK_K/128; ++j) {
            __builtin_prefetch(q6, 0, 0);
            __builtin_prefetch(qh, 0, 0);
            __builtin_prefetch(q8, 0, 0);

            vector signed char qxs0 = (vector signed char)vec_xl( 0, q6);
            vector signed char qxs1 = (vector signed char)vec_xl(16, q6);
            vector signed char qxs2 = (vector signed char)vec_xl(32, q6);
            vector signed char qxs3 = (vector signed char)vec_xl(48, q6);
            q6 += 64;

            // Low and high 4-bit halves of each ql byte.
            vector signed char qxs00 = vec_and(qxs0, lowMask);
            vector signed char qxs01 = vec_sr(qxs0, v4);
            vector signed char qxs10 = vec_and(qxs1, lowMask);
            vector signed char qxs11 = vec_sr(qxs1, v4);
            vector signed char qxs20 = vec_and(qxs2, lowMask);
            vector signed char qxs21 = vec_sr(qxs2, v4);
            vector signed char qxs30 = vec_and(qxs3, lowMask);
            vector signed char qxs31 = vec_sr(qxs3, v4);

            vector signed char qxhs0 = (vector signed char)vec_xl( 0, qh);
            vector signed char qxhs1 = (vector signed char)vec_xl(16, qh);
            qh += 32;

            // Extract each 2-bit high field and shift it up to bits 4-5.
            vector signed char qxh00 = vec_sl(vec_and((vector signed char)v3, qxhs0), v4);
            vector signed char qxh01 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v4)), v4);
            vector signed char qxh10 = vec_sl(vec_and((vector signed char)v3, qxhs1), v4);
            vector signed char qxh11 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v4)), v4);
            vector signed char qxh20 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v2)), v4);
            vector signed char qxh21 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v6)), v4);
            vector signed char qxh30 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v2)), v4);
            vector signed char qxh31 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v6)), v4);

            // Recombine to 6-bit values and subtract the bias of 32.
            vector signed char q6x00 = vec_sub(vec_or(qxh00, qxs00), off);
            vector signed char q6x01 = vec_sub(vec_or(qxh01, qxs01), off);
            vector signed char q6x10 = vec_sub(vec_or(qxh10, qxs10), off);
            vector signed char q6x11 = vec_sub(vec_or(qxh11, qxs11), off);
            vector signed char q6x20 = vec_sub(vec_or(qxh20, qxs20), off);
            vector signed char q6x21 = vec_sub(vec_or(qxh21, qxs21), off);
            vector signed char q6x30 = vec_sub(vec_or(qxh30, qxs30), off);
            vector signed char q6x31 = vec_sub(vec_or(qxh31, qxs31), off);

            vector signed char q8y00 = vec_xl(  0, q8);
            vector signed char q8y10 = vec_xl( 16, q8);
            vector signed char q8y20 = vec_xl( 32, q8);
            vector signed char q8y30 = vec_xl( 48, q8);
            vector signed char q8y01 = vec_xl( 64, q8);
            vector signed char q8y11 = vec_xl( 80, q8);
            vector signed char q8y21 = vec_xl( 96, q8);
            vector signed char q8y31 = vec_xl(112, q8);
            q8 += 128;

            // Sum even and odd byte products into 16-bit lanes.
            vector signed short qv00 = vec_add(vec_mule(q6x00, q8y00), vec_mulo(q6x00, q8y00));
            vector signed short qv10 = vec_add(vec_mule(q6x10, q8y10), vec_mulo(q6x10, q8y10));
            vector signed short qv20 = vec_add(vec_mule(q6x20, q8y20), vec_mulo(q6x20, q8y20));
            vector signed short qv30 = vec_add(vec_mule(q6x30, q8y30), vec_mulo(q6x30, q8y30));
            vector signed short qv01 = vec_add(vec_mule(q6x01, q8y01), vec_mulo(q6x01, q8y01));
            vector signed short qv11 = vec_add(vec_mule(q6x11, q8y11), vec_mulo(q6x11, q8y11));
            vector signed short qv21 = vec_add(vec_mule(q6x21, q8y21), vec_mulo(q6x21, q8y21));
            vector signed short qv31 = vec_add(vec_mule(q6x31, q8y31), vec_mulo(q6x31, q8y31));

            // Load and sign-extend 8 int8 scales, one per 16-value sub-block.
            vector signed short vscales = vec_unpackh(vec_xl_len(qs, 8));
            qs += 8;

            vector signed short vs0 = vec_splat(vscales, 0);
            vector signed short vs1 = vec_splat(vscales, 1);
            vector signed short vs2 = vec_splat(vscales, 2);
            vector signed short vs3 = vec_splat(vscales, 3);
            vector signed short vs4 = vec_splat(vscales, 4);
            vector signed short vs5 = vec_splat(vscales, 5);
            vector signed short vs6 = vec_splat(vscales, 6);
            vector signed short vs7 = vec_splat(vscales, 7);

            // Multiply-accumulate scaled 16-bit sums into 32-bit lanes.
            vsumi0 = vec_msum(qv00, vs0, vsumi0);
            vsumi1 = vec_msum(qv01, vs4, vsumi1);
            vsumi2 = vec_msum(qv10, vs1, vsumi2);
            vsumi3 = vec_msum(qv11, vs5, vsumi3);
            vsumi4 = vec_msum(qv20, vs2, vsumi4);
            vsumi5 = vec_msum(qv21, vs6, vsumi5);
            vsumi6 = vec_msum(qv30, vs3, vsumi6);
            vsumi7 = vec_msum(qv31, vs7, vsumi7);
        }

        vsumi0 = vec_add(vsumi0, vsumi4);
        vsumi1 = vec_add(vsumi1, vsumi5);
        vsumi2 = vec_add(vsumi2, vsumi6);
        vsumi3 = vec_add(vsumi3, vsumi7);

        // Fold this superblock's integer sums into the float accumulators.
        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
    }

    // Horizontal reduction of the four accumulators to a single scalar.
    vsumf0 = vec_add(vsumf0, vsumf2);
    vsumf1 = vec_add(vsumf1, vsumf3);

    vsumf0 = vec_add(vsumf0, vsumf1);

    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
1386
#if defined (__POWER9_VECTOR__)
// Precomputed sign table for the iq2 kernels below: 1024 bytes = 128 rows of
// 8 entries, each entry being +1 or -1.  The kernels index it as 8-byte rows
// via a cast to const uint64_t * (see ggml_vec_dot_iq2_xxs_q8_K).
// NOTE(review): rows appear to be the sign patterns with an even number of
// -1 factors ("k-even signs"), indexed by a 7-bit code — confirm against the
// generic iq2_xxs implementation.
static const int8_t keven_signs_q2xs[1024] = {
     1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1,  1,
     1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1,  1,  1, -1, -1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1, -1,
     1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1, -1,
     1,  1, -1, -1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1, -1,  1, -1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1,  1,
     1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1, -1,
     1,  1, -1,  1, -1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1, -1,  1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1,  1,
     1,  1,  1, -1, -1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1,  1,
     1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1,  1,  1, -1, -1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1, -1,
     1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1,  1, -1,  1, -1,
     1,  1, -1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1,  1,
     1,  1,  1, -1,  1, -1,  1,  1, -1,  1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1,  1,
     1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1,  1,  1, -1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1,
     1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1,  1,
     1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1, -1,
     1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1,  1,  1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1, -1,
     1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1,  1,
     1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1, -1, -1,
     1,  1, -1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1,  1,
     1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1, -1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1,  1,
     1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1,  1,  1, -1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1, -1,
     1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1, -1,  1, -1, -1,  1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1,  1,
     1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1, -1,
     1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1,  1,  1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1, -1,
     1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1, -1,  1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1,  1,
     1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1, -1, -1, -1,  1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1,  1,
     1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1, -1,  1,  1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1, -1,
     1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1, -1,
     1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1,  1,
     1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1, -1,  1,  1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1, -1,
     1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1, -1,  1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1,  1,
     1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1,  1,
     1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1,  1,  1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,
};
#endif
1423
// Dot product of one row of IQ2_XXS-quantized blocks (vx) with a row of
// Q8_K-quantized blocks (vy); the scalar result is written to *s.
// n is the number of values (must be a multiple of QK_K); bs/bx/by and
// nrc > 1 are unused here (single-row path only, asserted below).
// POWER9/VSX SIMD path; all other targets dispatch to the generic
// implementation at the bottom.
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq2_xxs * GGML_RESTRICT x = vx;
    const block_q8_K    * GGML_RESTRICT y = vy;

    const int nb = n / QK_K; // number of super-blocks

#if defined(__POWER9_VECTOR__)
    const vector int v0 = vec_splats((int32_t)0);
    // four independent float accumulators, combined at the end to shorten
    // the dependency chain
    vector float vsumf0 = vec_splats(0.0f);
    vector float vsumf1 = vec_splats(0.0f);
    vector float vsumf2 = vec_splats(0.0f);
    vector float vsumf3 = vec_splats(0.0f);

    // each 64-bit entry expands a 7-bit sign index into 8 bytes of +1/-1
    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;

    for (int i = 0; i < nb; ++i) {
        // combined per-super-block scale d = x[i].d * y[i].d
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
        vector float vyd = vec_splats(y[i].d);
        vector float vd = vec_mul(vxd, vyd);

        vector signed int vsumi0 = v0;
        vector signed int vsumi1 = v0;
        vector signed int vsumi2 = v0;
        vector signed int vsumi3 = v0;

        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
        const int8_t  *  GGML_RESTRICT q8 = y[i].qs;

        // 64 quantized values per iteration
        for (int j = 0; j < QK_K/32; j += 2) {
            __builtin_prefetch(q2, 0, 1);
            __builtin_prefetch(q8, 0, 1);

            uint32_t aux32[4];
            const uint8_t * aux8 = (const uint8_t *)aux32;

            // aux32[0]/aux32[2]: four 8-bit grid indices each (read via aux8);
            // aux32[1]/aux32[3]: four 7-bit sign indices plus the 4-bit scale
            // in the top nibble
            memcpy(aux32, q2, 4*sizeof(uint32_t));
            q2 += 8;

            // gather 8-byte grid patterns, two per 16-byte vector
            vector signed long long aux64x2_0 = {*(const int64_t *)(iq2xxs_grid + aux8[ 0]), *(const int64_t *)(iq2xxs_grid + aux8[ 1])};
            vector signed long long aux64x2_1 = {*(const int64_t *)(iq2xxs_grid + aux8[ 2]), *(const int64_t *)(iq2xxs_grid + aux8[ 3])};
            vector signed long long aux64x2_2 = {*(const int64_t *)(iq2xxs_grid + aux8[ 8]), *(const int64_t *)(iq2xxs_grid + aux8[ 9])};
            vector signed long long aux64x2_3 = {*(const int64_t *)(iq2xxs_grid + aux8[10]), *(const int64_t *)(iq2xxs_grid + aux8[11])};

            // gather the matching +1/-1 sign bytes (7-bit index per 8 values)
            vector signed long long vsigns0 = {*(const int64_t *)(signs64 + ((aux32[1] >>  0) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >>  7) & 127))};
            vector signed long long vsigns1 = {*(const int64_t *)(signs64 + ((aux32[1] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 21) & 127))};
            vector signed long long vsigns2 = {*(const int64_t *)(signs64 + ((aux32[3] >>  0) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >>  7) & 127))};
            vector signed long long vsigns3 = {*(const int64_t *)(signs64 + ((aux32[3] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 21) & 127))};

            // apply signs: byte-wise multiply by +1/-1
            vector signed char q2x0 = (vector signed char)vec_mul((vector signed char)vsigns0, (vector signed char)aux64x2_0);
            vector signed char q2x1 = (vector signed char)vec_mul((vector signed char)vsigns1, (vector signed char)aux64x2_1);
            vector signed char q2x2 = (vector signed char)vec_mul((vector signed char)vsigns2, (vector signed char)aux64x2_2);
            vector signed char q2x3 = (vector signed char)vec_mul((vector signed char)vsigns3, (vector signed char)aux64x2_3);

            vector signed char q8y0 = vec_xl( 0, q8);
            vector signed char q8y1 = vec_xl(16, q8);
            vector signed char q8y2 = vec_xl(32, q8);
            vector signed char q8y3 = vec_xl(48, q8);
            q8 += 64;

            // even+odd byte products -> 8 x int16 partial sums per vector
            vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0));
            vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1));
            vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2));
            vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3));

            // 4-bit block scales stored in the top nibble of the sign words
            const uint16_t ls0 = aux32[1] >> 28;
            const uint16_t ls1 = aux32[3] >> 28;

            // scale mapping 2*ls+1, same as the generic path
            vector signed short vscales01 = vec_splats((int16_t)(2*ls0+1));
            vector signed short vscales23 = vec_splats((int16_t)(2*ls1+1));

            vsumi0 = vec_msum(qv0, vscales01, vsumi0);
            vsumi1 = vec_msum(qv1, vscales01, vsumi1);
            vsumi2 = vec_msum(qv2, vscales23, vsumi2);
            vsumi3 = vec_msum(qv3, vscales23, vsumi3);
        }

        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
    }

    // horizontal reduction of the four accumulators down to a single lane
    vsumf0 = vec_add(vsumf0, vsumf2);
    vsumf1 = vec_add(vsumf1, vsumf3);

    vsumf0 = vec_add(vsumf0, vsumf1);

    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    // 0.125 = 1/8 factor belonging to the iq2_xxs scale encoding
    // (matches the factor applied by the generic implementation)
    *s = 0.125f * vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
1530
// Dot product of one row of IQ2_XS-quantized blocks (vx) with a row of
// Q8_K-quantized blocks (vy); the scalar result is written to *s.
// Unlike iq2_xxs, each 16-bit entry in x[i].qs packs a 9-bit grid index
// (low bits) and a 7-bit sign index (high bits), and block scales live in
// a separate x[i].scales array (two 4-bit scales per byte).
// POWER9/VSX SIMD path; other targets use the generic implementation.
void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq2_xs * GGML_RESTRICT x = vx;
    const block_q8_K   * GGML_RESTRICT y = vy;

    const int nb = n / QK_K; // number of super-blocks

#if defined(__POWER9_VECTOR__)
    const vector int v0 = vec_splats((int32_t)0);
    // four independent float accumulators, combined at the end
    vector float vsumf0 = vec_splats(0.0f);
    vector float vsumf1 = vec_splats(0.0f);
    vector float vsumf2 = vec_splats(0.0f);
    vector float vsumf3 = vec_splats(0.0f);

    // each 64-bit entry expands a 7-bit sign index into 8 bytes of +1/-1
    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;

    for (int i = 0; i < nb; ++i) {
        // combined per-super-block scale d = x[i].d * y[i].d
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
        vector float vyd = vec_splats(y[i].d);
        vector float vd = vec_mul(vxd, vyd);

        vector signed int vsumi0 = v0;
        vector signed int vsumi1 = v0;
        vector signed int vsumi2 = v0;
        vector signed int vsumi3 = v0;

        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
        const uint8_t  * GGML_RESTRICT sc = x[i].scales;
        const int8_t  *  GGML_RESTRICT q8 = y[i].qs;

        // 64 quantized values per iteration
        for (int j = 0; j < QK_K/64; ++j) {
            __builtin_prefetch(q2, 0, 1);
            __builtin_prefetch(q8, 0, 1);

            // low 9 bits of each q2 word index an 8-byte grid pattern
            vector signed long long aux64x2_0 = {*(const int64_t *)(iq2xs_grid + (q2[0] & 511)), *(const int64_t *)(iq2xs_grid + (q2[1] & 511))};
            vector signed long long aux64x2_1 = {*(const int64_t *)(iq2xs_grid + (q2[2] & 511)), *(const int64_t *)(iq2xs_grid + (q2[3] & 511))};
            vector signed long long aux64x2_2 = {*(const int64_t *)(iq2xs_grid + (q2[4] & 511)), *(const int64_t *)(iq2xs_grid + (q2[5] & 511))};
            vector signed long long aux64x2_3 = {*(const int64_t *)(iq2xs_grid + (q2[6] & 511)), *(const int64_t *)(iq2xs_grid + (q2[7] & 511))};

            // high 7 bits select the 8-byte +1/-1 sign pattern
            vector signed long long vsigns0 = {*(const int64_t *)(signs64 + ((q2[0] >> 9))), *(const int64_t *)(signs64 + ((q2[1] >> 9)))};
            vector signed long long vsigns1 = {*(const int64_t *)(signs64 + ((q2[2] >> 9))), *(const int64_t *)(signs64 + ((q2[3] >> 9)))};
            vector signed long long vsigns2 = {*(const int64_t *)(signs64 + ((q2[4] >> 9))), *(const int64_t *)(signs64 + ((q2[5] >> 9)))};
            vector signed long long vsigns3 = {*(const int64_t *)(signs64 + ((q2[6] >> 9))), *(const int64_t *)(signs64 + ((q2[7] >> 9)))};
            q2 += 8;

            // apply signs: byte-wise multiply by +1/-1
            vector signed char q2x0 = (vector signed char)vec_mul((vector signed char)vsigns0, (vector signed char)aux64x2_0);
            vector signed char q2x1 = (vector signed char)vec_mul((vector signed char)vsigns1, (vector signed char)aux64x2_1);
            vector signed char q2x2 = (vector signed char)vec_mul((vector signed char)vsigns2, (vector signed char)aux64x2_2);
            vector signed char q2x3 = (vector signed char)vec_mul((vector signed char)vsigns3, (vector signed char)aux64x2_3);

            vector signed char q8y0 = vec_xl( 0, q8);
            vector signed char q8y1 = vec_xl(16, q8);
            vector signed char q8y2 = vec_xl(32, q8);
            vector signed char q8y3 = vec_xl(48, q8);
            q8 += 64;

            // even+odd byte products -> 8 x int16 partial sums per vector
            vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0));
            vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1));
            vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2));
            vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3));

            // two 4-bit scales per byte, one per 16-value sub-block
            const uint16_t ls0 = (uint16_t)(sc[0] & 0xf);
            const uint16_t ls1 = (uint16_t)(sc[0] >>  4);
            const uint16_t ls2 = (uint16_t)(sc[1] & 0xf);
            const uint16_t ls3 = (uint16_t)(sc[1] >>  4);
            sc += 2;

            // scale mapping 2*ls+1, same as the generic path
            vector signed short vscales0 = vec_splats((int16_t)(2*ls0+1));
            vector signed short vscales1 = vec_splats((int16_t)(2*ls1+1));
            vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
            vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));

            vsumi0 = vec_msum(qv0, vscales0, vsumi0);
            vsumi1 = vec_msum(qv1, vscales1, vsumi1);
            vsumi2 = vec_msum(qv2, vscales2, vsumi2);
            vsumi3 = vec_msum(qv3, vscales3, vsumi3);
        }

        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
    }

    // horizontal reduction of the four accumulators down to a single lane
    vsumf0 = vec_add(vsumf0, vsumf2);
    vsumf1 = vec_add(vsumf1, vsumf3);

    vsumf0 = vec_add(vsumf0, vsumf1);

    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    // 0.125 = 1/8 factor belonging to the iq2_xs scale encoding
    *s = 0.125f * vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
1638
// Dot product of one row of IQ2_S-quantized blocks (vx) with a row of
// Q8_K-quantized blocks (vy); the scalar result is written to *s.
// Grid indices are 10 bits: 8 low bits from x[i].qs plus 2 high bits from
// x[i].qh.  Per-value sign bits are stored separately (one bit per value,
// starting at x[i].qs + QK_K/8) and are expanded to byte masks in-register.
// POWER9/VSX SIMD path; other targets use the generic implementation.
void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq2_s * GGML_RESTRICT x = vx;
    const block_q8_K  * GGML_RESTRICT y = vy;

    const int nb = n / QK_K; // number of super-blocks

#if defined(__POWER9_VECTOR__)
    // permute control: replicate sign byte b of the splatted 32-bit sign
    // word into lanes 8*b .. 8*b+7
    static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                                        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
    };

    // per-lane bit selector: lane k tests bit (k % 8) of its sign byte
    static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};

    const vector int v0 = vec_splats((int32_t)0);

    // four independent float accumulators, combined at the end
    vector float vsumf0 = vec_splats(0.0f);
    vector float vsumf1 = vec_splats(0.0f);
    vector float vsumf2 = vec_splats(0.0f);
    vector float vsumf3 = vec_splats(0.0f);

    const vector unsigned char mask0 = vec_xl( 0, k_mask1);
    const vector unsigned char mask1 = vec_xl(16, k_mask1);
    const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2);

    for (int i = 0; i < nb; ++i) {
        // combined per-super-block scale d = x[i].d * y[i].d
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
        vector float vyd = vec_splats(y[i].d);
        vector float vd = vec_mul(vxd, vyd);

        vector signed int vsumi0 = v0;
        vector signed int vsumi1 = v0;
        vector signed int vsumi2 = v0;
        vector signed int vsumi3 = v0;

        const uint8_t *  GGML_RESTRICT q2 = x[i].qs;
        const uint8_t *  GGML_RESTRICT qh = x[i].qh;
        // per-value sign bits follow the low grid-index bytes inside qs
        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
        const uint8_t *  GGML_RESTRICT sc = x[i].scales;
        const int8_t  *  GGML_RESTRICT q8 = y[i].qs;

        // 64 quantized values per iteration
        for (int j = 0; j < QK_K/32; j += 2) {
            __builtin_prefetch(q2, 0, 1);
            __builtin_prefetch(q8, 0, 1);

            // 10-bit grid index: 8 bits from q2, top 2 bits from qh
            vector signed long long aux64x2_0 = {*(const int64_t *)(iq2s_grid + (q2[0] | ((qh[0] << 8) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[1] | ((qh[0] << 6) & 0x300)))};
            vector signed long long aux64x2_1 = {*(const int64_t *)(iq2s_grid + (q2[2] | ((qh[0] << 4) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[3] | ((qh[0] << 2) & 0x300)))};
            vector signed long long aux64x2_2 = {*(const int64_t *)(iq2s_grid + (q2[4] | ((qh[1] << 8) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[5] | ((qh[1] << 6) & 0x300)))};
            vector signed long long aux64x2_3 = {*(const int64_t *)(iq2s_grid + (q2[6] | ((qh[1] << 4) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[7] | ((qh[1] << 2) & 0x300)))};
            q2 += 8;
            qh += 2;

            // splat 32 sign bits, then spread each sign byte across 8 lanes
            vector signed char vsigns01 = (vector signed char)vec_splats(*(const uint32_t *)&signs[0]);
            vector signed char vsigns23 = (vector signed char)vec_splats(*(const uint32_t *)&signs[2]);
            signs += 4;

            vector signed char vsigns0 = vec_perm(vsigns01, vsigns01, mask0);
            vector signed char vsigns1 = vec_perm(vsigns01, vsigns01, mask1);
            vector signed char vsigns2 = vec_perm(vsigns23, vsigns23, mask0);
            vector signed char vsigns3 = vec_perm(vsigns23, vsigns23, mask1);

            // turn each sign bit into an all-ones (negate) / all-zeros mask
            vsigns0 = (vector signed char)vec_cmpeq(vec_and(vsigns0, mask2), mask2);
            vsigns1 = (vector signed char)vec_cmpeq(vec_and(vsigns1, mask2), mask2);
            vsigns2 = (vector signed char)vec_cmpeq(vec_and(vsigns2, mask2), mask2);
            vsigns3 = (vector signed char)vec_cmpeq(vec_and(vsigns3, mask2), mask2);

            // conditional negate: (x ^ m) - m == -x where m is all-ones, x otherwise
            vector signed char q2x0 = vec_sub(vec_xor(vsigns0, (vector signed char)aux64x2_0), vsigns0);
            vector signed char q2x1 = vec_sub(vec_xor(vsigns1, (vector signed char)aux64x2_1), vsigns1);
            vector signed char q2x2 = vec_sub(vec_xor(vsigns2, (vector signed char)aux64x2_2), vsigns2);
            vector signed char q2x3 = vec_sub(vec_xor(vsigns3, (vector signed char)aux64x2_3), vsigns3);

            vector signed char q8y0 = vec_xl( 0, q8);
            vector signed char q8y1 = vec_xl(16, q8);
            vector signed char q8y2 = vec_xl(32, q8);
            vector signed char q8y3 = vec_xl(48, q8);
            q8 += 64;

            // even+odd byte products -> 8 x int16 partial sums per vector
            vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0));
            vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1));
            vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2));
            vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3));

            // two 4-bit scales per byte, one per 16-value sub-block
            const uint16_t ls0 = (uint16_t)(sc[0] & 0xf);
            const uint16_t ls1 = (uint16_t)(sc[0] >>  4);
            const uint16_t ls2 = (uint16_t)(sc[1] & 0xf);
            const uint16_t ls3 = (uint16_t)(sc[1] >>  4);
            sc += 2;

            // scale mapping 2*ls+1, same as the generic path
            vector signed short vscales0 = vec_splats((int16_t)(2*ls0+1));
            vector signed short vscales1 = vec_splats((int16_t)(2*ls1+1));
            vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
            vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));

            vsumi0 = vec_msum(qv0, vscales0, vsumi0);
            vsumi1 = vec_msum(qv1, vscales1, vsumi1);
            vsumi2 = vec_msum(qv2, vscales2, vsumi2);
            vsumi3 = vec_msum(qv3, vscales3, vsumi3);
        }

        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
    }

    // horizontal reduction of the four accumulators down to a single lane
    vsumf0 = vec_add(vsumf0, vsumf2);
    vsumf1 = vec_add(vsumf1, vsumf3);

    vsumf0 = vec_add(vsumf0, vsumf1);

    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    // 0.125 = 1/8 factor belonging to the iq2_s scale encoding
    *s = 0.125f * vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
1767
// Dot product of one row of IQ3_XXS-quantized blocks (vx) with a row of
// Q8_K-quantized blocks (vy); the scalar result is written to *s.
// Each byte of x[i].qs indexes a 32-bit (4-value) entry of iq3xxs_grid;
// sign indices and 4-bit block scales are packed into 32-bit words stored
// after the grid indices (at x[i].qs + QK_K/4).
// POWER9/VSX SIMD path; other targets use the generic implementation.
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq3_xxs * GGML_RESTRICT x = vx;
    const block_q8_K    * GGML_RESTRICT y = vy;

    const int nb = n / QK_K; // number of super-blocks

#if defined(__POWER9_VECTOR__)
    // each 64-bit entry expands a 7-bit sign index into 8 bytes of +1/-1
    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;

    const vector int v0 = vec_splats((int32_t)0);

    // four independent float accumulators, combined at the end
    vector float vsumf0 = vec_splats(0.0f);
    vector float vsumf1 = vec_splats(0.0f);
    vector float vsumf2 = vec_splats(0.0f);
    vector float vsumf3 = vec_splats(0.0f);

    for (int i = 0; i < nb; ++i) {
        // combined per-super-block scale d = x[i].d * y[i].d
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
        vector float vyd = vec_splats(y[i].d);
        vector float vd = vec_mul(vxd, vyd);

        vector signed int vsumi0 = v0;
        vector signed int vsumi1 = v0;
        vector signed int vsumi2 = v0;
        vector signed int vsumi3 = v0;

        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
        // sign/scale words follow the grid-index bytes inside qs
        const uint32_t * GGML_RESTRICT signs = (const uint32_t *)(x[i].qs + QK_K/4);
        const int8_t  * GGML_RESTRICT q8 = y[i].qs;

// keep the loop body compact: unrolling it bloats register pressure
#pragma GCC unroll 1
        for (int j = 0; j < QK_K/32; j += 2) {
            __builtin_prefetch(q3, 0, 1);
            __builtin_prefetch(q8, 0, 1);

            // gather 4-byte grid patterns, four per 16-byte vector
            vector unsigned int aux32x4_0 = {iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]};
            vector unsigned int aux32x4_1 = {iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]};
            vector unsigned int aux32x4_2 = {iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]};
            vector unsigned int aux32x4_3 = {iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]};
            q3 += 16;

            // gather the matching +1/-1 sign bytes (7-bit index per 8 values)
            vector unsigned long long aux64x2_0 = {(uint64_t)(signs64[(signs[0] >>  0) & 127]), (uint64_t)(signs64[(signs[0] >>  7) & 127])};
            vector unsigned long long aux64x2_1 = {(uint64_t)(signs64[(signs[0] >> 14) & 127]), (uint64_t)(signs64[(signs[0] >> 21) & 127])};
            vector unsigned long long aux64x2_2 = {(uint64_t)(signs64[(signs[1] >>  0) & 127]), (uint64_t)(signs64[(signs[1] >>  7) & 127])};
            vector unsigned long long aux64x2_3 = {(uint64_t)(signs64[(signs[1] >> 14) & 127]), (uint64_t)(signs64[(signs[1] >> 21) & 127])};

            // apply signs: byte-wise multiply by +1/-1
            vector signed char q3x0 = vec_mul((vector signed char)aux64x2_0, (vector signed char)aux32x4_0);
            vector signed char q3x1 = vec_mul((vector signed char)aux64x2_1, (vector signed char)aux32x4_1);
            vector signed char q3x2 = vec_mul((vector signed char)aux64x2_2, (vector signed char)aux32x4_2);
            vector signed char q3x3 = vec_mul((vector signed char)aux64x2_3, (vector signed char)aux32x4_3);

            vector signed char q8y0 = vec_xl( 0, q8);
            vector signed char q8y1 = vec_xl(16, q8);
            vector signed char q8y2 = vec_xl(32, q8);
            vector signed char q8y3 = vec_xl(48, q8);
            q8 += 64;

            // even+odd byte products -> 8 x int16 partial sums per vector
            vector signed short qv0 = vec_add(vec_mule(q3x0, q8y0), vec_mulo(q3x0, q8y0));
            vector signed short qv1 = vec_add(vec_mule(q3x1, q8y1), vec_mulo(q3x1, q8y1));
            vector signed short qv2 = vec_add(vec_mule(q3x2, q8y2), vec_mulo(q3x2, q8y2));
            vector signed short qv3 = vec_add(vec_mule(q3x3, q8y3), vec_mulo(q3x3, q8y3));

            // 4-bit block scales stored in the top nibble of the sign words
            const uint16_t ls0 = (uint16_t)(signs[0] >> 28);
            const uint16_t ls1 = (uint16_t)(signs[1] >> 28);
            signs += 2;

            // scale mapping 2*ls+1, same as the generic path
            vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
            vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));

            vsumi0 = vec_msum(qv0, vscales01, vsumi0);
            vsumi1 = vec_msum(qv1, vscales01, vsumi1);
            vsumi2 = vec_msum(qv2, vscales23, vsumi2);
            vsumi3 = vec_msum(qv3, vscales23, vsumi3);
        }

        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
    }

    // horizontal reduction of the four accumulators down to a single lane
    vsumf0 = vec_add(vsumf0, vsumf2);
    vsumf1 = vec_add(vsumf1, vsumf3);

    vsumf0 = vec_add(vsumf0, vsumf1);

    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    // 0.25 = 1/4 factor belonging to the iq3_xxs scale encoding
    // (matches the factor applied by the generic implementation)
    *s = 0.25f * vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
1873
// Dot product of one row of IQ3_S-quantized blocks (vx) with a row of
// Q8_K-quantized blocks (vy); the scalar result is written to *s.
// Grid indices are 9 bits: 8 low bits from x[i].qs plus 1 high bit from
// x[i].qh.  Per-value sign bits live in x[i].signs and are expanded to
// byte masks in-register (same scheme as iq2_s).  Unlike iq2_*/iq3_xxs,
// the result carries no extra constant factor.
// POWER9/VSX SIMD path; other targets use the generic implementation.
void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq3_s * GGML_RESTRICT x = vx;
    const block_q8_K  * GGML_RESTRICT y = vy;

    const int nb = n / QK_K; // number of super-blocks

#if defined(__POWER9_VECTOR__)
    // permute control: replicate sign byte b of the splatted 32-bit sign
    // word into lanes 8*b .. 8*b+7
    static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                                        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
    };

    // per-lane bit selector: lane k tests bit (k % 8) of its sign byte
    static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};

    const vector int v0 = vec_splats((int32_t)0);

    // four independent float accumulators, combined at the end
    vector float vsumf0 = vec_splats(0.0f);
    vector float vsumf1 = vec_splats(0.0f);
    vector float vsumf2 = vec_splats(0.0f);
    vector float vsumf3 = vec_splats(0.0f);

    const vector unsigned char mask0 = vec_xl( 0, k_mask1);
    const vector unsigned char mask1 = vec_xl(16, k_mask1);
    const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2);

    for (int i = 0; i < nb; ++i) {
        // combined per-super-block scale d = x[i].d * y[i].d
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
        vector float vyd = vec_splats(y[i].d);
        vector float vd = vec_mul(vxd, vyd);

        const uint8_t *  GGML_RESTRICT q3 = x[i].qs;
        const uint8_t *  GGML_RESTRICT qh = x[i].qh;
        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].signs);
        const uint8_t *  GGML_RESTRICT sc = x[i].scales;
        const int8_t  *  GGML_RESTRICT q8 = y[i].qs;

        vector signed int vsumi0 = v0;
        vector signed int vsumi1 = v0;
        vector signed int vsumi2 = v0;
        vector signed int vsumi3 = v0;

        // 64 quantized values per iteration
        for (int j = 0; j < QK_K/32; j += 2) {
            __builtin_prefetch(q3, 0, 1);
            __builtin_prefetch(q8, 0, 1);

            // 9-bit grid index: 8 bits from q3, 9th bit from qh
            vector unsigned int aux32x4_0 = {iq3s_grid[q3[ 0] | ((qh[0] << 8) & 256)], iq3s_grid[q3[ 1] | ((qh[0] << 7) & 256)],
                                             iq3s_grid[q3[ 2] | ((qh[0] << 6) & 256)], iq3s_grid[q3[ 3] | ((qh[0] << 5) & 256)]};
            vector unsigned int aux32x4_1 = {iq3s_grid[q3[ 4] | ((qh[0] << 4) & 256)], iq3s_grid[q3[ 5] | ((qh[0] << 3) & 256)],
                                             iq3s_grid[q3[ 6] | ((qh[0] << 2) & 256)], iq3s_grid[q3[ 7] | ((qh[0] << 1) & 256)]};
            vector unsigned int aux32x4_2 = {iq3s_grid[q3[ 8] | ((qh[1] << 8) & 256)], iq3s_grid[q3[ 9] | ((qh[1] << 7) & 256)],
                                             iq3s_grid[q3[10] | ((qh[1] << 6) & 256)], iq3s_grid[q3[11] | ((qh[1] << 5) & 256)]};
            vector unsigned int aux32x4_3 = {iq3s_grid[q3[12] | ((qh[1] << 4) & 256)], iq3s_grid[q3[13] | ((qh[1] << 3) & 256)],
                                             iq3s_grid[q3[14] | ((qh[1] << 2) & 256)], iq3s_grid[q3[15] | ((qh[1] << 1) & 256)]};
            q3 += 16;
            qh += 2;

            // splat 32 sign bits, then spread each sign byte across 8 lanes
            vector signed char vsigns01 = (vector signed char)vec_splats(*(const uint32_t *)&signs[0]);
            vector signed char vsigns02 = (vector signed char)vec_splats(*(const uint32_t *)&signs[2]);
            signs += 4;

            vector signed char vsigns0 = vec_perm(vsigns01, vsigns01, mask0);
            vector signed char vsigns1 = vec_perm(vsigns01, vsigns01, mask1);
            vector signed char vsigns2 = vec_perm(vsigns02, vsigns02, mask0);
            vector signed char vsigns3 = vec_perm(vsigns02, vsigns02, mask1);

            // turn each sign bit into an all-ones (negate) / all-zeros mask
            vsigns0 = (vector signed char)vec_cmpeq(vec_and(vsigns0, mask2), mask2);
            vsigns1 = (vector signed char)vec_cmpeq(vec_and(vsigns1, mask2), mask2);
            vsigns2 = (vector signed char)vec_cmpeq(vec_and(vsigns2, mask2), mask2);
            vsigns3 = (vector signed char)vec_cmpeq(vec_and(vsigns3, mask2), mask2);

            // conditional negate: (x ^ m) - m == -x where m is all-ones, x otherwise
            vector signed char q3x0 = vec_sub(vec_xor(vsigns0, (vector signed char)aux32x4_0), vsigns0);
            vector signed char q3x1 = vec_sub(vec_xor(vsigns1, (vector signed char)aux32x4_1), vsigns1);
            vector signed char q3x2 = vec_sub(vec_xor(vsigns2, (vector signed char)aux32x4_2), vsigns2);
            vector signed char q3x3 = vec_sub(vec_xor(vsigns3, (vector signed char)aux32x4_3), vsigns3);

            vector signed char q8y0 = vec_xl( 0, q8);
            vector signed char q8y1 = vec_xl(16, q8);
            vector signed char q8y2 = vec_xl(32, q8);
            vector signed char q8y3 = vec_xl(48, q8);
            q8 += 64;

            // even+odd byte products -> 8 x int16 partial sums per vector
            vector signed short qv0 = vec_add(vec_mule(q3x0, q8y0), vec_mulo(q3x0, q8y0));
            vector signed short qv1 = vec_add(vec_mule(q3x1, q8y1), vec_mulo(q3x1, q8y1));
            vector signed short qv2 = vec_add(vec_mule(q3x2, q8y2), vec_mulo(q3x2, q8y2));
            vector signed short qv3 = vec_add(vec_mule(q3x3, q8y3), vec_mulo(q3x3, q8y3));

            // two 4-bit scales per byte, one per 32-value half
            const uint16_t ls0 = (uint16_t)(sc[0] & 0xf);
            const uint16_t ls1 = (uint16_t)(sc[0] >>  4);
            sc ++;

            // scale mapping 2*ls+1, same as the generic path
            vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
            vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));

            vsumi0 = vec_msum(qv0, vscales01, vsumi0);
            vsumi1 = vec_msum(qv1, vscales01, vsumi1);
            vsumi2 = vec_msum(qv2, vscales23, vsumi2);
            vsumi3 = vec_msum(qv3, vscales23, vsumi3);
        }

        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
    }

    // horizontal reduction of the four accumulators down to a single lane
    vsumf0 = vec_add(vsumf0, vsumf2);
    vsumf1 = vec_add(vsumf1, vsumf3);

    vsumf0 = vec_add(vsumf0, vsumf1);

    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    // no constant factor for iq3_s (grid values are stored unscaled)
    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
2002
// Dot product of an IQ1_S-quantized row (vx) with a Q8_K-quantized row (vy):
// writes sum_i dequant(x_i) * dequant(y_i) into *s.
// POWER9 VSX fast path; all other targets fall through to the generic kernel.
void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1); // this kernel handles exactly one row pair per call
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_iq1_s * GGML_RESTRICT x = vx;
    const block_q8_K  * GGML_RESTRICT y = vy;

    const int nb = n / QK_K; // number of super-blocks

#if defined(__POWER9_VECTOR__)
    const vector unsigned char v0 = vec_splats((unsigned char)0x0);
    const vector unsigned short vsign = vec_splats((unsigned short)0x8000); // int16 sign-bit mask

    // Four partial float accumulators; reduced to a single scalar after the loop.
    vector float vsumf0 = vec_splats(0.0f);
    vector float vsumf1 = vec_splats(0.0f);
    vector float vsumf2 = vec_splats(0.0f);
    vector float vsumf3 = vec_splats(0.0f);

    for (int i = 0; i < nb; ++i) {
        // Combined super-block scale d = d_x * d_y.
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
        vector float vyd = vec_splats(y[i].d);
        vector float vd = vec_mul(vxd, vyd);

        vector signed int vsumi0 = vec_splats((int32_t)0);
        vector signed int vsumi1 = vec_splats((int32_t)0);
        vector signed int vsumi2 = vec_splats((int32_t)0);
        vector signed int vsumi3 = vec_splats((int32_t)0);
        vector signed int vsumi8 = vec_splats((int32_t)0); // accumulator for the IQ1S_DELTA correction term

        const uint8_t  * GGML_RESTRICT q1 = x[i].qs;    // low 8 bits of the grid indices
        const uint16_t * GGML_RESTRICT qh = x[i].qh;    // high index bits + block scale + delta-sign flag
        const int8_t   * GGML_RESTRICT q8 = y[i].qs;    // q8 quants
        const int16_t  * GGML_RESTRICT qs = y[i].bsums; // per-16-element q8 sums

        // Process two 32-element blocks (one qh word each) per iteration.
        for (int j = 0; j < QK_K/32; j += 2) {
            __builtin_prefetch(q1, 0, 1);
            __builtin_prefetch(qh, 0, 1);
            __builtin_prefetch(q8, 0, 1);

            // Each index is 11 bits: 8 from q1[] plus 3 from the matching qh
            // field; it selects a uint64_t iq1s_grid entry that is consumed
            // below as 8 signed byte values.
            vector signed long long aux64x2_0 = {*(const int64_t *)(iq1s_grid + (q1[0] | ((qh[0] << 8) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[1] | ((qh[0] << 5) & 0x700)))};
            vector signed long long aux64x2_1 = {*(const int64_t *)(iq1s_grid + (q1[2] | ((qh[0] << 2) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[3] | ((qh[0] >> 1) & 0x700)))};
            vector signed long long aux64x2_2 = {*(const int64_t *)(iq1s_grid + (q1[4] | ((qh[1] << 8) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[5] | ((qh[1] << 5) & 0x700)))};
            vector signed long long aux64x2_3 = {*(const int64_t *)(iq1s_grid + (q1[6] | ((qh[1] << 2) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[7] | ((qh[1] >> 1) & 0x700)))};
            q1 += 8;

            // Reinterpret the grid entries as 16 signed bytes per vector.
            vector signed char q1x0 = (vector signed char)aux64x2_0;
            vector signed char q1x1 = (vector signed char)aux64x2_1;
            vector signed char q1x2 = (vector signed char)aux64x2_2;
            vector signed char q1x3 = (vector signed char)aux64x2_3;

            vector signed char q8y0 = vec_xl( 0, q8);
            vector signed char q8y1 = vec_xl(16, q8);
            vector signed char q8y2 = vec_xl(32, q8);
            vector signed char q8y3 = vec_xl(48, q8);
            q8 += 64;

            // 16-bit products: vec_mule/vec_mulo multiply even/odd byte lanes,
            // so their sum covers all 16 byte pairs per vector.
            vector signed short qv0 = vec_add(vec_mule(q1x0, q8y0), vec_mulo(q1x0, q8y0));
            vector signed short qv1 = vec_add(vec_mule(q1x1, q8y1), vec_mulo(q1x1, q8y1));
            vector signed short qv2 = vec_add(vec_mule(q1x2, q8y2), vec_mulo(q1x2, q8y2));
            vector signed short qv3 = vec_add(vec_mule(q1x3, q8y3), vec_mulo(q1x3, q8y3));

            // 3-bit block scales live in qh bits 12..14; applied as (2*ls + 1).
            const uint16_t ls0 = (uint16_t)((qh[0] >> 12) & 7);
            const uint16_t ls1 = (uint16_t)((qh[1] >> 12) & 7);

            vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
            vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
            // Concatenated halves [scales23 | scales01] for the bsum term below.
            vector signed short vscales = vec_sld(vscales23, vscales01, 8);

            vsumi0 = vec_msum(qv0, vscales01, vsumi0);
            vsumi1 = vec_msum(qv1, vscales01, vsumi1);
            vsumi2 = vec_msum(qv2, vscales23, vsumi2);
            vsumi3 = vec_msum(qv3, vscales23, vsumi3);

            // Load the four relevant 16-bit bsums and spread them into the
            // even lanes (odd lanes zeroed).
            vector signed short q8ysums = vec_xl_len(qs, 8);
            qs += 4;
            q8ysums = vec_mergeh(q8ysums, (vector signed short)v0);

            // Splat the qh words so their top bit (the per-block delta-sign
            // flag) can be tested lane-wise as an int16 sign.
            vector signed short qxh = (vector signed short)vec_sld(vec_splats(qh[1]), vec_splats(qh[0]), 8);
            qh += 2;
            vector __bool short vsel = vec_cmpge(qxh, (vector signed short)v0);

            // NOTE(review): the XOR flips only the int16 sign bit (x ^ 0x8000),
            // which is not two's-complement negation — presumably intended to
            // realize -bsum for blocks whose qh sign flag is set; confirm
            // against the generic IQ1S_DELTA handling.
            vector signed short q8ysum = vec_sel((vector signed short)vec_xor((vector unsigned short)q8ysums, vsign), q8ysums, vsel);

            // Even-lane products bsum * scale accumulate into the delta term.
            vsumi8 = vec_add(vec_mule(q8ysum, vscales), vsumi8);
        }

        // Fold the integer accumulators into the float sums, scaled by d.
        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);

        // The bsum correction enters scaled by d * IQ1S_DELTA.
        vsumf0 = vec_madd(vec_ctf(vsumi8, 0), vec_mul(vd, vec_splats(IQ1S_DELTA)), vsumf0);
    }

    // Pairwise reduction of the four partial accumulators into one vector.
    vsumf0 = vec_add(vsumf0, vsumf2);
    vsumf1 = vec_add(vsumf1, vsumf3);

    vsumf0 = vec_add(vsumf0, vsumf1);

    // Horizontal add across the four float lanes via byte rotations.
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
2118
// Dot product of an IQ4_NL-quantized row (vx) with a Q8_0-quantized row (vy):
// writes sum_i dequant(x_i) * dequant(y_i) into *s.
// POWER9 VSX fast path; all other targets fall through to the generic kernel.
void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1); // this kernel handles exactly one row pair per call
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK4_NL == 0);
    static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");

    const block_iq4_nl * GGML_RESTRICT x = vx;
    const block_q8_0   * GGML_RESTRICT y = vy;

    const int nb = n / QK4_NL; // number of 32-element blocks

    int ib = 0;
    float sumf = 0;

#if defined(__POWER9_VECTOR__)
    const vector signed char lowMask = vec_splats((signed char)0xF); // low-nibble mask
    const vector signed int v0 = vec_splats((int32_t)0);
    const vector unsigned char v4 = vec_splats((unsigned char)0x4);  // shift count to expose high nibbles

    vector float vsumf0 = vec_splats(0.0f);
    vector float vsumf1 = vec_splats(0.0f);

    // 16-entry dequantization LUT; vec_perm below uses each nibble as an index.
    const vector signed char values = vec_xl( 0, kvalues_iq4nl);

#pragma GCC unroll 4
    for (; ib < nb; ++ib) {
        __builtin_prefetch(x[ib].qs, 0, 1);
        __builtin_prefetch(y[ib].qs, 0, 1);


        // Combined block scale d = d_x * d_y.
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
        vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
        vector float vd = vec_mul(vxd, vyd);

        // Split the 16 packed bytes into low and high nibbles.
        vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
        vector signed char q4x0 = vec_and(qxs, lowMask);
        vector signed char q4x1 = vec_sr(qxs, v4);

        // Map nibble indices to signed values through the LUT.
        q4x0 = vec_perm(values, values, (vector unsigned char)q4x0);
        q4x1 = vec_perm(values, values, (vector unsigned char)q4x1);

        vector signed char q8y0 = vec_xl( 0, y[ib].qs);
        vector signed char q8y1 = vec_xl(16, y[ib].qs);

        // 16-bit products from even/odd byte lanes cover all 16 pairs.
        vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
        vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));

        vector signed int vsumi0 = v0;
        vector signed int vsumi1 = v0;

        // Sum groups of four int16 products into int32 lanes.
        vsumi0 = vec_sum4s(qv0, vsumi0);
        vsumi1 = vec_sum4s(qv1, vsumi1);

        // Fold into the float accumulators, scaled by d.
        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
    }

    vsumf0 = vec_add(vsumf0, vsumf1);

    // Horizontal add across the four float lanes via byte rotations.
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    sumf = vec_extract(vsumf0, 0);

    *s = sumf;
#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
2196
// Dot product of an IQ4_XS-quantized row (vx) with a Q8_K-quantized row (vy):
// writes sum_i dequant(x_i) * dequant(y_i) into *s.
// POWER9 VSX fast path; all other targets fall through to the generic kernel.
void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1); // this kernel handles exactly one row pair per call
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK_K == 0);

    const block_iq4_xs * GGML_RESTRICT x = vx;
    const block_q8_K   * GGML_RESTRICT y = vy;

    const int nb = n / QK_K; // number of super-blocks

#if defined(__POWER9_VECTOR__)
    const vector signed char lowMask = vec_splats((signed char)0xF); // low-nibble mask
    const vector int v0 = vec_splats((int32_t)0);
    const vector unsigned char v4 = vec_splats((unsigned char)0x4); // shift count to expose high nibbles

    // Four partial float accumulators; reduced to a single scalar after the loop.
    vector float vsumf0 = vec_splats(0.0f);
    vector float vsumf1 = vec_splats(0.0f);
    vector float vsumf2 = vec_splats(0.0f);
    vector float vsumf3 = vec_splats(0.0f);

    // 16-entry dequantization LUT (shared with IQ4_NL); vec_perm uses each
    // nibble as an index into it.
    const vector signed char values = vec_xl( 0, kvalues_iq4nl);

    for (int ibl = 0; ibl < nb; ++ibl) {

        // Combined super-block scale d = d_x * d_y.
        vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ibl].d));
        vector float vyd = vec_splats(y[ibl].d);
        vector float vd = vec_mul(vxd, vyd);

        vector signed int vsumi0 = v0;
        vector signed int vsumi1 = v0;
        vector signed int vsumi2 = v0;
        vector signed int vsumi3 = v0;

        // Rolling copy of the packed high scale bits; consumed 4 bits (two
        // 2-bit fields) per inner iteration.
        uint16_t h = x[ibl].scales_h;

        const uint8_t * GGML_RESTRICT q4 = x[ibl].qs;       // packed 4-bit quants
        const uint8_t * GGML_RESTRICT sc = x[ibl].scales_l; // packed low scale nibbles
        const int8_t  * GGML_RESTRICT q8 = y[ibl].qs;       // q8 quants

        // Two 32-element blocks per iteration.
        for (int ib = 0; ib < QK_K/64; ib ++ ) {
            __builtin_prefetch(q4, 0, 1);
            __builtin_prefetch(q8, 0, 1);

            vector signed char qxs0 = (vector signed char)vec_xl( 0, q4);
            vector signed char qxs1 = (vector signed char)vec_xl(16, q4);
            q4 += 32;

            // Split each packed byte into low and high nibbles.
            vector signed char q4x00 = (vector signed char)vec_and(qxs0, lowMask);
            vector signed char q4x01 = (vector signed char)vec_sr(qxs0, v4);
            vector signed char q4x10 = (vector signed char)vec_and(qxs1, lowMask);
            vector signed char q4x11 = (vector signed char)vec_sr(qxs1, v4);

            // Map nibble indices to signed values through the LUT.
            q4x00 = vec_perm(values, values, (vector unsigned char)q4x00);
            q4x01 = vec_perm(values, values, (vector unsigned char)q4x01);
            q4x10 = vec_perm(values, values, (vector unsigned char)q4x10);
            q4x11 = vec_perm(values, values, (vector unsigned char)q4x11);

            vector signed char q8y0 = vec_xl( 0, q8);
            vector signed char q8y1 = vec_xl(16, q8);
            vector signed char q8y2 = vec_xl(32, q8);
            vector signed char q8y3 = vec_xl(48, q8);
            q8 += 64;

            // 16-bit products from even/odd byte lanes cover all 16 pairs.
            vector signed short qv0 = vec_add(vec_mule(q4x00, q8y0), vec_mulo(q4x00, q8y0));
            vector signed short qv1 = vec_add(vec_mule(q4x01, q8y1), vec_mulo(q4x01, q8y1));
            vector signed short qv2 = vec_add(vec_mule(q4x10, q8y2), vec_mulo(q4x10, q8y2));
            vector signed short qv3 = vec_add(vec_mule(q4x11, q8y3), vec_mulo(q4x11, q8y3));

            // 6-bit block scale: 4 low bits from scales_l, 2 high bits from h,
            // minus a bias of 32.  The subtraction wraps in uint16_t; the
            // (int16_t) cast below reinterprets it as the intended signed
            // scale in [-32, 31].
            const uint16_t ls0 = (uint16_t)(((sc[0] & 0xf) | ((h << 4) & 0x30)) - 32);
            const uint16_t ls1 = (uint16_t)(((sc[0] >>  4) | ((h << 2) & 0x30)) - 32);
            h >>= 4;
            sc ++;

            vector signed short vscales01 = vec_splats((int16_t)ls0);
            vector signed short vscales23 = vec_splats((int16_t)ls1);

            // Multiply-sum the int16 products by the block scales into int32.
            vsumi0 = vec_msum(qv0, vscales01, vsumi0);
            vsumi1 = vec_msum(qv1, vscales01, vsumi1);
            vsumi2 = vec_msum(qv2, vscales23, vsumi2);
            vsumi3 = vec_msum(qv3, vscales23, vsumi3);
        }

        // Fold the integer accumulators into the float sums, scaled by d.
        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
    }

    // Pairwise reduction of the four partial accumulators into one vector.
    vsumf0 = vec_add(vsumf0, vsumf2);
    vsumf1 = vec_add(vsumf1, vsumf3);

    vsumf0 = vec_add(vsumf0, vsumf1);

    // Horizontal add across the four float lanes via byte rotations.
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));

    *s = vec_extract(vsumf0, 0);

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
2305