#define GGML_COMMON_IMPL_C
#include "ggml-common.h"
#include "ggml-quants.h"
#include "ggml-impl.h"
#include "ggml-cpu.h"
#include "simd-mappings.h"

#include "../../quants.h"
#include "../../ggml-cpu-impl.h"

#include <math.h>
#include <string.h>
#include <assert.h>
#include <float.h>
#include <stdlib.h> // for qsort
#include <stdio.h>  // for GGML_ASSERT

#define GROUP_MAX_EPS 1e-15f
#define GROUP_MAX_EPS_IQ3_XXS 1e-8f
#define GROUP_MAX_EPS_IQ2_S 1e-8f
#define GROUP_MAX_EPS_IQ1_M 1e-7f
#define GROUP_MAX_EPS_IQ1_S 1e-12f

#define UNUSED GGML_UNUSED

#if defined(__VXE__) || defined(__VXE2__)
#define B1(c,s,n)  0x ## n ## c ,  0x ## n ## s
#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s)
#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s)
#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s)
#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s)
#define B8(c,s  ) B7(c,s,     c), B7(c,s,     s)

// precomputed tables for expanding 8 bits to 8 bytes:
static const __attribute__((aligned(16))) uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b ) << 4
static const __attribute__((aligned(16))) uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
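//
// A worked example (illustrative): bit k of the 8-bit index selects byte k of
// the u64 entry, counting from the least significant byte, e.g.
//   table_b2b_0[0x01] == 0x0000000000000010   // ( b ) << 4 applied to each bit
//   table_b2b_1[0x01] == 0x1010101010101000   // (!b) << 4 applied to each bit
// The entries are u64 values, so a byte-wise vec_xl of them on big-endian
// s390x arrives most-significant byte first; v_kperm below repairs the order.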

// permute mask for byteswapping
static const uint8x16_t v_kperm = (const uint8x16_t){
     7,  6,  5,  4,  3,  2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8
};
#endif

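// quantize_row_q8_0 (VXE path): each block of QK8_0 == 32 floats is stored as
// 32 int8 quants with one fp16 scale, d = max|x| / 127 and q = round(x / d).
// The block max is found by a pairwise vec_max reduction over eight f32x4
// registers; quantize_row_q8_0_ref is the scalar reference for this layout.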
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    assert(QK8_0 == 32);
    assert(k % QK8_0 == 0);
    const int nb = k / QK8_0;

    block_q8_0 * GGML_RESTRICT y = vy;

#if defined(__VXE__) || defined(__VXE2__)
    for (int i = 0; i < nb; i++) {
        float32x4_t srcv [8];
        float32x4_t asrcv[8];
        float32x4_t amaxv[8];

        for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
        for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
        for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
        for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
        for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);

        const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
                                   vec_extract(amaxv[0], 1)),
                               MAX(vec_extract(amaxv[0], 2),
                                   vec_extract(amaxv[0], 3)));

        const float d = amax / ((1 << 7) - 1);
        const float id = d ? 1.0f / d : 0.0f;

        y[i].d = GGML_CPU_FP32_TO_FP16(d);

        for (int j = 0; j < 8; j++) {
            const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
            /* Uses non-default rounding for vec_signed or vec_round */
            const int32x4_t vi = vec_signed(__builtin_s390_vfisb(v, 4, 1));

            y[i].qs[4*j + 0] = vec_extract(vi, 0);
            y[i].qs[4*j + 1] = vec_extract(vi, 1);
            y[i].qs[4*j + 2] = vec_extract(vi, 2);
            y[i].qs[4*j + 3] = vec_extract(vi, 3);
        }
    }
#else
    GGML_UNUSED(nb);
    // scalar
    quantize_row_q8_0_ref(x, y, k);
#endif
}

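// quantize_row_q8_1: same scheme as q8_0, but each block additionally stores
// s = d * sum(q), which lets the q4_1/q5_1 dot products fold their per-block
// offset m into a single multiply (see ggml_vec_dot_q4_1_q8_1 below).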
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    assert(k % QK8_1 == 0);
    const int nb = k / QK8_1;

    block_q8_1 * GGML_RESTRICT y = vy;

#if defined(__VXE__) || defined(__VXE2__)
    for (int i = 0; i < nb; i++) {
        float32x4_t srcv [8];
        float32x4_t asrcv[8];
        float32x4_t amaxv[8];

        for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
        for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
        for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
        for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
        for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);

        const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
                                   vec_extract(amaxv[0], 1)),
                               MAX(vec_extract(amaxv[0], 2),
                                   vec_extract(amaxv[0], 3)));

        const float d = amax / ((1 << 7) - 1);
        const float id = d ? 1.0f / d : 0.0f;

        y[i].d = GGML_CPU_FP32_TO_FP16(d);

        int32x4_t acc = vec_splats(0);

        for (int j = 0; j < 8; j++) {
            const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
            /* Uses non-default rounding for vec_signed or vec_round */
            const int32x4_t vi = vec_signed(__builtin_s390_vfisb(v, 4, 1));

            y[i].qs[4*j + 0] = vec_extract(vi, 0);
            y[i].qs[4*j + 1] = vec_extract(vi, 1);
            y[i].qs[4*j + 2] = vec_extract(vi, 2);
            y[i].qs[4*j + 3] = vec_extract(vi, 3);

            acc = vec_add(acc, vi);
        }

        y[i].s = GGML_CPU_FP32_TO_FP16(d * (acc[0] + acc[1] + acc[2] + acc[3]));
    }
#else
    GGML_UNUSED(nb);
    // scalar
    quantize_row_q8_1_ref(x, y, k);
#endif
}


//===================================== Dot products =================================

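// q4_0 * q8_0: each q4_0 byte packs two 4-bit quants (elements j and j+16 in
// the low and high nibble); the dequantized value is (q - 8) * d. The offset
// is subtracted up front (v_s) so the widening odd/even multiplies can be
// summed directly and scaled once per block by d_x * d_y.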
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q4_0 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    int ib = 0;
    float sumf = 0;

#if defined(__VXE__) || defined(__VXE2__)
    float32x4_t acc = vec_splats(0.0f);

    const uint8x16_t v_m = vec_splats((const uint8_t)0x0F);
    const int8x16_t  v_s = vec_splats( (const int8_t)0x08);

    for (; ib < nb; ++ib) {
        const uint8x16_t v_x = vec_xl(0, x[ib].qs);
        const int8x16_t v_xl = (const int8x16_t)(v_x & v_m);
        const int8x16_t v_xh = (const int8x16_t)(v_x >> 4);

        const int8x16_t v_xls = vec_sub(v_xl, v_s);
        const int8x16_t v_xhs = vec_sub(v_xh, v_s);

        const int8x16_t v_yl = vec_xl(0      , y[ib].qs);
        const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs);

        const int16x8_t v_xylso = vec_mulo(v_xls, v_yl);
        const int16x8_t v_xylse = vec_mule(v_xls, v_yl);
        const int16x8_t v_xyhso = vec_mulo(v_xhs, v_yh);
        const int16x8_t v_xyhse = vec_mule(v_xhs, v_yh);

        int16x8_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse;
        v_xy_ += vec_reve(v_xy_);

        const float32x4_t v_xy = vec_float(vec_unpackh(v_xy_));
        const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));

        acc = vec_madd(v_xy, v_d, acc);
    }

    sumf = vec_hsum_f32x4(acc);
    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q4_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}

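// q4_1 * q8_1: the dequantized value is q * d + m. Expanding the dot product
// gives sum(d*q * y) + m * sum(y); the second term equals m * y->s (see
// quantize_row_q8_1), accumulated in summs, so only the first term needs
// SIMD work.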
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_1;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q4_1 * GGML_RESTRICT x = vx;
    const block_q8_1 * GGML_RESTRICT y = vy;

    int ib = 0;
    float sumf = 0;

#if defined(__VXE__) || defined(__VXE2__)
    float summs = 0;
    float32x4_t acc = vec_splats(0.0f);

    const uint8x16_t v_m = vec_splat_u8(0x0F);

#pragma GCC unroll 4
    for (; ib < nb; ++ib) {
        __builtin_prefetch(x[ib].qs, 0, 1);
        __builtin_prefetch(y[ib].qs, 0, 1);

        summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);

        const uint8x16_t v_x = vec_xl(0, x[ib].qs);
        const int8x16_t v_xl = (const int8x16_t)(v_x & v_m);
        const int8x16_t v_xh = (const int8x16_t)(v_x >> 4);

        const int8x16_t v_yl = vec_xl(0      , y[ib].qs);
        const int8x16_t v_yh = vec_xl(QK8_1/2, y[ib].qs);

        const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
        const float32x4_t v_xy = vec_float(v_xy_);

        const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));

        acc = vec_madd(v_xy, v_d, acc);
    }

    sumf = vec_hsum_f32x4(acc) + summs;
    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}

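// mxfp4 * q8_0: the 4-bit codes index the kvalues_mxfp4 LUT, implemented as a
// 16-byte table lookup with vec_perm; the per-block scale is an E8M0 exponent
// byte decoded via GGML_E8M0_TO_FP32_HALF. Blocks are processed in pairs,
// with a single-block tail loop for an odd count.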
void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK_MXFP4 == 0);
    static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");

    const int qk = QK_MXFP4;
    const int nb = n / qk;

    const block_mxfp4 * GGML_RESTRICT x = vx;
    const block_q8_0  * GGML_RESTRICT y = vy;

    int ib = 0;
    float sumf = 0.0f;

#if defined(__VXE__) || defined(__VXE2__)
    const int8x16_t  v_k = vec_xl(0, kvalues_mxfp4);
    const uint8x16_t v_m = vec_splats((const uint8_t)0x0F);

    float32x4_t v_acc = vec_splats(0.0f);

    #pragma GCC unroll 8
    for (; ib + 1 < nb; ib += 2) {
        const block_mxfp4 * GGML_RESTRICT x0 = &x[ib + 0];
        const block_mxfp4 * GGML_RESTRICT x1 = &x[ib + 1];
        const block_q8_0  * GGML_RESTRICT y0 = &y[ib + 0];
        const block_q8_0  * GGML_RESTRICT y1 = &y[ib + 1];

        const uint8x16_t v_x0 = vec_xl(0, x0->qs);
        const uint8x16_t v_x1 = vec_xl(0, x1->qs);

        int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
        int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
        int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
        int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);

        v_x0l = vec_perm(v_k, v_k, (uchar8x16_t)v_x0l);
        v_x0h = vec_perm(v_k, v_k, (uchar8x16_t)v_x0h);
        v_x1l = vec_perm(v_k, v_k, (uchar8x16_t)v_x1l);
        v_x1h = vec_perm(v_k, v_k, (uchar8x16_t)v_x1h);

        const int8x16_t v_y0l = vec_xl(0,       y0->qs);
        const int8x16_t v_y0h = vec_xl(QK8_0/2, y0->qs);
        const int8x16_t v_y1l = vec_xl(0,       y1->qs);
        const int8x16_t v_y1h = vec_xl(QK8_0/2, y1->qs);

        const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0l, v_y0l), v_x0h, v_y0h);
        const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1l, v_y1l), v_x1h, v_y1h);

        const float32x4_t v_xy0f = vec_float(v_xy0);
        const float32x4_t v_xy1f = vec_float(v_xy1);

        const float32x4_t v_d0 = vec_splats(GGML_E8M0_TO_FP32_HALF(x0->e) * GGML_CPU_FP16_TO_FP32(y0->d));
        const float32x4_t v_d1 = vec_splats(GGML_E8M0_TO_FP32_HALF(x1->e) * GGML_CPU_FP16_TO_FP32(y1->d));

        v_acc = vec_madd(v_xy0f, v_d0, v_acc);
        v_acc = vec_madd(v_xy1f, v_d1, v_acc);
    }

    for (; ib < nb; ++ib) {
        const block_mxfp4 * GGML_RESTRICT x0 = &x[ib + 0];
        const block_q8_0  * GGML_RESTRICT y0 = &y[ib + 0];

        const uint8x16_t v_x = vec_xl(0, x0->qs);

        int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
        int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);

        v_xl = vec_perm(v_k, v_k, (uchar8x16_t)v_xl);
        v_xh = vec_perm(v_k, v_k, (uchar8x16_t)v_xh);

        const int8x16_t v_yl = vec_xl(0,       y0->qs);
        const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);

        const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
        const float32x4_t v_xyf = vec_float(v_xy);

        const float32x4_t v_d = vec_splats(GGML_E8M0_TO_FP32_HALF(x0->e) * GGML_CPU_FP16_TO_FP32(y0->d));
        v_acc = vec_madd(v_xyf, v_d, v_acc);
    }

    sumf = vec_hsum_f32x4(v_acc);
    *s = sumf;
#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}

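// q5_0 * q8_0: the fifth (high) bit of each quant is packed in qh. The
// table_b2b_1 lookup expands 8 of those bits into 8 bytes of (!b) << 4, so
// (low nibble) - table_b2b_1 byte == (low4 | b << 4) - 16, producing the
// signed 5-bit value without per-element branching.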
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(qk == QK5_0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q5_0 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    int ib = 0;
    float sumf = 0.0f;

#if defined(__VXE__) || defined(__VXE2__)
    float32x4_t v_sum0 = vec_splats(0.0f);
    float32x4_t v_sum1 = vec_splats(0.0f);

    uint32_t qh0, qh1;
    uint64_t tmp0[4], tmp1[4];

    const uint8x16_t v_m = vec_splats((uint8_t)0x0F);

    #pragma GCC unroll 4
    for (; ib + 1 < nb; ib += 2) {
        const block_q5_0 * GGML_RESTRICT x0 = &x[ib + 0];
        const block_q5_0 * GGML_RESTRICT x1 = &x[ib + 1];
        const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
        const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];

        memcpy(&qh0, x0->qh, sizeof(qh0));
        memcpy(&qh1, x1->qh, sizeof(qh1));

        tmp0[0] = table_b2b_1[(qh0 >>  0) & 0xFF];
        tmp0[1] = table_b2b_1[(qh0 >>  8) & 0xFF];
        tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF];
        tmp0[3] = table_b2b_1[(qh0 >> 24)       ];

        tmp1[0] = table_b2b_1[(qh1 >>  0) & 0xFF];
        tmp1[1] = table_b2b_1[(qh1 >>  8) & 0xFF];
        tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF];
        tmp1[3] = table_b2b_1[(qh1 >> 24)       ];

        int8x16_t v_qh0l = vec_xl(0, (const int8_t *)(tmp0 + 0));
        int8x16_t v_qh0h = vec_xl(0, (const int8_t *)(tmp0 + 2));
        int8x16_t v_qh1l = vec_xl(0, (const int8_t *)(tmp1 + 0));
        int8x16_t v_qh1h = vec_xl(0, (const int8_t *)(tmp1 + 2));

        // required for fixing the byteorder
        v_qh0l = vec_perm(v_qh0l, v_qh0l, v_kperm);
        v_qh0h = vec_perm(v_qh0h, v_qh0h, v_kperm);
        v_qh1l = vec_perm(v_qh1l, v_qh1l, v_kperm);
        v_qh1h = vec_perm(v_qh1h, v_qh1h, v_kperm);

        const uint8x16_t v_x0 = vec_xl(0, (const uint8_t *)x0->qs);
        const uint8x16_t v_x1 = vec_xl(0, (const uint8_t *)x1->qs);

        int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
        int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
        int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
        int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);

        const int8x16_t v_x0lf = vec_sub(v_x0l, v_qh0l);
        const int8x16_t v_x0hf = vec_sub(v_x0h, v_qh0h);
        const int8x16_t v_x1lf = vec_sub(v_x1l, v_qh1l);
        const int8x16_t v_x1hf = vec_sub(v_x1h, v_qh1h);

        const int8x16_t v_y0l = vec_xl(0,       (const int8_t *)y0->qs);
        const int8x16_t v_y0h = vec_xl(QK8_0/2, (const int8_t *)y0->qs);
        const int8x16_t v_y1l = vec_xl(0,       (const int8_t *)y1->qs);
        const int8x16_t v_y1h = vec_xl(QK8_0/2, (const int8_t *)y1->qs);

        const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h);
        const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h);

        const float32x4_t v_xy0f = vec_float(v_xy0);
        const float32x4_t v_xy1f = vec_float(v_xy1);

        const float32x4_t v_d0 = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
        const float32x4_t v_d1 = vec_splats(GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d));

        v_sum0 = vec_madd(v_xy0f, v_d0, v_sum0);
        v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
    }

    sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1);

    #pragma GCC unroll 4
    for (; ib < nb; ++ib) {
        const block_q5_0 * GGML_RESTRICT x0 = &x[ib];
        const block_q8_0 * GGML_RESTRICT y0 = &y[ib];

        uint32_t qh;
        memcpy(&qh, x0->qh, sizeof(qh));

        uint64_t tmp[4];
        tmp[0] = table_b2b_1[(qh >>  0) & 0xFF];
        tmp[1] = table_b2b_1[(qh >>  8) & 0xFF];
        tmp[2] = table_b2b_1[(qh >> 16) & 0xFF];
        tmp[3] = table_b2b_1[(qh >> 24)       ];

        int8x16_t v_qhl = vec_xl(0, (const int8_t *)(tmp + 0));
        int8x16_t v_qhh = vec_xl(0, (const int8_t *)(tmp + 2));

        // required for fixing the byteorder
        v_qhl = vec_perm(v_qhl, v_qhl, v_kperm);
        v_qhh = vec_perm(v_qhh, v_qhh, v_kperm);

        const uint8x16_t v_x = vec_xl(0, (const uint8_t *)x0->qs);
        int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
        int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);

        const int8x16_t v_xlf = vec_sub(v_xl, v_qhl);
        const int8x16_t v_xhf = vec_sub(v_xh, v_qhh);

        const int8x16_t v_yl = vec_xl(0,       (const int8_t *)y0->qs);
        const int8x16_t v_yh = vec_xl(QK8_0/2, (const int8_t *)y0->qs);

        const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh);
        const float32x4_t v_xyf = vec_float(v_xy);

        const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
        const float32x4_t v_acc = vec_madd(v_xyf, v_d, vec_splats(0.0f));

        sumf += vec_hsum_f32x4(v_acc);
    }

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}

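// q5_1 * q8_1: as q5_0, but the quant is unsigned (value = q * d + m), so the
// high bits from table_b2b_0 (b << 4) are OR'ed in rather than subtracted,
// and the m * y->s offset term is accumulated in summs0/summs1 as in q4_1.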
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_1;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(qk == QK5_1);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q5_1 * GGML_RESTRICT x = vx;
    const block_q8_1 * GGML_RESTRICT y = vy;

    int ib = 0;
    float sumf = 0.0f;

#if defined(__VXE__) || defined(__VXE2__)
    float32x4_t v_sum0 = vec_splats(0.0f);
    float32x4_t v_sum1 = vec_splats(0.0f);

    float summs0 = 0.0f;
    float summs1 = 0.0f;

    uint32_t qh0;
    uint32_t qh1;

    uint64_t tmp0[4];
    uint64_t tmp1[4];

    const uint8x16_t v_m = vec_splats((uint8_t)0x0F);

    #pragma GCC unroll 4
    for (; ib + 1 < nb; ib += 2) {
        const block_q5_1 * GGML_RESTRICT x0 = &x[ib + 0];
        const block_q5_1 * GGML_RESTRICT x1 = &x[ib + 1];
        const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0];
        const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1];

        summs0 += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s);
        summs1 += GGML_CPU_FP16_TO_FP32(x1->m) * GGML_CPU_FP16_TO_FP32(y1->s);

        memcpy(&qh0, x0->qh, sizeof(qh0));
        memcpy(&qh1, x1->qh, sizeof(qh1));

        tmp0[0] = table_b2b_0[(qh0 >>  0) & 0xFF];
        tmp0[1] = table_b2b_0[(qh0 >>  8) & 0xFF];
        tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF];
        tmp0[3] = table_b2b_0[(qh0 >> 24)       ];

        tmp1[0] = table_b2b_0[(qh1 >>  0) & 0xFF];
        tmp1[1] = table_b2b_0[(qh1 >>  8) & 0xFF];
        tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF];
        tmp1[3] = table_b2b_0[(qh1 >> 24)       ];

        int8x16_t v_qh0l = vec_xl(0, (const int8_t *)(tmp0 + 0));
        int8x16_t v_qh0h = vec_xl(0, (const int8_t *)(tmp0 + 2));
        int8x16_t v_qh1l = vec_xl(0, (const int8_t *)(tmp1 + 0));
        int8x16_t v_qh1h = vec_xl(0, (const int8_t *)(tmp1 + 2));

        // required for fixing the byteorder
        v_qh0l = vec_perm(v_qh0l, v_qh0l, v_kperm);
        v_qh0h = vec_perm(v_qh0h, v_qh0h, v_kperm);
        v_qh1l = vec_perm(v_qh1l, v_qh1l, v_kperm);
        v_qh1h = vec_perm(v_qh1h, v_qh1h, v_kperm);

        const uint8x16_t v_x0 = vec_xl(0, x0->qs);
        const uint8x16_t v_x1 = vec_xl(0, x1->qs);

        const int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
        const int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
        const int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
        const int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);

        const int8x16_t v_x0lf = vec_or(v_x0l, v_qh0l);
        const int8x16_t v_x0hf = vec_or(v_x0h, v_qh0h);
        const int8x16_t v_x1lf = vec_or(v_x1l, v_qh1l);
        const int8x16_t v_x1hf = vec_or(v_x1h, v_qh1h);

        const int8x16_t v_y0l = vec_xl(0      , y0->qs);
        const int8x16_t v_y0h = vec_xl(QK8_1/2, y0->qs);
        const int8x16_t v_y1l = vec_xl(0      , y1->qs);
        const int8x16_t v_y1h = vec_xl(QK8_1/2, y1->qs);

        const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h);
        const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h);

        const float32x4_t v_xy0f = vec_float(v_xy0);
        const float32x4_t v_xy1f = vec_float(v_xy1);

        const float32x4_t v_d0 = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
        const float32x4_t v_d1 = vec_splats(GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d));

        v_sum0 = vec_madd(v_xy0f, v_d0, v_sum0);
        v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
    }

    sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1) + summs0 + summs1;

    #pragma GCC unroll 4
    for (; ib < nb; ++ib) {
        const block_q5_1 * GGML_RESTRICT x0 = &x[ib];
        const block_q8_1 * GGML_RESTRICT y0 = &y[ib];

        float summs = GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s);

        uint32_t qh;
        memcpy(&qh, x0->qh, sizeof(qh));

        uint64_t tmp[4];
        tmp[0] = table_b2b_0[(qh >>  0) & 0xFF];
        tmp[1] = table_b2b_0[(qh >>  8) & 0xFF];
        tmp[2] = table_b2b_0[(qh >> 16) & 0xFF];
        tmp[3] = table_b2b_0[(qh >> 24)       ];

        int8x16_t v_qhl = vec_xl(0, (const int8_t *)(tmp + 0));
        int8x16_t v_qhh = vec_xl(0, (const int8_t *)(tmp + 2));

        // required for fixing the byteorder
        v_qhl = vec_perm(v_qhl, v_qhl, v_kperm);
        v_qhh = vec_perm(v_qhh, v_qhh, v_kperm);

        const uint8x16_t v_x = vec_xl(0, x0->qs);
        const int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
        const int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);

        const int8x16_t v_xlf = vec_or(v_xl, v_qhl);
        const int8x16_t v_xhf = vec_or(v_xh, v_qhh);

        const int8x16_t v_yl = vec_xl(0      , y0->qs);
        const int8x16_t v_yh = vec_xl(QK8_1/2, y0->qs);

        const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh);
        const float32x4_t v_xyf = vec_float(v_xy);

        const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
        const float32x4_t v_acc = vec_madd(v_xyf, v_d, vec_splats(0.0f));

        sumf += vec_hsum_f32x4(v_acc) + summs;
    }

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}

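// q8_0 * q8_0: both operands are already int8, so each block reduces to two
// 16-byte multiply-accumulates (ggml_vec_dot) scaled by d_x * d_y.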
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    const int qk = QK8_0;
    const int nb = n / qk;

    assert(n % qk == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q8_0 * GGML_RESTRICT x = vx;
    const block_q8_0 * GGML_RESTRICT y = vy;

    int ib = 0;
    float sumf = 0;

#if defined(__VXE__) || defined(__VXE2__)
    float32x4_t acc = vec_splats(0.0f);

#pragma GCC unroll 8
    for (; ib < nb; ++ib) {
        __builtin_prefetch(x[ib].qs, 0, 1);
        __builtin_prefetch(y[ib].qs, 0, 1);

        const int8x16_t v_xl = vec_xl(0      , x[ib].qs);
        const int8x16_t v_xh = vec_xl(QK8_0/2, x[ib].qs);
        const int8x16_t v_yl = vec_xl(0      , y[ib].qs);
        const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs);

        const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
        const float32x4_t v_xy = vec_float(v_xy_);
        const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));

        acc = vec_madd(v_xy, v_d, acc);
    }

    sumf = vec_hsum_f32x4(acc);

    *s = sumf;
#else
    UNUSED(nb);
    UNUSED(x);
    UNUSED(y);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_q8_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}

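// q3_K * q8_K: 256-element super-blocks; 2-bit quants in qs plus one high bit
// per element in hmask, and 16 6-bit sub-block scales packed into 12 bytes
// (unpacked via kmask1/kmask2 into utmp, then biased by -32). A clear hmask
// bit contributes -4, applied through the q3h vectors.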
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const uint32_t kmask1 = 0x03030303;
    const uint32_t kmask2 = 0x0f0f0f0f;

    const block_q3_K * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

#if defined(__VXE__) || defined(__VXE2__)
    uint32_t aux[3];
    uint32_t utmp[4];

    const int32x4_t v_z = vec_splat_s32(0);
    const uint8x16_t v_3m = vec_splat_u8(0x03);

    const uint8x16_t v_0c = vec_splat_u8(1);
    const uint8x16_t v_1c = vec_sl(v_0c, 1);
    const uint8x16_t v_2c = vec_sl(v_0c, 2);
    const uint8x16_t v_3c = vec_sl(v_0c, 3);

    uint8x16_t q3h[4];
    uint8x16_t q3b[2];
    int8x16_t q3bytes[4];
    int8x16_t q8bytes[8];
    uint8x16_t qhbits[2];

    float sum = 0;

    for (int i = 0; i < nb; ++i) {
        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);

        const uint8_t * GGML_RESTRICT x0l = x[i].qs;
        const uint8_t * GGML_RESTRICT x0h = x[i].hmask;
        const int8_t  * GGML_RESTRICT y0  = y[i].qs;

        qhbits[0] = vec_xl(0 , x0h);
        qhbits[1] = vec_xl(16, x0h);

        int32_t isum = 0;

        memcpy(aux, x[i].scales, 12);
        utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
        utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
        utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
        utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);

        int8_t * scale = (int8_t *)utmp;
        for (int j = 0; j < 16; ++j) scale[j] -= 32;

        for (int j = 0; j < QK_K/128; ++j) {
            int32x4_t isum0, isum1, isum2, isum3;

            q3b[0] = vec_xl(0 , x0l);
            q3b[1] = vec_xl(16, x0l);
            x0l += 32;

            q8bytes[0] = vec_xl(0  , y0);
            q8bytes[1] = vec_xl(16 , y0);
            q8bytes[2] = vec_xl(32 , y0);
            q8bytes[3] = vec_xl(48 , y0);
            q8bytes[4] = vec_xl(64 , y0);
            q8bytes[5] = vec_xl(80 , y0);
            q8bytes[6] = vec_xl(96 , y0);
            q8bytes[7] = vec_xl(112, y0);
            y0 += 128;

            q3h[0] = vec_sl(vec_andc(v_0c, qhbits[0]), 2);
            q3h[1] = vec_sl(vec_andc(v_0c, qhbits[1]), 2);
            q3h[2] = vec_sl(vec_andc(v_1c, qhbits[0]), 1);
            q3h[3] = vec_sl(vec_andc(v_1c, qhbits[1]), 1);

            q3bytes[0] = vec_sub((int8x16_t)vec_and(q3b[0], v_3m), (int8x16_t)q3h[0]);
            q3bytes[1] = vec_sub((int8x16_t)vec_and(q3b[1], v_3m), (int8x16_t)q3h[1]);
            q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 2), v_3m), (int8x16_t)q3h[2]);
            q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 2), v_3m), (int8x16_t)q3h[3]);

            isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[0]);
            isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[1]);
            isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[2]);
            isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[3]);

            isum += vec_hsum_i32x4(isum0) * scale[0];
            isum += vec_hsum_i32x4(isum1) * scale[1];
            isum += vec_hsum_i32x4(isum2) * scale[2];
            isum += vec_hsum_i32x4(isum3) * scale[3];

            scale += 4;

            q3h[0] = vec_andc(v_2c, qhbits[0]);
            q3h[1] = vec_andc(v_2c, qhbits[1]);
            q3h[2] = vec_sr(vec_andc(v_3c, qhbits[0]), 1);
            q3h[3] = vec_sr(vec_andc(v_3c, qhbits[1]), 1);

            q3bytes[0] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 4), v_3m), (int8x16_t)q3h[0]);
            q3bytes[1] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 4), v_3m), (int8x16_t)q3h[1]);
            q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 6), v_3m), (int8x16_t)q3h[2]);
            q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 6), v_3m), (int8x16_t)q3h[3]);

            isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[4]);
            isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[5]);
            isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[6]);
            isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[7]);

            isum += vec_hsum_i32x4(isum0) * scale[0];
            isum += vec_hsum_i32x4(isum1) * scale[1];
            isum += vec_hsum_i32x4(isum2) * scale[2];
            isum += vec_hsum_i32x4(isum3) * scale[3];

            scale += 4;

            if (j == 0) {
                qhbits[0] = vec_sr(qhbits[0], 4);
                qhbits[1] = vec_sr(qhbits[1], 4);
            }
        }

        sum += d * isum;
    }

    *s = sum;

#else
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}

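// q4_K * q8_K: 8 sub-blocks of 32 elements with 6-bit scales and mins packed
// into 12 bytes. The min contribution is handled up front: v_ysums holds the
// per-sub-block sums of y (bsums), so sum(bsums[j] * min[j]) * dmin is
// subtracted before the 4-bit quants are processed 64 elements at a time.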
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q4_K * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    static const uint32_t kmask1 = 0x3f3f3f3f;
    static const uint32_t kmask2 = 0x0f0f0f0f;
    static const uint32_t kmask3 = 0x03030303;

    uint32_t utmp[4];

#if defined(__VXE__) || defined(__VXE2__)
    const uint8x16_t v_lm = vec_splat_u8(0x0F);
    const int32x4_t v_z = vec_splat_s32(0);

    uint8x16_t v_x[2];
    int8x16_t  v_xl[2];
    int8x16_t  v_y[2];

    float sumf = 0;

    for (int i = 0; i < nb; ++i) {
        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

        const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
        const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
        const int16x8_t v_ysums = vec_padd_s16(v_ysumsl, v_ysumsh);

        memcpy(utmp, x[i].scales, 12);

        uint32x4_t v_mins8 = { 0 };
        v_mins8 = vec_insert(utmp[1] & kmask1, v_mins8, 0);
        v_mins8 = vec_insert(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), v_mins8, 1);

        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[0] &= kmask1;

        const int16x8_t v_minsh = (int16x8_t)vec_unpackh((uint8x16_t)v_mins8);

        const int32x4_t v_minso = vec_mulo(v_ysums, v_minsh);
        const int32x4_t v_minse = vec_mule(v_ysums, v_minsh);
        const int32x4_t v_mins = v_minso + v_minse;
        sumf -= dmin * vec_hsum_i32x4(v_mins);

        const uint8_t * scales = (const uint8_t *)utmp;
        const uint8_t * GGML_RESTRICT x0 = x[i].qs;
        const int8_t  * GGML_RESTRICT y0 = y[i].qs;

        int32_t sumi1 = 0;
        int32_t sumi2 = 0;

        for (int j = 0; j < QK_K/64; ++j) {
            v_x[0] = vec_xl(0 , x0);
            v_x[1] = vec_xl(16, x0);
            x0 += 32;

            v_y[0] = vec_xl(0 , y0);
            v_y[1] = vec_xl(16, y0);
            y0 += 32;

            v_xl[0] = (int8x16_t)vec_and(v_x[0], v_lm);
            v_xl[1] = (int8x16_t)vec_and(v_x[1], v_lm);

            const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
            sumi1 += vec_hsum_i32x4(p1) * scales[2*j+0];

            v_y[0] = vec_xl(0 , y0);
            v_y[1] = vec_xl(16, y0);
            y0 += 32;

            v_xl[0] = (int8x16_t)vec_sr(v_x[0], 4);
            v_xl[1] = (int8x16_t)vec_sr(v_x[1], 4);

            const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
            sumi2 += vec_hsum_i32x4(p2) * scales[2*j+1];
        }

        sumf += d * (sumi1 + sumi2);
    }

    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(utmp);
    ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}

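// q5_K * q8_K: same scale/min packing as q4_K, plus one high bit per element
// in qh; the high bits are shifted into position (q5h) and OR'ed onto the low
// nibbles before the multiply-accumulate.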
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q5_K * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

    static const uint32_t kmask1 = 0x3f3f3f3f;
    static const uint32_t kmask2 = 0x0f0f0f0f;
    static const uint32_t kmask3 = 0x03030303;

    uint32_t utmp[4];

#if defined(__VXE__) || defined(__VXE2__)
    const uint8x16_t v_lm = vec_splat_u8(0x0F);
    const uint8x16_t v_1m = vec_splat_u8(0x01);
    const uint8x16_t v_2m = vec_splat_u8(0x02);

    const int32x4_t v_z = vec_splat_s32(0);

    const uchar8x16_t v_minsm = {
        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
    };

    int8x16_t  q5b[4];
    uint8x16_t q5h[4];

    uint8x16_t v_xl[2];
    uint8x16_t v_xh[2];
    int8x16_t  v_y[4];

    float sumf = 0;

    for (int i = 0; i < nb; ++i) {
        const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
        const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

        const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
        const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
        const int16x8_t v_ysums = vec_padd_s16(v_ysumsl, v_ysumsh);

        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        const uint8x16_t v_mins16 = vec_xl(0, (const uint8_t *)utmp);
        const uint8x16_t v_mins8 = vec_perm(v_mins16, v_mins16, v_minsm);
        const int16x8_t v_minsh = (int16x8_t)vec_unpackh(v_mins8);

        const int32x4_t v_minsho = vec_mulo(v_ysums, v_minsh);
        const int32x4_t v_minshe = vec_mule(v_ysums, v_minsh);
        const int32x4_t v_mins = vec_add(v_minsho, v_minshe);
        const int32_t mins = vec_hsum_i32x4(v_mins);

        const uint8_t * scales = (const uint8_t *)utmp;
        const uint8_t * GGML_RESTRICT x0l = x[i].qs;
        const uint8_t * GGML_RESTRICT x0h = x[i].qh;
        const int8_t  * GGML_RESTRICT y0 = y[i].qs;

        v_xh[0] = vec_xl(0 , x0h);
        v_xh[1] = vec_xl(16, x0h);

        int32_t sumi = 0;
        for (int j = 0; j < QK_K/64; ++j) {
            v_xl[0] = vec_xl(0 , x0l);
            v_xl[1] = vec_xl(16, x0l);
            x0l += 32;

            v_y[0] = vec_xl(0 , y0);
            v_y[1] = vec_xl(16, y0);
            v_y[2] = vec_xl(32, y0);
            v_y[3] = vec_xl(48, y0);
            y0 += 64;

            q5h[0] = vec_sl(vec_and(v_1m, v_xh[0]), 4);
            q5h[1] = vec_sl(vec_and(v_1m, v_xh[1]), 4);
            q5h[2] = vec_sl(vec_and(v_2m, v_xh[0]), 3);
            q5h[3] = vec_sl(vec_and(v_2m, v_xh[1]), 3);
            v_xh[0] = vec_sr(v_xh[0], 2);
            v_xh[1] = vec_sr(v_xh[1], 2);

            q5b[0] = (int8x16_t)vec_or(vec_and(v_xl[0], v_lm), q5h[0]);
            q5b[1] = (int8x16_t)vec_or(vec_and(v_xl[1], v_lm), q5h[1]);
            q5b[2] = (int8x16_t)vec_or(vec_sr(v_xl[0], 4), q5h[2]);
            q5b[3] = (int8x16_t)vec_or(vec_sr(v_xl[1], 4), q5h[3]);

            int32x4_t sumi0 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[0], v_y[0]), q5b[1], v_y[1]);
            int32x4_t sumi1 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[2], v_y[2]), q5b[3], v_y[3]);

            sumi += vec_hsum_i32x4(sumi0) * *scales++;
            sumi += vec_hsum_i32x4(sumi1) * *scales++;
        }

        sumf += d * sumi - dmin * mins;
    }

    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(kmask1);
    UNUSED(kmask2);
    UNUSED(kmask3);
    UNUSED(utmp);
    ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}

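// q6_K * q8_K: 4-bit ql plus 2-bit qh form an unsigned 6-bit quant with an
// implicit -32 offset. The offset is not applied per element; it is folded
// out at the end as d_all * y->d * (isum - 32 * mins), where
// mins = sum(bsums[j] * scale[j]).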
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q6_K * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

#if defined(__VXE__) || defined(__VXE2__)
    float sum = 0;

    // Lower 4-bit and upper 2-bit masks
    const uint8x16_t v_lm = vec_splat_u8(0x0F);
    const uint8x16_t v_um = vec_splat_u8(0x03);

    const int32x4_t v_z = vec_splat_s32(0);

    int8x16_t  q6b[4];
    uint8x16_t q6h[4];

    uint8x16_t v_xl[4];
    uint8x16_t v_xh[2];
    int8x16_t  v_y[4];

    for (int i = 0; i < nb; ++i) {
        const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d);

        const uint8_t * GGML_RESTRICT x0l = x[i].ql;
        const uint8_t * GGML_RESTRICT x0h = x[i].qh;
        const int8_t  * GGML_RESTRICT y0 = y[i].qs;

        const int8_t  * GGML_RESTRICT scale = x[i].scales;

        const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
        const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);

        const int8x16_t v_scale  = vec_xl(0, scale);
        const int16x8_t v_scalel = vec_unpackh(v_scale);
        const int16x8_t v_scaleh = vec_unpackl(v_scale);

        const int32x4_t v_minslo = vec_mulo(v_ysumsl, v_scalel);
        const int32x4_t v_minsle = vec_mule(v_ysumsl, v_scalel);
        const int32x4_t v_minsho = vec_mulo(v_ysumsh, v_scaleh);
        const int32x4_t v_minshe = vec_mule(v_ysumsh, v_scaleh);
        const int32x4_t v_mins = v_minslo + v_minsle + v_minsho + v_minshe;

        const int32_t mins = vec_hsum_i32x4(v_mins);

        int32_t isum = 0;
        for (int j = 0; j < QK_K/128; ++j) {
            // Load model upper 2 bits
            v_xh[0] = vec_xl(0 , x0h);
            v_xh[1] = vec_xl(16, x0h);
            x0h += 32;

            // Load model lower 4 bits
            v_xl[0] = vec_xl(0 , x0l);
            v_xl[1] = vec_xl(16, x0l);
            v_xl[2] = vec_xl(32, x0l);
            v_xl[3] = vec_xl(48, x0l);
            x0l += 64;

            // Load activation quants
            v_y[0] = vec_xl(0 , y0);
            v_y[1] = vec_xl(16, y0);
            v_y[2] = vec_xl(32, y0);
            v_y[3] = vec_xl(48, y0);
            y0 += 64;

            q6h[0] = vec_sl(vec_and(v_um, v_xh[0]), 4);
            q6h[1] = vec_sl(vec_and(v_um, v_xh[1]), 4);
            uint8x16_t shifted = vec_sr(v_xh[0], 2);
            q6h[2] = vec_sl(vec_and(v_um, shifted), 4);
            shifted = vec_sr(v_xh[1], 2);
            q6h[3] = vec_sl(vec_and(v_um, shifted), 4);

            q6b[0] = (int8x16_t)(vec_or(vec_and(v_xl[0], v_lm), q6h[0]));
            q6b[1] = (int8x16_t)(vec_or(vec_and(v_xl[1], v_lm), q6h[1]));
            q6b[2] = (int8x16_t)(vec_or(vec_and(v_xl[2], v_lm), q6h[2]));
            q6b[3] = (int8x16_t)(vec_or(vec_and(v_xl[3], v_lm), q6h[3]));

            int32x4_t summs0 = ggml_vec_dot(v_z, q6b[0], v_y[0]);
            int32x4_t summs1 = ggml_vec_dot(v_z, q6b[1], v_y[1]);
            int32x4_t summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
            int32x4_t summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);

            isum += vec_hsum_i32x4(summs0) * scale[0] +
                    vec_hsum_i32x4(summs1) * scale[1] +
                    vec_hsum_i32x4(summs2) * scale[2] +
                    vec_hsum_i32x4(summs3) * scale[3];

            scale += 4;

            // Load activation quants
            v_y[0] = vec_xl(0 , y0);
            v_y[1] = vec_xl(16, y0);
            v_y[2] = vec_xl(32, y0);
            v_y[3] = vec_xl(48, y0);
            y0 += 64;

            shifted = vec_sr(v_xh[0], 4);
            q6h[0] = vec_sl(vec_and(v_um, shifted), 4);
            shifted = vec_sr(v_xh[1], 4);
            q6h[1] = vec_sl(vec_and(v_um, shifted), 4);
            shifted = vec_sr(v_xh[0], 6);
            q6h[2] = vec_sl(vec_and(v_um, shifted), 4);
            shifted = vec_sr(v_xh[1], 6);
            q6h[3] = vec_sl(vec_and(v_um, shifted), 4);

            q6b[0] = (int8x16_t)(vec_or(vec_sr(v_xl[0], 4), q6h[0]));
            q6b[1] = (int8x16_t)(vec_or(vec_sr(v_xl[1], 4), q6h[1]));
            q6b[2] = (int8x16_t)(vec_or(vec_sr(v_xl[2], 4), q6h[2]));
            q6b[3] = (int8x16_t)(vec_or(vec_sr(v_xl[3], 4), q6h[3]));

            summs0 = ggml_vec_dot(v_z, q6b[0], v_y[0]);
            summs1 = ggml_vec_dot(v_z, q6b[1], v_y[1]);
            summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
            summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);

            isum += vec_hsum_i32x4(summs0) * scale[0] +
                    vec_hsum_i32x4(summs1) * scale[1] +
                    vec_hsum_i32x4(summs2) * scale[2] +
                    vec_hsum_i32x4(summs3) * scale[3];

            scale += 4;
        }

        sum += d_all * y[i].d * (isum - 32 * mins);
    }

    *s = sum;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}

// #if defined(__VXE__) || defined(__VXE2__)
// static const int8_t keven_signs_q2xs[1024] = {
//      1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1,  1,
//      1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1,  1,  1, -1, -1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1, -1,
//      1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1, -1,
//      1,  1, -1, -1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1, -1,  1, -1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1,  1,
//      1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1, -1,
//      1,  1, -1,  1, -1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1, -1,  1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1,  1,
//      1,  1,  1, -1, -1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1,  1,
//      1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1,  1,  1, -1, -1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1, -1,
//      1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1,  1, -1,  1, -1,
//      1,  1, -1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1,  1,
//      1,  1,  1, -1,  1, -1,  1,  1, -1,  1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1,  1,
//      1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1,  1,  1, -1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1,
//      1,  1,  1,  1, -1, -1,  1,  1, -1,  1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1,  1,
//      1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1, -1,
//      1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1,  1,  1,  1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1, -1,
//      1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1,  1,
//      1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,  1,  1, -1, -1,
//      1,  1, -1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1,  1,
//      1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1, -1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1,  1,
//      1,  1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1,  1,  1, -1, -1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1, -1,
//      1,  1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1, -1,  1, -1, -1,  1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1,  1,
//      1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1, -1,
//      1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1, -1,  1,  1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1, -1,
//      1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1, -1, -1,  1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1,  1,
//      1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1, -1, -1, -1,  1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1,  1,
//      1,  1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1, -1,  1,  1, -1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1, -1,
//      1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1, -1,
//      1,  1, -1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1,  1,
//      1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1, -1,  1,  1, -1,  1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1, -1, -1, -1,
//      1,  1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1, -1, -1,  1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1,  1,
//      1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1, -1, -1, -1, -1, -1,  1, -1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1,  1,
//      1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1,  1,  1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,
// };
// #endif

// void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
//     assert(n % QK_K == 0);
//     assert(nrc == 1);
//     UNUSED(nrc);
//     UNUSED(bx);
//     UNUSED(by);
//     UNUSED(bs);

//     const block_iq2_xxs * GGML_RESTRICT x = vx;
//     const block_q8_K    * GGML_RESTRICT y = vy;

//     const int nb = n / QK_K;

// #if defined(__VXE__) || defined(__VXE2__)
//    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;

//    uint32_t aux32[4];
//    const uint8_t * aux8 = (const uint8_t *)aux32;

//    float sumf = 0;

//    for (int i = 0; i < nb; ++i) {
//        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
//        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
//        const int8_t   * GGML_RESTRICT q8 = y[i].qs;

//        float sumf1 = 0, sumf2 = 0;

//        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
//            int8x16_t q8b0 = vec_xl( 0, q8);
//            int8x16_t q8b1 = vec_xl(16, q8);
//            int8x16_t q8b2 = vec_xl(32, q8);
//            int8x16_t q8b3 = vec_xl(48, q8);
//            q8 += 64;

//            memcpy(aux32, q2, 4 * sizeof(uint32_t));
//            q2 += 8;

//            int8x16_t q2u0 = { *(const int64_t *)(iq2xxs_grid + aux8[ 0]), *(const int64_t *)(iq2xxs_grid + aux8[ 1]) };
//            int8x16_t q2u1 = { *(const int64_t *)(iq2xxs_grid + aux8[ 2]), *(const int64_t *)(iq2xxs_grid + aux8[ 3]) };
//            int8x16_t q2u2 = { *(const int64_t *)(iq2xxs_grid + aux8[ 8]), *(const int64_t *)(iq2xxs_grid + aux8[ 9]) };
//            int8x16_t q2u3 = { *(const int64_t *)(iq2xxs_grid + aux8[10]), *(const int64_t *)(iq2xxs_grid + aux8[11]) };

//            int8x16_t q2s0 = { *(const int64_t *)(signs64 + ((aux32[1] >>  0) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >>  7) & 127)) };
//            int8x16_t q2s1 = { *(const int64_t *)(signs64 + ((aux32[1] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 21) & 127)) };
//            int8x16_t q2s2 = { *(const int64_t *)(signs64 + ((aux32[3] >>  0) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >>  7) & 127)) };
//            int8x16_t q2s3 = { *(const int64_t *)(signs64 + ((aux32[3] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 21) & 127)) };

//            q2u0 = vec_mul(q2u0, q2s0);
//            q2u1 = vec_mul(q2u1, q2s1);
//            q2u2 = vec_mul(q2u2, q2s2);
//            q2u3 = vec_mul(q2u3, q2s3);

//            const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(vec_splat_s32(0), q2u0, q8b0), q2u1, q8b1);
//            const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(vec_splat_s32(0), q2u2, q8b2), q2u3, q8b3);

//            sumf1 += (p1[0] + p1[1] + p1[2] + p1[3]) * (0.5f + (aux32[1] >> 28));
//            sumf2 += (p2[0] + p2[1] + p2[2] + p2[3]) * (0.5f + (aux32[3] >> 28));
//        }

//        sumf += d * (sumf1 + sumf2);
//    }

//    *s = 0.25f * sumf;

// #else

//     uint32_t aux32[2];
//     const uint8_t * aux8 = (const uint8_t *)aux32;

//     float sumf = 0.f;
//     for (int i = 0; i < nb; ++i) {
//         const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
//         const uint16_t * GGML_RESTRICT q2 = x[i].qs;
//         const int8_t   * GGML_RESTRICT q8 = y[i].qs;
//         int32_t bsum = 0;
//         for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
//             memcpy(aux32, q2, 2*sizeof(uint32_t));
//             q2 += 4;
//             const uint32_t ls = 2*(aux32[1] >> 28) + 1;
//             int32_t sumi = 0;
//             for (int l = 0; l < 4; ++l) {
//                 const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
//                 const uint8_t  signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
//                 for (int j = 0; j < 8; ++j) {
//                     sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
//                 }
//                 q8 += 8;
//             }
//             bsum += sumi * ls;
//         }
//         sumf += d * bsum;
//     }
//     *s = 0.125f * sumf;
// #endif
// }

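// iq4_nl * q8_0: 4-bit codes index the non-linear kvalues_iq4nl LUT, looked
// up with vec_perm exactly as in the mxfp4 kernel, with a plain fp16 scale
// per 32-element block.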
void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK4_NL == 0);
    static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");

    const block_iq4_nl * GGML_RESTRICT x = vx;
    const block_q8_0   * GGML_RESTRICT y = vy;

    const int nb = n / QK4_NL;

    int ib = 0;
    float sumf = 0;

#if defined(__VXE__) || defined(__VXE2__)
    const int8x16_t v_k = vec_xl(0, kvalues_iq4nl);
    const uint8x16_t v_m = vec_splat_u8(0x0F);

    for (; ib < nb; ++ib) {
        const block_iq4_nl * GGML_RESTRICT x0 = &x[ib];
        const block_q8_0   * GGML_RESTRICT y0 = &y[ib];

        const uint8x16_t v_x = vec_xl(0, x0->qs);
        int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
        int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);

        v_xl = vec_perm(v_k, v_k, (uchar8x16_t)v_xl);
        v_xh = vec_perm(v_k, v_k, (uchar8x16_t)v_xh);

        const int8x16_t v_yl = vec_xl(0      , y0->qs);
        const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
        const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);

        sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * vec_hsum_i32x4(v_xy);
    }

    *s = sumf;
#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    UNUSED(ib);
    UNUSED(sumf);
    ggml_vec_dot_iq4_nl_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}

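// iq4_xs * q8_K: super-block variant of iq4_nl; each 32-element sub-block has
// a 6-bit scale whose low 4 bits live in scales_l and high 2 bits in
// scales_h, reassembled below as ls - 32.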
void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    assert(n % QK_K == 0);

    const block_iq4_xs * GGML_RESTRICT x = vx;
    const block_q8_K   * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;

#if defined(__VXE__) || defined(__VXE2__)
    const int8x16_t v_k = vec_xl(0, kvalues_iq4nl);
    const uint8x16_t v_m = vec_splat_u8(0x0F);

    float sumf = 0;

    for (int ibl = 0; ibl < nb; ++ibl) {
        const uint8_t * GGML_RESTRICT q4 = x[ibl].qs;
        const int8_t  * GGML_RESTRICT q8 = y[ibl].qs;

        uint16_t h = x[ibl].scales_h;

        int sumi1 = 0, sumi2 = 0;
        for (int ib = 0; ib < QK_K/64; ++ib) {
            const uint8x16_t v_x0 = vec_xl(0       , q4);
            const uint8x16_t v_x1 = vec_xl(QK4_NL/2, q4);
            q4 += 32;

            int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
            int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
            int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
            int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);

            v_x0l = vec_perm(v_k, v_k, (uchar8x16_t)v_x0l);
            v_x0h = vec_perm(v_k, v_k, (uchar8x16_t)v_x0h);
            v_x1l = vec_perm(v_k, v_k, (uchar8x16_t)v_x1l);
            v_x1h = vec_perm(v_k, v_k, (uchar8x16_t)v_x1h);

            const int8x16_t v_y0 = vec_xl( 0, q8);
            const int8x16_t v_y1 = vec_xl(16, q8);
            const int8x16_t v_y2 = vec_xl(32, q8);
            const int8x16_t v_y3 = vec_xl(48, q8);
            q8 += 64;

            int32x4_t vsumi0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0l, v_y0), v_x0h, v_y1);
            int32x4_t vsumi1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1l, v_y2), v_x1h, v_y3);

            int ls1 = ((x[ibl].scales_l[ib] & 0xF) | ((h << 4) & 0x30)) - 32;
            int ls2 = ((x[ibl].scales_l[ib] >>  4) | ((h << 2) & 0x30)) - 32;

            h >>= 4;

            sumi1 += vec_hsum_i32x4(vsumi0) * ls1;
            sumi2 += vec_hsum_i32x4(vsumi1) * ls2;
        }

        sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
    }

    *s = sumf;

#else
    UNUSED(x);
    UNUSED(y);
    UNUSED(nb);
    ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
