#pragma once

#include "ggml-cpu-impl.h"

#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif // __ARM_FEATURE_SVE

#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
//
//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
//
#include <arm_neon.h>
#endif

#if defined(__riscv_v_intrinsic)
#include <riscv_vector.h>
#endif

#ifdef __cplusplus
extern "C" {
#endif

//
// simd mappings
//

// FP16 <-> FP32 conversion

// 16-bit float
// on Arm, we use __fp16
// on x86, we use uint16_t
//
// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616
// for     MUSA compilers        , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
//
#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
    #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) neon_compute_fp16_to_fp32(x)
    #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) neon_compute_fp32_to_fp16(x)

    #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)

    static inline float neon_compute_fp16_to_fp32(ggml_fp16_t h) {
        __fp16 tmp;
        memcpy(&tmp, &h, sizeof(ggml_fp16_t));
        return (float)tmp;
    }

    static inline ggml_fp16_t neon_compute_fp32_to_fp16(float f) {
        ggml_fp16_t res;
        __fp16 tmp = f;
        memcpy(&res, &tmp, sizeof(ggml_fp16_t));
        return res;
    }
#elif defined(__F16C__)
    #ifdef _MSC_VER
        #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
        #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
    #else
        #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
        #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
    #endif
#elif defined(__POWER9_VECTOR__)
    #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) power_compute_fp16_to_fp32(x)
    #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) power_compute_fp32_to_fp16(x)
    /* the inline asm below is about 12% faster than the lookup method */
    #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
    #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)

    static inline float power_compute_fp16_to_fp32(ggml_fp16_t h) {
        float f;
        double d;
        __asm__(
            "mtfprd %0,%2\n"
            "xscvhpdp %0,%0\n"
            "frsp %1,%0\n" :
            /* temp */ "=d"(d),
            /* out */  "=f"(f):
            /* in */   "r"(h));
        return f;
    }

    static inline ggml_fp16_t power_compute_fp32_to_fp16(float f) {
        double d;
        ggml_fp16_t r;
        __asm__( /* xscvdphp can work on double or single precision */
            "xscvdphp %0,%2\n"
            "mffprd %1,%0\n" :
            /* temp */ "=d"(d),
            /* out */  "=r"(r):
            /* in */   "f"(f));
        return r;
    }
#elif defined(__riscv) && defined(__riscv_zfhmin)
    static inline float riscv_compute_fp16_to_fp32(ggml_fp16_t h) {
        _Float16 hf;
        memcpy(&hf, &h, sizeof(ggml_fp16_t));
        return hf;
    }

    static inline ggml_fp16_t riscv_compute_fp32_to_fp16(float f) {
        ggml_fp16_t res;
        _Float16 hf = (_Float16)f;
        memcpy(&res, &hf, sizeof(ggml_fp16_t));
        return res;
    }

    #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) riscv_compute_fp16_to_fp32(x)
    #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
    #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
    #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
#endif

// precomputed f32 table for f16 (256 KB)
// defined in ggml-cpu.c, initialized in ggml_cpu_init()
extern float ggml_table_f32_f16[1 << 16];

// precomputed f32 table for e8m0 half (1 KB)
// defined in ggml-cpu.c, initialized in ggml_cpu_init()
extern float ggml_table_f32_e8m0_half[1 << 8];

// Use lookup table for E8M0 on x86 (faster than bit manipulation)
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
#define GGML_CPU_E8M0_TO_FP32_HALF(x) ggml_table_f32_e8m0_half[(uint8_t)(x)]
#else
#define GGML_CPU_E8M0_TO_FP32_HALF(x) GGML_E8M0_TO_FP32_HALF(x)
#endif
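// note: the 1 << 8 = 256 table entries cover every possible 8-bit exponent
// pattern, so the conversion becomes a single indexed load instead of the bit
// manipulation performed by GGML_E8M0_TO_FP32_HALF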

// On ARM NEON it is faster to convert in hardware than to call into
// ggml_lookup_fp16_to_fp32, so GGML_CPU_FP16_TO_FP32 and GGML_CPU_FP32_TO_FP16
// are defined above for NEON. The same is true for POWER9.
#if !defined(GGML_CPU_FP16_TO_FP32)
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
    uint16_t s;
    memcpy(&s, &f, sizeof(uint16_t));
    return ggml_table_f32_f16[s];
}

#define GGML_CPU_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#endif

#if !defined(GGML_CPU_FP32_TO_FP16)
#define GGML_CPU_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
#endif
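
// whichever branch was taken above, usage is uniform across architectures,
// e.g. a round trip through half precision (illustrative sketch only):
//
//   ggml_fp16_t h = GGML_CPU_FP32_TO_FP16(0.5f); // 0.5f is exactly representable in FP16
//   float       f = GGML_CPU_FP16_TO_FP32(h);    // f == 0.5f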


// we define a common set of C macros which map to specific intrinsics based on the current architecture
// we then implement the fundamental computation operations below using only these macros
// adding support for a new architecture requires defining the corresponding SIMD macros
// (see the usage sketch at the end of this file for how the macros compose)
//
// GGML_F32_STEP / GGML_F16_STEP
//   number of elements to process in a single step
//
// GGML_F32_EPR / GGML_F16_EPR
//   number of elements to fit in a single register
//

#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_FMA)

#define GGML_SIMD

// F32 SVE
#define GGML_F32_EPR 8
#define DEFAULT_PG svptrue_b32()

#define GGML_F32xt                        svfloat32_t
#define GGML_F32xt_ZERO                   svdup_n_f32(0.0f)
#define GGML_F32xt_SET1(x)                svdup_n_f32(x)
#define GGML_F32xt_LOAD_IMPL(pg, a)       svld1_f32(pg, a)
#define GGML_F32xt_LOAD(a)                GGML_F32xt_LOAD_IMPL(DEFAULT_PG, a)
#define GGML_F32xt_STORE_IMPL(pg, a, b)   svst1_f32(pg, a, b)
#define GGML_F32xt_STORE(a, b)            GGML_F32xt_STORE_IMPL(DEFAULT_PG, a, b)
#define GGML_F32xt_FMA_IMPL(pg, a, b, c)  svmad_f32_m(pg, b, c, a)
#define GGML_F32xt_FMA(a, b, c)           GGML_F32xt_FMA_IMPL(DEFAULT_PG, a, b, c)
#define GGML_F32xt_ADD_IMPL(pg, a, b)     svadd_f32_m(pg, a, b)
#define GGML_F32xt_ADD(a, b)              GGML_F32xt_ADD_IMPL(DEFAULT_PG, a, b)
#define GGML_F32xt_MUL_IMPL(pg, a, b)     svmul_f32_m(pg, a, b)
#define GGML_F32xt_MUL(a, b)              GGML_F32xt_MUL_IMPL(DEFAULT_PG, a, b)
#define GGML_F32xt_REDUCE_ONE_IMPL(pg, a) svaddv(pg, a)
#define GGML_F32xt_REDUCE_ONE(a)          GGML_F32xt_REDUCE_ONE_IMPL(DEFAULT_PG, a)
#define GGML_F32xt_REDUCE_IMPL(pg, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8)  \
{                                                      \
    sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum2);        \
    sum3 = svadd_f32_m(DEFAULT_PG, sum3, sum4);        \
    sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum6);        \
    sum7 = svadd_f32_m(DEFAULT_PG, sum7, sum8);        \
    sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum3);        \
    sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum7);        \
    sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum5);        \
    (res) = (ggml_float) GGML_F32xt_REDUCE_ONE(sum1);  \
}
#define GGML_F32xt_REDUCE(res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8)  \
        GGML_F32xt_REDUCE_IMPL(DEFAULT_PG, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8)

#define GGML_F32_VEC        GGML_F32xt
#define GGML_F32_VEC_ZERO   GGML_F32xt_ZERO
#define GGML_F32_VEC_SET1   GGML_F32xt_SET1
#define GGML_F32_VEC_LOAD   GGML_F32xt_LOAD
#define GGML_F32_VEC_STORE  GGML_F32xt_STORE
#define GGML_F32_VEC_FMA    GGML_F32xt_FMA
#define GGML_F32_VEC_ADD    GGML_F32xt_ADD
#define GGML_F32_VEC_MUL    GGML_F32xt_MUL
#define GGML_F32_VEC_REDUCE GGML_F32xt_REDUCE
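
// note: GGML_F32_EPR is hard-coded to 8 lanes here, which assumes a 256-bit SVE
// vector length; unlike the array-based reduces below, the SVE F32 reduce
// consumes eight accumulators at once, e.g. (illustrative sketch only):
//
//   ggml_float res;
//   GGML_F32_VEC sum1 = GGML_F32_VEC_ZERO, sum2 = GGML_F32_VEC_ZERO; // ... through sum8
//   sum1 = GGML_F32_VEC_FMA(sum1, GGML_F32_VEC_LOAD(x), GGML_F32_VEC_LOAD(y)); // sum1 += x*y
//   // ... accumulate into sum2..sum8 likewise, then:
//   GGML_F32_VEC_REDUCE(res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);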

// F16 SVE
#define DEFAULT_PG32    svptrue_b32()
#define DEFAULT_PG16    svptrue_b16()

#define GGML_F32Cxt                         svfloat16_t
#define GGML_F32Cxt_ZERO                    svdup_n_f16(0.0f)
#define GGML_F32Cxt_SET1(x)                 svdup_n_f16(x)
#define GGML_F32Cxt_LOAD(p)                 svld1_f16(DEFAULT_PG16, (const __fp16 *)(p))
#define GGML_F32Cxt_STORE(dst_ptr, src_vec) svst1_f16(DEFAULT_PG16, (__fp16 *)(dst_ptr), (src_vec))

#define GGML_F32Cxt_FMA_IMPL(pg, a, b, c)   svmad_f16_x(pg, b, c, a)
#define GGML_F32Cxt_FMA(a, b, c)            GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, a, b, c)
#define GGML_F32Cxt_ADD_IMPL(pg, a, b)      svadd_f16_x(pg, a, b)
#define GGML_F32Cxt_ADD(a, b)               GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, a, b)
#define GGML_F32Cxt_MUL_IMPL(pg, a, b)      svmul_f16_x(pg, a, b)
#define GGML_F32Cxt_MUL(a, b)               GGML_F32Cxt_MUL_IMPL(DEFAULT_PG16, a, b)
#define GGML_F32Cxt_REDUCE                  GGML_F16xt_REDUCE_MIXED

#define GGML_F16x_VEC                GGML_F32Cxt
#define GGML_F16x_VEC_ZERO           GGML_F32Cxt_ZERO
#define GGML_F16x_VEC_SET1           GGML_F32Cxt_SET1
#define GGML_F16x_VEC_LOAD(p, i)     GGML_F32Cxt_LOAD(p)
#define GGML_F16x_VEC_STORE(p, r, i) GGML_F32Cxt_STORE((__fp16 *)(p), r)
#define GGML_F16x_VEC_FMA            GGML_F32Cxt_FMA
#define GGML_F16x_VEC_ADD            GGML_F32Cxt_ADD
#define GGML_F16x_VEC_MUL            GGML_F32Cxt_MUL
#define GGML_F16x_VEC_REDUCE         GGML_F32Cxt_REDUCE

#define GGML_F16xt_REDUCE_ONE_IMPL(pg, a) svaddv_f16(pg, a)
#define GGML_F16xt_REDUCE_ONE(a)          GGML_F16xt_REDUCE_ONE_IMPL(DEFAULT_PG16, a)

#define GGML_F16xt_REDUCE_MIXED_IMPL(pg16, res, sum1, sum2, sum3, sum4)  \
{                                                      \
    sum1 = svadd_f16_x(pg16, sum1, sum2);              \
    sum3 = svadd_f16_x(pg16, sum3, sum4);              \
    sum1 = svadd_f16_x(pg16, sum1, sum3);              \
    __fp16 sum_f16 = svaddv_f16(pg16, sum1);           \
    (res) = (ggml_float) sum_f16;                      \
}
#define GGML_F16xt_REDUCE_MIXED(res, sum1, sum2, sum3, sum4)  \
        GGML_F16xt_REDUCE_MIXED_IMPL(DEFAULT_PG16, res, sum1, sum2, sum3, sum4)

// F16 NEON

#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
    #define GGML_F16_STEP 32
    #define GGML_F16_EPR  8

    #define GGML_F16x8              float16x8_t
    #define GGML_F16x8_ZERO         vdupq_n_f16(0.0f)
    #define GGML_F16x8_SET1(x)      vdupq_n_f16(x)
    #define GGML_F16x8_LOAD(x)      vld1q_f16((const __fp16 *)(x))
    #define GGML_F16x8_STORE        vst1q_f16
    #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
    #define GGML_F16x8_ADD          vaddq_f16
    #define GGML_F16x8_MUL          vmulq_f16
    #define GGML_F16x8_REDUCE(res, x)                               \
    do {                                                            \
        int offset = GGML_F16_ARR >> 1;                             \
        for (int i = 0; i < offset; ++i) {                          \
            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
        }                                                           \
        offset >>= 1;                                               \
        for (int i = 0; i < offset; ++i) {                          \
            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
        }                                                           \
        offset >>= 1;                                               \
        for (int i = 0; i < offset; ++i) {                          \
            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
        }                                                           \
        const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
        const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
        (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1));         \
    } while (0)

    #define GGML_F16_VEC                GGML_F16x8
    #define GGML_F16_VEC_ZERO           GGML_F16x8_ZERO
    #define GGML_F16_VEC_SET1           GGML_F16x8_SET1
    #define GGML_F16_VEC_LOAD(p, i)     GGML_F16x8_LOAD(p)
    #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
    #define GGML_F16_VEC_FMA            GGML_F16x8_FMA
    #define GGML_F16_VEC_ADD            GGML_F16x8_ADD
    #define GGML_F16_VEC_MUL            GGML_F16x8_MUL
    #define GGML_F16_VEC_REDUCE         GGML_F16x8_REDUCE
#else
    // if FP16 vector arithmetic is not supported, we use FP32 instead
    // and take advantage of the vcvt_ functions to convert to/from FP16

    #define GGML_F16_STEP 16
    #define GGML_F16_EPR  4

    #define GGML_F32Cx4              float32x4_t
    #define GGML_F32Cx4_ZERO         vdupq_n_f32(0.0f)
    #define GGML_F32Cx4_SET1(x)      vdupq_n_f32(x)
    #define GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
    #define GGML_F32Cx4_STORE(x, y)  vst1_f16(x, vcvt_f16_f32(y))
    #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
    #define GGML_F32Cx4_ADD          vaddq_f32
    #define GGML_F32Cx4_MUL          vmulq_f32
    #define GGML_F32Cx4_REDUCE       GGML_F32x4_REDUCE

    #define GGML_F16_VEC                GGML_F32Cx4
    #define GGML_F16_VEC_ZERO           GGML_F32Cx4_ZERO
    #define GGML_F16_VEC_SET1           GGML_F32Cx4_SET1
    #define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx4_LOAD(p)
    #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
    #define GGML_F16_VEC_FMA            GGML_F32Cx4_FMA
    #define GGML_F16_VEC_ADD            GGML_F32Cx4_ADD
    #define GGML_F16_VEC_MUL            GGML_F32Cx4_MUL
    #define GGML_F16_VEC_REDUCE         GGML_F32Cx4_REDUCE
#endif

#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)

#define GGML_SIMD

// F32 NEON

#define GGML_F32_STEP 16
#define GGML_F32_EPR  4

#define GGML_F32x4              float32x4_t
#define GGML_F32x4_ZERO         vdupq_n_f32(0.0f)
#define GGML_F32x4_SET1(x)      vdupq_n_f32(x)
#define GGML_F32x4_LOAD         vld1q_f32
#define GGML_F32x4_STORE        vst1q_f32
#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
#define GGML_F32x4_ADD          vaddq_f32
#define GGML_F32x4_MUL          vmulq_f32
#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
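// pairwise tree reduction over the GGML_F32_ARR accumulator registers: the
// three halving passes below fold the register array into (x)[0], which is
// then summed horizontally (three passes cover GGML_F32_ARR up to 8)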
#define GGML_F32x4_REDUCE(res, x)                       \
{                                                       \
    int offset = GGML_F32_ARR >> 1;                     \
    for (int i = 0; i < offset; ++i) {                  \
        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);      \
    }                                                   \
    offset >>= 1;                                       \
    for (int i = 0; i < offset; ++i) {                  \
        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);      \
    }                                                   \
    offset >>= 1;                                       \
    for (int i = 0; i < offset; ++i) {                  \
        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);      \
    }                                                   \
    (res) = (ggml_float) GGML_F32x4_REDUCE_ONE((x)[0]); \
}

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 NEON

#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
    #define GGML_F16_STEP 32
    #define GGML_F16_EPR  8

    #define GGML_F16x8              float16x8_t
    #define GGML_F16x8_ZERO         vdupq_n_f16(0.0f)
    #define GGML_F16x8_SET1(x)      vdupq_n_f16(x)
    #define GGML_F16x8_LOAD(x)      vld1q_f16((const __fp16 *)(x))
    #define GGML_F16x8_STORE        vst1q_f16
    #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
    #define GGML_F16x8_ADD          vaddq_f16
    #define GGML_F16x8_MUL          vmulq_f16
    #define GGML_F16x8_REDUCE(res, x)                               \
    do {                                                            \
        int offset = GGML_F16_ARR >> 1;                             \
        for (int i = 0; i < offset; ++i) {                          \
            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
        }                                                           \
        offset >>= 1;                                               \
        for (int i = 0; i < offset; ++i) {                          \
            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
        }                                                           \
        offset >>= 1;                                               \
        for (int i = 0; i < offset; ++i) {                          \
            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
        }                                                           \
        const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
        const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
        (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1));         \
    } while (0)

    #define GGML_F16_VEC                GGML_F16x8
    #define GGML_F16_VEC_ZERO           GGML_F16x8_ZERO
    #define GGML_F16_VEC_SET1           GGML_F16x8_SET1
    #define GGML_F16_VEC_LOAD(p, i)     GGML_F16x8_LOAD(p)
    #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
    #define GGML_F16_VEC_FMA            GGML_F16x8_FMA
    #define GGML_F16_VEC_ADD            GGML_F16x8_ADD
    #define GGML_F16_VEC_MUL            GGML_F16x8_MUL
    #define GGML_F16_VEC_REDUCE         GGML_F16x8_REDUCE
#else
    // if FP16 vector arithmetic is not supported, we use FP32 instead
    // and take advantage of the vcvt_ functions to convert to/from FP16

    #define GGML_F16_STEP 16
    #define GGML_F16_EPR  4

    #define GGML_F32Cx4              float32x4_t
    #define GGML_F32Cx4_ZERO         vdupq_n_f32(0.0f)
    #define GGML_F32Cx4_SET1(x)      vdupq_n_f32(x)
    #define GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
    #define GGML_F32Cx4_STORE(x, y)  vst1_f16(x, vcvt_f16_f32(y))
    #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
    #define GGML_F32Cx4_ADD          vaddq_f32
    #define GGML_F32Cx4_MUL          vmulq_f32
    #define GGML_F32Cx4_REDUCE       GGML_F32x4_REDUCE

    #define GGML_F16_VEC                GGML_F32Cx4
    #define GGML_F16_VEC_ZERO           GGML_F32Cx4_ZERO
    #define GGML_F16_VEC_SET1           GGML_F32Cx4_SET1
    #define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx4_LOAD(p)
    #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
    #define GGML_F16_VEC_FMA            GGML_F32Cx4_FMA
    #define GGML_F16_VEC_ADD            GGML_F32Cx4_ADD
    #define GGML_F16_VEC_MUL            GGML_F32Cx4_MUL
    #define GGML_F16_VEC_REDUCE         GGML_F32Cx4_REDUCE
#endif

#elif defined(__AVX512F__)

#define GGML_SIMD

// F32 AVX512

#define GGML_F32_STEP 64
#define GGML_F32_EPR  16

#define GGML_F32x16         __m512
#define GGML_F32x16_ZERO    _mm512_setzero_ps()
#define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
#define GGML_F32x16_LOAD    _mm512_loadu_ps
#define GGML_F32x16_STORE   _mm512_storeu_ps
// _mm512_fmadd_ps is defined in AVX512F so no guard is required
#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
#define GGML_F32x16_ADD     _mm512_add_ps
#define GGML_F32x16_MUL     _mm512_mul_ps
#define GGML_F32x16_REDUCE(res, x)                                    \
do {                                                                  \
    int offset = GGML_F32_ARR >> 1;                                   \
    for (int i = 0; i < offset; ++i) {                                \
        x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \
    }                                                                 \
    offset >>= 1;                                                     \
    for (int i = 0; i < offset; ++i) {                                \
        x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \
    }                                                                 \
    offset >>= 1;                                                     \
    for (int i = 0; i < offset; ++i) {                                \
        x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \
    }                                                                 \
    res = (ggml_float) _mm512_reduce_add_ps(x[0]);                    \
} while (0)

// TODO: is this optimal ?

#define GGML_F32_VEC        GGML_F32x16
#define GGML_F32_VEC_ZERO   GGML_F32x16_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x16_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x16_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x16_STORE
#define GGML_F32_VEC_FMA    GGML_F32x16_FMA
#define GGML_F32_VEC_ADD    GGML_F32x16_ADD
#define GGML_F32_VEC_MUL    GGML_F32x16_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE

// F16 AVX512

#define GGML_F16_STEP 64
#define GGML_F16_EPR  16

// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead

#define GGML_F32Cx16             __m512
#define GGML_F32Cx16_ZERO        _mm512_setzero_ps()
#define GGML_F32Cx16_SET1(x)     _mm512_set1_ps(x)

// unlike the _mm256_cvt intrinsics, which require F16C, _mm512_cvt is defined in AVX512F,
// so no F16C guard is required
#define GGML_F32Cx16_LOAD(x)     _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x)))
#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))

#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
#define GGML_F32Cx16_ADD         _mm512_add_ps
#define GGML_F32Cx16_MUL         _mm512_mul_ps
#define GGML_F32Cx16_REDUCE(res, x)                               \
do {                                                              \
    int offset = GGML_F32_ARR >> 1;                               \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
    }                                                             \
    res = (ggml_float) _mm512_reduce_add_ps(x[0]);                \
} while (0)

#define GGML_F16_VEC                GGML_F32Cx16
#define GGML_F16_VEC_ZERO           GGML_F32Cx16_ZERO
#define GGML_F16_VEC_SET1           GGML_F32Cx16_SET1
#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx16_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
#define GGML_F16_VEC_FMA            GGML_F32Cx16_FMA
#define GGML_F16_VEC_ADD            GGML_F32Cx16_ADD
#define GGML_F16_VEC_MUL            GGML_F32Cx16_MUL
#define GGML_F16_VEC_REDUCE         GGML_F32Cx16_REDUCE

#elif defined(__AVX__)

#define GGML_SIMD

// F32 AVX

#define GGML_F32_STEP 32
#define GGML_F32_EPR  8

#define GGML_F32x8         __m256
#define GGML_F32x8_ZERO    _mm256_setzero_ps()
#define GGML_F32x8_SET1(x) _mm256_set1_ps(x)
#define GGML_F32x8_LOAD    _mm256_loadu_ps
#define GGML_F32x8_STORE   _mm256_storeu_ps
#if defined(__FMA__)
    #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)
#else
    #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)
#endif
#define GGML_F32x8_ADD     _mm256_add_ps
#define GGML_F32x8_MUL     _mm256_mul_ps
#define GGML_F32x8_REDUCE(res, x)                                 \
do {                                                              \
    int offset = GGML_F32_ARR >> 1;                               \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
    }                                                             \
    const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]),    \
                                 _mm256_extractf128_ps(x[0], 1)); \
    const __m128 t1 = _mm_hadd_ps(t0, t0);                        \
    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1));        \
} while (0)
// TODO: is this optimal ?

#define GGML_F32_VEC        GGML_F32x8
#define GGML_F32_VEC_ZERO   GGML_F32x8_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x8_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x8_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x8_STORE
#define GGML_F32_VEC_FMA    GGML_F32x8_FMA
#define GGML_F32_VEC_ADD    GGML_F32x8_ADD
#define GGML_F32_VEC_MUL    GGML_F32x8_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE

// F16 AVX

#define GGML_F16_STEP 32
#define GGML_F16_EPR  8

// F16 arithmetic is not supported by AVX, so we use F32 instead

#define GGML_F32Cx8             __m256
#define GGML_F32Cx8_ZERO        _mm256_setzero_ps()
#define GGML_F32Cx8_SET1(x)     _mm256_set1_ps(x)

#if defined(__F16C__)
// the _mm256_cvt intrinsics require F16C
#define GGML_F32Cx8_LOAD(x)     _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
#else
static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
    float tmp[8];

    for (int i = 0; i < 8; i++) {
        tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
    }

    return _mm256_loadu_ps(tmp);
}
static inline void __avx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
    float arr[8];

    _mm256_storeu_ps(arr, y);

    for (int i = 0; i < 8; i++) {
        x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
    }
}
#define GGML_F32Cx8_LOAD(x)     __avx_f32cx8_load(x)
#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
#endif

#define GGML_F32Cx8_FMA         GGML_F32x8_FMA
#define GGML_F32Cx8_ADD         _mm256_add_ps
#define GGML_F32Cx8_MUL         _mm256_mul_ps
#define GGML_F32Cx8_REDUCE      GGML_F32x8_REDUCE

#define GGML_F16_VEC                GGML_F32Cx8
#define GGML_F16_VEC_ZERO           GGML_F32Cx8_ZERO
#define GGML_F16_VEC_SET1           GGML_F32Cx8_SET1
#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx8_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
#define GGML_F16_VEC_FMA            GGML_F32Cx8_FMA
#define GGML_F16_VEC_ADD            GGML_F32Cx8_ADD
#define GGML_F16_VEC_MUL            GGML_F32Cx8_MUL
#define GGML_F16_VEC_REDUCE         GGML_F32Cx8_REDUCE

#elif defined(__POWER9_VECTOR__)

#define GGML_SIMD

// F32 POWER9

#define GGML_F32_STEP 32
#define GGML_F32_EPR  4

#define GGML_F32x4              vector float
#define GGML_F32x4_ZERO         {0.0f}
#define GGML_F32x4_SET1         vec_splats
#define GGML_F32x4_LOAD(p)      vec_xl(0, p)
#define GGML_F32x4_STORE(p, r)  vec_xst(r, 0, p)
#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
#define GGML_F32x4_ADD          vec_add
#define GGML_F32x4_MUL          vec_mul
#define GGML_F32x4_REDUCE(res, x)              \
{                                              \
    int offset = GGML_F32_ARR >> 1;            \
    for (int i = 0; i < offset; ++i) {         \
        x[i] = vec_add(x[i], x[offset+i]);     \
    }                                          \
    offset >>= 1;                              \
    for (int i = 0; i < offset; ++i) {         \
        x[i] = vec_add(x[i], x[offset+i]);     \
    }                                          \
    offset >>= 1;                              \
    for (int i = 0; i < offset; ++i) {         \
        x[i] = vec_add(x[i], x[offset+i]);     \
    }                                          \
    res = vec_extract(x[0], 0) +               \
          vec_extract(x[0], 1) +               \
          vec_extract(x[0], 2) +               \
          vec_extract(x[0], 3);                \
}
#define GGML_F32x4_REDUCE_4(res, s0, s1, s2, s3)        \
{                                                       \
    vector float v = vec_add(vec_add(s0, s1),           \
                             vec_add(s2, s3));          \
    v = vec_add(v, vec_sld(v, v, 8));                   \
    v = vec_add(v, vec_sld(v, v, 4));                   \
    res += (ggml_float) vec_extract(v, 0);              \
}

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 POWER9
#define GGML_F16_STEP       GGML_F32_STEP
#define GGML_F16_EPR        GGML_F32_EPR
#define GGML_F16_VEC        GGML_F32x4
#define GGML_F16_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F16_VEC_SET1   GGML_F32x4_SET1
#define GGML_F16_VEC_FMA    GGML_F32x4_FMA
#define GGML_F16_VEC_ADD    GGML_F32x4_ADD
#define GGML_F16_VEC_MUL    GGML_F32x4_MUL
#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
// Use vec_xl, not vec_ld, in case the load address is not aligned.
#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ?                   \
  vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \
  vec_extract_fp32_from_shortl(vec_xl(0, p))
static inline unsigned char ggml_endian_byte(int i) {
    uint16_t tmp_val = 1;
    return ((unsigned char *)&tmp_val)[i];
}
#define GGML_ENDIAN_BYTE(i) ggml_endian_byte(i)
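// GGML_ENDIAN_BYTE(1) is 0 on little-endian and 1 on big-endian targets (and
// vice versa for GGML_ENDIAN_BYTE(0)), so the store below passes the two f32
// register halves to vec_pack_to_short_fp32 in the correct order on either
// endianness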
#define GGML_F16_VEC_STORE(p, r, i)                             \
  if (i & 0x1)                                                  \
    vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)],  \
                                   r[i - GGML_ENDIAN_BYTE(0)]), \
            0, p - GGML_F16_EPR)

// BF16 POWER9
#define GGML_BF16_STEP 16
#define GGML_BF16_EPR  8

#define GGML_BF16x8         vector unsigned short
#define GGML_BF16x8_ZERO    vec_splats((unsigned short)0)
#define GGML_BF16x8_LOAD(p) vec_xl(0, (const unsigned short *)(p))

#define GGML_BF16_VEC          GGML_BF16x8
#define GGML_BF16_VEC_ZERO     GGML_BF16x8_ZERO
#define GGML_BF16_VEC_LOAD     GGML_BF16x8_LOAD
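// a bf16 value is the upper 16 bits of the corresponding f32 bit pattern, so
// merging the bf16 shorts with zero shorts (in the endian-appropriate order
// below) reconstructs full f32 values without any arithmetic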
#if defined(__LITTLE_ENDIAN__)
#define GGML_BF16_TO_F32_LO(v) ((vector float) vec_mergel(GGML_BF16_VEC_ZERO, (v)))
#define GGML_BF16_TO_F32_HI(v) ((vector float) vec_mergeh(GGML_BF16_VEC_ZERO, (v)))
#else
#define GGML_BF16_TO_F32_LO(v) ((vector float) vec_mergel((v), GGML_BF16_VEC_ZERO))
#define GGML_BF16_TO_F32_HI(v) ((vector float) vec_mergeh((v), GGML_BF16_VEC_ZERO))
#endif
#define GGML_BF16_FMA_LO(acc, x, y) \
    (acc) = GGML_F32x4_FMA((acc), GGML_BF16_TO_F32_LO(x), GGML_BF16_TO_F32_LO(y))
#define GGML_BF16_FMA_HI(acc, x, y) \
    (acc) = GGML_F32x4_FMA((acc), GGML_BF16_TO_F32_HI(x), GGML_BF16_TO_F32_HI(y))

#elif defined(__wasm_simd128__)

#define GGML_SIMD

// F32 WASM

#define GGML_F32_STEP 16
#define GGML_F32_EPR  4

#define GGML_F32x4              v128_t
#define GGML_F32x4_ZERO         wasm_f32x4_splat(0.0f)
#define GGML_F32x4_SET1(x)      wasm_f32x4_splat(x)
#define GGML_F32x4_LOAD         wasm_v128_load
#define GGML_F32x4_STORE        wasm_v128_store
#define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
#define GGML_F32x4_ADD          wasm_f32x4_add
#define GGML_F32x4_MUL          wasm_f32x4_mul
#define GGML_F32x4_REDUCE(res, x)                  \
{                                                  \
    int offset = GGML_F32_ARR >> 1;                \
    for (int i = 0; i < offset; ++i) {             \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
    }                                              \
    offset >>= 1;                                  \
    for (int i = 0; i < offset; ++i) {             \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
    }                                              \
    offset >>= 1;                                  \
    for (int i = 0; i < offset; ++i) {             \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
    }                                              \
    res = wasm_f32x4_extract_lane(x[0], 0) +       \
          wasm_f32x4_extract_lane(x[0], 1) +       \
          wasm_f32x4_extract_lane(x[0], 2) +       \
          wasm_f32x4_extract_lane(x[0], 3);        \
}

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 WASM

#define GGML_F16_STEP 16
#define GGML_F16_EPR  4

inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
    float tmp[4];

    tmp[0] = GGML_CPU_FP16_TO_FP32(p[0]);
    tmp[1] = GGML_CPU_FP16_TO_FP32(p[1]);
    tmp[2] = GGML_CPU_FP16_TO_FP32(p[2]);
    tmp[3] = GGML_CPU_FP16_TO_FP32(p[3]);

    return wasm_v128_load(tmp);
}

inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
    float tmp[4];

    wasm_v128_store(tmp, x);

    p[0] = GGML_CPU_FP32_TO_FP16(tmp[0]);
    p[1] = GGML_CPU_FP32_TO_FP16(tmp[1]);
    p[2] = GGML_CPU_FP32_TO_FP16(tmp[2]);
    p[3] = GGML_CPU_FP32_TO_FP16(tmp[3]);
}

#define GGML_F16x4             v128_t
#define GGML_F16x4_ZERO        wasm_f32x4_splat(0.0f)
#define GGML_F16x4_SET1(x)     wasm_f32x4_splat(x)
#define GGML_F16x4_LOAD(x)     __wasm_f16x4_load(x)
#define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y)
#define GGML_F16x4_FMA         GGML_F32x4_FMA
#define GGML_F16x4_ADD         wasm_f32x4_add
#define GGML_F16x4_MUL         wasm_f32x4_mul
#define GGML_F16x4_REDUCE(res, x)                           \
{                                                           \
    int offset = GGML_F16_ARR >> 1;                         \
    for (int i = 0; i < offset; ++i) {                      \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]);           \
    }                                                       \
    offset >>= 1;                                           \
    for (int i = 0; i < offset; ++i) {                      \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]);           \
    }                                                       \
    offset >>= 1;                                           \
    for (int i = 0; i < offset; ++i) {                      \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]);           \
    }                                                       \
    res = (ggml_float) (wasm_f32x4_extract_lane(x[0], 0) +  \
          wasm_f32x4_extract_lane(x[0], 1) +                \
          wasm_f32x4_extract_lane(x[0], 2) +                \
          wasm_f32x4_extract_lane(x[0], 3));                \
}

#define GGML_F16_VEC                GGML_F16x4
#define GGML_F16_VEC_ZERO           GGML_F16x4_ZERO
#define GGML_F16_VEC_SET1           GGML_F16x4_SET1
#define GGML_F16_VEC_LOAD(p, i)     GGML_F16x4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i])
#define GGML_F16_VEC_FMA            GGML_F16x4_FMA
#define GGML_F16_VEC_ADD            GGML_F16x4_ADD
#define GGML_F16_VEC_MUL            GGML_F16x4_MUL
#define GGML_F16_VEC_REDUCE         GGML_F16x4_REDUCE

#elif defined(__SSE3__)

#define GGML_SIMD

// F32 SSE

#define GGML_F32_STEP 32
#define GGML_F32_EPR  4

#define GGML_F32x4         __m128
#define GGML_F32x4_ZERO    _mm_setzero_ps()
#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
#define GGML_F32x4_LOAD    _mm_loadu_ps
#define GGML_F32x4_STORE   _mm_storeu_ps
#if defined(__FMA__)
    // TODO: Does this work?
    #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
#else
    #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
#endif
#define GGML_F32x4_ADD     _mm_add_ps
#define GGML_F32x4_MUL     _mm_mul_ps
#define GGML_F32x4_REDUCE(res, x)                                 \
{                                                                 \
    int offset = GGML_F32_ARR >> 1;                               \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
    }                                                             \
    const __m128 t0 = _mm_hadd_ps(x[0], x[0]);                    \
    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0));        \
}
// TODO: is this optimal ?

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 SSE

#define GGML_F16_STEP 32
#define GGML_F16_EPR  4

static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
    float tmp[4];

    tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]);
    tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]);
    tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
    tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);

    return _mm_loadu_ps(tmp);
}

static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
    float arr[4];

    _mm_storeu_ps(arr, y);

    x[0] = GGML_CPU_FP32_TO_FP16(arr[0]);
    x[1] = GGML_CPU_FP32_TO_FP16(arr[1]);
    x[2] = GGML_CPU_FP32_TO_FP16(arr[2]);
    x[3] = GGML_CPU_FP32_TO_FP16(arr[3]);
}

#define GGML_F32Cx4             __m128
#define GGML_F32Cx4_ZERO        _mm_setzero_ps()
#define GGML_F32Cx4_SET1(x)     _mm_set1_ps(x)
#define GGML_F32Cx4_LOAD(x)     __sse_f16x4_load(x)
#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
#define GGML_F32Cx4_ADD         _mm_add_ps
#define GGML_F32Cx4_MUL         _mm_mul_ps
#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE

#define GGML_F16_VEC                 GGML_F32Cx4
#define GGML_F16_VEC_ZERO            GGML_F32Cx4_ZERO
#define GGML_F16_VEC_SET1            GGML_F32Cx4_SET1
#define GGML_F16_VEC_LOAD(p, i)      GGML_F32Cx4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i)  GGML_F32Cx4_STORE(p, r[i])
#define GGML_F16_VEC_FMA             GGML_F32Cx4_FMA
#define GGML_F16_VEC_ADD             GGML_F32Cx4_ADD
#define GGML_F16_VEC_MUL             GGML_F32Cx4_MUL
#define GGML_F16_VEC_REDUCE          GGML_F32Cx4_REDUCE

#elif defined(__loongarch_asx)

#define GGML_SIMD

// F32 LASX
#define GGML_F32_STEP 32
#define GGML_F32_EPR  8

#define GGML_F32x8         __m256
#define GGML_F32x8_ZERO    (__m256)__lasx_xvldi(0)
#define GGML_F32x8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
#define GGML_F32x8_LOAD(x) (__m256)__lasx_xvld((x), 0)
#define GGML_F32x8_STORE(x, y)  __lasx_xvst((y), (x), 0)
#define GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a)
#define GGML_F32x8_ADD     __lasx_xvfadd_s
#define GGML_F32x8_MUL     __lasx_xvfmul_s
#define GGML_F32x8_REDUCE(res, x)                                 \
do {                                                              \
    int offset = GGML_F32_ARR >> 1;                               \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                \
    }                                                             \
    offset >>= 1;                                                 \
    for (int i = 0; i < offset; ++i) {                            \
        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                \
    }                                                             \
    float * tmp_p = (float *)&x[0];                               \
    res = tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] + tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7];  \
} while (0)
// TODO: is this optimal ?

#define GGML_F32_VEC        GGML_F32x8
#define GGML_F32_VEC_ZERO   GGML_F32x8_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x8_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x8_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x8_STORE
#define GGML_F32_VEC_FMA    GGML_F32x8_FMA
#define GGML_F32_VEC_ADD    GGML_F32x8_ADD
#define GGML_F32_VEC_MUL    GGML_F32x8_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE

// F16 LASX

#define GGML_F16_STEP 32
#define GGML_F16_EPR  8

// F16 arithmetic is not supported by LASX, so we use F32 instead

#define GGML_F32Cx8         __m256
#define GGML_F32Cx8_ZERO    (__m256)__lasx_xvldi(0)
#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))

static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
    __m256i a;
    memcpy(&a, x, sizeof(ggml_fp16_t) * 8);
    a = __lasx_xvpermi_d(a, 0 | (1 << 4));
    return __lasx_xvfcvtl_s_h(a);
}

static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
    __m256i a = __lasx_xvfcvt_h_s(y, y);
    a = __lasx_xvpermi_d(a, 0 | (2 << 2));
    memcpy(x, &a, sizeof(ggml_fp16_t) * 8);
}
#define GGML_F32Cx8_LOAD(x)     __lasx_f32cx8_load(x)
#define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)

#define GGML_F32Cx8_FMA         GGML_F32x8_FMA
#define GGML_F32Cx8_ADD         __lasx_xvfadd_s
#define GGML_F32Cx8_MUL         __lasx_xvfmul_s
#define GGML_F32Cx8_REDUCE      GGML_F32x8_REDUCE

#define GGML_F16_VEC                GGML_F32Cx8
#define GGML_F16_VEC_ZERO           GGML_F32Cx8_ZERO
#define GGML_F16_VEC_SET1           GGML_F32Cx8_SET1
#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx8_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
#define GGML_F16_VEC_FMA            GGML_F32Cx8_FMA
#define GGML_F16_VEC_ADD            GGML_F32Cx8_ADD
#define GGML_F16_VEC_MUL            GGML_F32Cx8_MUL
#define GGML_F16_VEC_REDUCE         GGML_F32Cx8_REDUCE

#elif defined(__loongarch_sx)

#define GGML_SIMD

// F32 LSX

#define GGML_F32_STEP 32
#define GGML_F32_EPR  4

#define GGML_F32x4         __m128
#define GGML_F32x4_ZERO    (__m128)__lsx_vldi(0)
#define GGML_F32x4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x))
#define GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0)
#define GGML_F32x4_STORE(x, y)  __lsx_vst(y, x, 0)
#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
#define GGML_F32x4_ADD     __lsx_vfadd_s
#define GGML_F32x4_MUL     __lsx_vfmul_s

#define GGML_F32x4_REDUCE(res, x)                               \
{                                                               \
    int offset = GGML_F32_ARR >> 1;                             \
    for (int i = 0; i < offset; ++i) {                          \
        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                \
    }                                                           \
    offset >>= 1;                                               \
    for (int i = 0; i < offset; ++i) {                          \
        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                \
    }                                                           \
    offset >>= 1;                                               \
    for (int i = 0; i < offset; ++i) {                          \
        x[i] = __lsx_vfadd_s(x[i], x[offset+i]);                \
    }                                                           \
    __m128i t0 = __lsx_vpickev_w((__m128i)x[0], (__m128i)x[0]); \
    __m128i t1 = __lsx_vpickod_w((__m128i)x[0], (__m128i)x[0]); \
    __m128 t2 = __lsx_vfadd_s((__m128)t0, (__m128)t1);          \
    __m128i t3 = __lsx_vpickev_w((__m128i)t2, (__m128i)t2);     \
    __m128i t4 = __lsx_vpickod_w((__m128i)t2, (__m128i)t2);     \
    __m128 t5 = __lsx_vfadd_s((__m128)t3, (__m128)t4);          \
    res = (ggml_float) ((v4f32)t5)[0];                          \
}

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 LSX

#define GGML_F16_STEP 32
#define GGML_F16_EPR  4

static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
    float tmp[4];

    tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]);
    tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]);
    tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
    tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);

    return (__m128)__lsx_vld(tmp, 0);
}

static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
    float arr[4];

    __lsx_vst(y, arr, 0);

    x[0] = GGML_CPU_FP32_TO_FP16(arr[0]);
    x[1] = GGML_CPU_FP32_TO_FP16(arr[1]);
    x[2] = GGML_CPU_FP32_TO_FP16(arr[2]);
    x[3] = GGML_CPU_FP32_TO_FP16(arr[3]);
}

#define GGML_F32Cx4             __m128
#define GGML_F32Cx4_ZERO        (__m128)__lsx_vldi(0)
#define GGML_F32Cx4_SET1(x)     (__m128)__lsx_vreplfr2vr_s((x))
#define GGML_F32Cx4_LOAD(x)     (__m128)__lsx_f16x4_load(x)
#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
#define GGML_F32Cx4_ADD         __lsx_vfadd_s
#define GGML_F32Cx4_MUL         __lsx_vfmul_s
#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE

#define GGML_F16_VEC                 GGML_F32Cx4
#define GGML_F16_VEC_ZERO            GGML_F32Cx4_ZERO
#define GGML_F16_VEC_SET1            GGML_F32Cx4_SET1
#define GGML_F16_VEC_LOAD(p, i)      GGML_F32Cx4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i)  GGML_F32Cx4_STORE(p, r[i])
#define GGML_F16_VEC_FMA             GGML_F32Cx4_FMA
#define GGML_F16_VEC_ADD             GGML_F32Cx4_ADD
#define GGML_F16_VEC_MUL             GGML_F32Cx4_MUL
#define GGML_F16_VEC_REDUCE          GGML_F32Cx4_REDUCE

#elif defined(__VXE__) || defined(__VXE2__)

#define GGML_SIMD

// F32 s390x

#define GGML_F32_STEP 32
#define GGML_F32_EPR  4

#define GGML_F32x4              float32x4_t
#define GGML_F32x4_ZERO         vec_splats(0.0f)
#define GGML_F32x4_SET1         vec_splats
#define GGML_F32x4_LOAD(p)      vec_xl(0, p)
#define GGML_F32x4_STORE(p, r)  vec_xst(r, 0, p)
#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
#define GGML_F32x4_ADD          vec_add
#define GGML_F32x4_MUL          vec_mul
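// in the reduce below, adding x[0] to its lane-reversed copy (vec_reve) leaves
// every lane holding the sum of a mirrored lane pair, so summing the first two
// lanes yields the full horizontal sum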
#define GGML_F32x4_REDUCE(res, x)                   \
{                                                   \
    int offset = GGML_F32_ARR >> 1;                 \
    for (int i = 0; i < offset; ++i) {              \
        x[i] = vec_add(x[i], x[offset + i]);        \
    }                                               \
    offset >>= 1;                                   \
    for (int i = 0; i < offset; ++i) {              \
        x[i] = vec_add(x[i], x[offset + i]);        \
    }                                               \
    offset >>= 1;                                   \
    for (int i = 0; i < offset; ++i) {              \
        x[i] = vec_add(x[i], x[offset + i]);        \
    }                                               \
    float32x4_t tmp = x[0] + vec_reve(x[0]);        \
    res = tmp[0] + tmp[1];                          \
}

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 s390x
#define GGML_F16_STEP GGML_F32_STEP
#define GGML_F16_EPR  GGML_F32_EPR

static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
    float tmp[4];

    for (int i = 0; i < 4; i++) {
        tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
    }

    // note: keep type-cast here to prevent compiler bugs
    // see: https://github.com/ggml-org/llama.cpp/issues/12846
    return vec_xl(0, (const float *)(tmp));
}

static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
    float arr[4];

    // note: keep type-cast here to prevent compiler bugs
    // see: https://github.com/ggml-org/llama.cpp/issues/12846
    vec_xst(v_y, 0, (float *)(arr));

    for (int i = 0; i < 4; i++) {
        x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
    }
}

#define GGML_F16_VEC                GGML_F32x4
#define GGML_F16_VEC_ZERO           GGML_F32x4_ZERO
#define GGML_F16_VEC_SET1           GGML_F32x4_SET1
#define GGML_F16_VEC_LOAD(p, i)     __lzs_f16cx4_load(p)
#define GGML_F16_VEC_STORE(p, r, i) __lzs_f16cx4_store(p, r[i])
#define GGML_F16_VEC_FMA            GGML_F32x4_FMA
#define GGML_F16_VEC_ADD            GGML_F32x4_ADD
#define GGML_F16_VEC_MUL            GGML_F32x4_MUL
#define GGML_F16_VEC_REDUCE         GGML_F32x4_REDUCE

#elif defined(__riscv_v_intrinsic)

// compatible with vlen >= 128

#define GGML_SIMD

// F32

#define GGML_F32_STEP 16
#define GGML_F32_EPR  4

#define GGML_F32x4              vfloat32m1_t
#define GGML_F32x4_ZERO         __riscv_vfmv_v_f_f32m1(0.0f, GGML_F32_EPR)
#define GGML_F32x4_SET1(x)      __riscv_vfmv_v_f_f32m1(x, GGML_F32_EPR)
#define GGML_F32x4_LOAD(x)      __riscv_vle32_v_f32m1(x, GGML_F32_EPR)
#define GGML_F32x4_STORE(b, v)  __riscv_vse32_v_f32m1(b, v, GGML_F32_EPR)
#define GGML_F32x4_FMA(a, b, c) __riscv_vfmacc_vv_f32m1(a, b, c, GGML_F32_EPR)
#define GGML_F32x4_ADD(a, b)    __riscv_vfadd_vv_f32m1(a, b, GGML_F32_EPR)
#define GGML_F32x4_MUL(a, b)    __riscv_vfmul_vv_f32m1(a, b, GGML_F32_EPR)
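
// note: GGML_F32_VEC_REDUCE below expects a GGML_F32x4_REDUCE macro; a minimal
// sketch built on the RVV reduction intrinsics (an assumption for illustration,
// not necessarily the upstream definition) could look like:
//
//   #define GGML_F32x4_REDUCE(res, x)                                                  \
//   {                                                                                  \
//       /* fold the GGML_F32_ARR (= 4) accumulators, then reduce to a scalar */        \
//       vfloat32m1_t sum  = GGML_F32x4_ADD(GGML_F32x4_ADD(x[0], x[1]),                 \
//                                          GGML_F32x4_ADD(x[2], x[3]));                \
//       vfloat32m1_t zero = __riscv_vfmv_v_f_f32m1(0.0f, GGML_F32_EPR);                \
//       vfloat32m1_t red  = __riscv_vfredusum_vs_f32m1_f32m1(sum, zero, GGML_F32_EPR); \
//       (res) = (ggml_float) __riscv_vfmv_f_s_f32m1_f32(red);                          \
//   }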

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

#endif

// GGML_F32_ARR / GGML_F16_ARR
//   number of registers to use per step
#ifdef GGML_SIMD
#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
#endif
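
// as an illustration of how the mappings compose, the generic F32 kernels are
// written against these macros roughly as follows (a sketch of the pattern,
// not the exact upstream code; it assumes the common array-based REDUCE
// signature, i.e. not the SVE variant above):
//
//   void vec_dot_f32_sketch(int n, float * s, const float * x, const float * y) {
//       ggml_float sumf = 0.0;
//       const int np = (n & ~(GGML_F32_STEP - 1)); // largest multiple of GGML_F32_STEP
//
//       GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
//       GGML_F32_VEC ax[GGML_F32_ARR];
//       GGML_F32_VEC ay[GGML_F32_ARR];
//
//       for (int i = 0; i < np; i += GGML_F32_STEP) {
//           for (int j = 0; j < GGML_F32_ARR; j++) {
//               ax[j]  = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
//               ay[j]  = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
//               sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
//           }
//       }
//
//       GGML_F32_VEC_REDUCE(sumf, sum); // horizontal sum of all accumulators
//
//       for (int i = np; i < n; ++i) {  // scalar leftovers
//           sumf += (ggml_float)(x[i]*y[i]);
//       }
//
//       *s = sumf;
//   }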

#ifdef __cplusplus
}
#endif