path: root/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h
author     Mitja Felicijan <mitja.felicijan@gmail.com>  2026-02-12 20:57:17 +0100
committer  Mitja Felicijan <mitja.felicijan@gmail.com>  2026-02-12 20:57:17 +0100
commit     b333b06772c89d96aacb5490d6a219fba7c09cc6 (patch)
tree       211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/ggml/src/ggml-cpu/simd-mappings.h
download   llmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz
Engage!
Diffstat (limited to 'llama.cpp/ggml/src/ggml-cpu/simd-mappings.h')
-rw-r--r--  llama.cpp/ggml/src/ggml-cpu/simd-mappings.h  1253
1 file changed, 1253 insertions, 0 deletions
diff --git a/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h b/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h
new file mode 100644
index 0000000..630e506
--- /dev/null
+++ b/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h
@@ -0,0 +1,1253 @@
+#pragma once
+
+#include "ggml-cpu-impl.h"
+
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif // __ARM_FEATURE_SVE
+
+#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
+// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
+//
+// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
+//
+#include <arm_neon.h>
+#endif
+
+#if defined(__riscv_v_intrinsic)
+#include <riscv_vector.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//
+// simd mappings
+//
+
+// FP16 to FP32 conversion
+
+// 16-bit float
+// on Arm, we use __fp16
+// on x86, we use uint16_t
+//
+// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616
+// for MUSA compilers, we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
+//
+#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
+ #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) neon_compute_fp16_to_fp32(x)
+ #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) neon_compute_fp32_to_fp16(x)
+
+ #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
+
+ static inline float neon_compute_fp16_to_fp32(ggml_fp16_t h) {
+ __fp16 tmp;
+ memcpy(&tmp, &h, sizeof(ggml_fp16_t));
+ return (float)tmp;
+ }
+
+ static inline ggml_fp16_t neon_compute_fp32_to_fp16(float f) {
+ ggml_fp16_t res;
+ __fp16 tmp = f;
+ memcpy(&res, &tmp, sizeof(ggml_fp16_t));
+ return res;
+ }
+#elif defined(__F16C__)
+ #ifdef _MSC_VER
+ #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
+ #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
+ #else
+ #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
+ #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+ #endif
+#elif defined(__POWER9_VECTOR__)
+ #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) power_compute_fp16_to_fp32(x)
+ #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) power_compute_fp32_to_fp16(x)
+ /* the inline asm below is about 12% faster than the lookup method */
+ #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
+ #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
+
+ static inline float power_compute_fp16_to_fp32(ggml_fp16_t h) {
+ float f;
+ double d;
+ __asm__(
+ "mtfprd %0,%2\n"
+ "xscvhpdp %0,%0\n"
+ "frsp %1,%0\n" :
+ /* temp */ "=d"(d),
+ /* out */ "=f"(f):
+ /* in */ "r"(h));
+ return f;
+ }
+
+ static inline ggml_fp16_t power_compute_fp32_to_fp16(float f) {
+ double d;
+ ggml_fp16_t r;
+ __asm__( /* xscvdphp can work on double or single precision */
+ "xscvdphp %0,%2\n"
+ "mffprd %1,%0\n" :
+ /* temp */ "=d"(d),
+ /* out */ "=r"(r):
+ /* in */ "f"(f));
+ return r;
+ }
+#elif defined(__riscv) && defined(__riscv_zfhmin)
+ static inline float riscv_compute_fp16_to_fp32(ggml_fp16_t h) {
+ _Float16 hf;
+ memcpy(&hf, &h, sizeof(ggml_fp16_t));
+ return hf;
+ }
+
+ static inline ggml_fp16_t riscv_compute_fp32_to_fp16(float f) {
+ ggml_fp16_t res;
+ _Float16 hf = (_Float16)f;
+ memcpy(&res, &hf, sizeof(ggml_fp16_t));
+ return res;
+ }
+
+ #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) riscv_compute_fp16_to_fp32(x)
+ #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
+ #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
+ #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
+#endif
+
+// precomputed f32 table for f16 (256 KB)
+// defined in ggml-cpu.c, initialized in ggml_cpu_init()
+extern float ggml_table_f32_f16[1 << 16];
+
+// precomputed f32 table for e8m0 half (1 KB)
+// defined in ggml-cpu.c, initialized in ggml_cpu_init()
+extern float ggml_table_f32_e8m0_half[1 << 8];
+
+// Use lookup table for E8M0 on x86 (faster than bit manipulation)
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+#define GGML_CPU_E8M0_TO_FP32_HALF(x) ggml_table_f32_e8m0_half[(uint8_t)(x)]
+#else
+#define GGML_CPU_E8M0_TO_FP32_HALF(x) GGML_E8M0_TO_FP32_HALF(x)
+#endif
+
+// On ARM NEON, it's quicker to perform the FP16 <-> FP32 conversion directly instead of going through ggml_lookup_fp16_to_fp32,
+// so we define GGML_CPU_FP16_TO_FP32 and GGML_CPU_FP32_TO_FP16 elsewhere for NEON.
+// This is also true for POWER9.
+#if !defined(GGML_CPU_FP16_TO_FP32)
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+ uint16_t s;
+ memcpy(&s, &f, sizeof(uint16_t));
+ return ggml_table_f32_f16[s];
+}
+
+#define GGML_CPU_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
+#endif
+
+#if !defined(GGML_CPU_FP32_TO_FP16)
+#define GGML_CPU_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+#endif
+
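+// Illustrative sketch (not part of the original mappings): round-tripping one value
+// through the scalar conversion macros above; `h` is a hypothetical input variable.
+//
+//     ggml_fp16_t h  = ...;                        // 16-bit storage value
+//     float       f  = GGML_CPU_FP16_TO_FP32(h);   // widen (native convert or table lookup)
+//     ggml_fp16_t h2 = GGML_CPU_FP32_TO_FP16(f);   // narrow back to 16 bits
+//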
+
+// we define a common set of C macros which map to specific intrinsics based on the current architecture
+// we then implement the fundamental computation operations below using only these macros
+// adding support for new architectures requires defining the corresponding SIMD macros
+//
+// GGML_F32_STEP / GGML_F16_STEP
+// number of elements to process in a single step
+//
+// GGML_F32_EPR / GGML_F16_EPR
+// number of elements to fit in a single register
+//
+
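+// Illustrative usage sketch (not part of the original file): a dot product written against
+// the generic macros, processing GGML_F32_STEP elements per iteration spread across
+// GGML_F32_ARR accumulators; ggml_vec_dot_f32_sketch is a hypothetical name, and the SVE
+// mapping below uses a different REDUCE signature, so real kernels special-case it.
+//
+//     static inline float ggml_vec_dot_f32_sketch(int n, const float * x, const float * y) {
+//         GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
+//         const int np = (n & ~(GGML_F32_STEP - 1));
+//         for (int i = 0; i < np; i += GGML_F32_STEP) {
+//             for (int j = 0; j < GGML_F32_ARR; j++) {
+//                 GGML_F32_VEC ax = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+//                 GGML_F32_VEC ay = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+//                 sum[j] = GGML_F32_VEC_FMA(sum[j], ax, ay);  // sum[j] += ax*ay
+//             }
+//         }
+//         ggml_float res = 0.0;
+//         GGML_F32_VEC_REDUCE(res, sum);   // horizontal sum of all accumulators
+//         for (int i = np; i < n; ++i) {
+//             res += x[i]*y[i];            // scalar tail
+//         }
+//         return (float) res;
+//     }
+//
+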
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_FMA)
+
+#define GGML_SIMD
+
+// F32 SVE
+#define GGML_F32_EPR 8
+#define DEFAULT_PG svptrue_b32()
+
+#define GGML_F32xt svfloat32_t
+#define GGML_F32xt_ZERO svdup_n_f32(0.0f)
+#define GGML_F32xt_SET1(x) svdup_n_f32(x)
+#define GGML_F32xt_LOAD_IMPL(pg, a) svld1_f32(pg, a)
+#define GGML_F32xt_LOAD(a) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, a)
+#define GGML_F32xt_STORE_IMPL(pg, a, b) svst1_f32(pg, a, b)
+#define GGML_F32xt_STORE(a, b) GGML_F32xt_STORE_IMPL(DEFAULT_PG, a, b)
+#define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, b, c, a)
+#define GGML_F32xt_FMA(a, b, c) GGML_F32xt_FMA_IMPL(DEFAULT_PG, a, b, c)
+#define GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b)
+#define GGML_F32xt_ADD(a, b) GGML_F32xt_ADD_IMPL(DEFAULT_PG, a, b)
+#define GGML_F32xt_MUL_IMPL(pg, a, b) svmul_f32_m(pg, a, b)
+#define GGML_F32xt_MUL(a, b) GGML_F32xt_MUL_IMPL(DEFAULT_PG, a, b)
+#define GGML_F32xt_REDUCE_ONE_IMPL(pg, a) svaddv(pg, a)
+#define GGML_F32xt_REDUCE_ONE(a) GGML_F32xt_REDUCE_ONE_IMPL(DEFAULT_PG, a)
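+// the 8 partial accumulators are summed pairwise down to a single vector, which is then
+// collapsed to a scalar with GGML_F32xt_REDUCE_ONE (svaddv)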
+#define GGML_F32xt_REDUCE_IMPL(pg, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) \
+{ \
+ sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum2); \
+ sum3 = svadd_f32_m(DEFAULT_PG, sum3, sum4); \
+ sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum6); \
+ sum7 = svadd_f32_m(DEFAULT_PG, sum7, sum8); \
+ sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum3); \
+ sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum7); \
+ sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum5); \
+ (res) = (ggml_float) GGML_F32xt_REDUCE_ONE(sum1); \
+}
+#define GGML_F32xt_REDUCE(res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) \
+ GGML_F32xt_REDUCE_IMPL(DEFAULT_PG, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8)
+
+#define GGML_F32_VEC GGML_F32xt
+#define GGML_F32_VEC_ZERO GGML_F32xt_ZERO
+#define GGML_F32_VEC_SET1 GGML_F32xt_SET1
+#define GGML_F32_VEC_LOAD GGML_F32xt_LOAD
+#define GGML_F32_VEC_STORE GGML_F32xt_STORE
+#define GGML_F32_VEC_FMA GGML_F32xt_FMA
+#define GGML_F32_VEC_ADD GGML_F32xt_ADD
+#define GGML_F32_VEC_MUL GGML_F32xt_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32xt_REDUCE
+
+// F16 SVE
+#define DEFAULT_PG32 svptrue_b32()
+#define DEFAULT_PG16 svptrue_b16()
+
+#define GGML_F32Cxt svfloat16_t
+#define GGML_F32Cxt_ZERO svdup_n_f16(0.0f)
+#define GGML_F32Cxt_SET1(x) svdup_n_f16(x)
+#define GGML_F32Cxt_LOAD(p) svld1_f16(DEFAULT_PG16, (const __fp16 *)(p))
+#define GGML_F32Cxt_STORE(dst_ptr, src_vec) svst1_f16(DEFAULT_PG16, (__fp16 *)(dst_ptr), (src_vec))
+
+#define GGML_F32Cxt_FMA_IMPL(pg, a, b, c) svmad_f16_x(pg, b, c, a)
+#define GGML_F32Cxt_FMA(a, b, c) GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, a, b, c)
+#define GGML_F32Cxt_ADD_IMPL(pg, a, b) svadd_f16_x(pg, a, b)
+#define GGML_F32Cxt_ADD(a, b) GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, a, b)
+#define GGML_F32Cxt_MUL_IMPL(pg, a, b) svmul_f16_x(pg, a, b)
+#define GGML_F32Cxt_MUL(a, b) GGML_F32Cxt_MUL_IMPL(DEFAULT_PG16, a, b)
+#define GGML_F32Cxt_REDUCE GGML_F16xt_REDUCE_MIXED
+
+#define GGML_F16x_VEC GGML_F32Cxt
+#define GGML_F16x_VEC_ZERO GGML_F32Cxt_ZERO
+#define GGML_F16x_VEC_SET1 GGML_F32Cxt_SET1
+#define GGML_F16x_VEC_LOAD(p, i) GGML_F32Cxt_LOAD(p)
+#define GGML_F16x_VEC_STORE(p, r, i) GGML_F32Cxt_STORE((__fp16 *)(p), r)
+#define GGML_F16x_VEC_FMA GGML_F32Cxt_FMA
+#define GGML_F16x_VEC_ADD GGML_F32Cxt_ADD
+#define GGML_F16x_VEC_MUL GGML_F32Cxt_MUL
+#define GGML_F16x_VEC_REDUCE GGML_F32Cxt_REDUCE
+
+#define GGML_F16xt_REDUCE_ONE_IMPL(pg, a) svaddv_f16(pg, a)
+#define GGML_F16xt_REDUCE_ONE(a) GGML_F16xt_REDUCE_ONE_IMPL(DEFAULT_PG16, a)
+
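+// sums the four f16 accumulators pairwise, collapses the result with svaddv_f16, and only
+// widens the final scalar to ggml_float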
+#define GGML_F16xt_REDUCE_MIXED_IMPL(pg16, res, sum1, sum2, sum3, sum4) \
+{ \
+ sum1 = svadd_f16_x(pg16, sum1, sum2); \
+ sum3 = svadd_f16_x(pg16, sum3, sum4); \
+ sum1 = svadd_f16_x(pg16, sum1, sum3); \
+ __fp16 sum_f16 = svaddv_f16(pg16, sum1); \
+ (res) = (ggml_float) sum_f16; \
+}
+#define GGML_F16xt_REDUCE_MIXED(res, sum1, sum2, sum3, sum4) \
+ GGML_F16xt_REDUCE_MIXED_IMPL(DEFAULT_PG16, res, sum1, sum2, sum3, sum4)
+
+// F16 NEON
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+ #define GGML_F16_STEP 32
+ #define GGML_F16_EPR 8
+
+ #define GGML_F16x8 float16x8_t
+ #define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
+ #define GGML_F16x8_SET1(x) vdupq_n_f16(x)
+ #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
+ #define GGML_F16x8_STORE vst1q_f16
+ #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
+ #define GGML_F16x8_ADD vaddq_f16
+ #define GGML_F16x8_MUL vmulq_f16
+ #define GGML_F16x8_REDUCE(res, x) \
+ do { \
+ int offset = GGML_F16_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
+ } \
+ const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
+ const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
+ (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
+ } while (0)
+
+ #define GGML_F16_VEC GGML_F16x8
+ #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
+ #define GGML_F16_VEC_SET1 GGML_F16x8_SET1
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
+ #define GGML_F16_VEC_FMA GGML_F16x8_FMA
+ #define GGML_F16_VEC_ADD GGML_F16x8_ADD
+ #define GGML_F16_VEC_MUL GGML_F16x8_MUL
+ #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE
+#else
+ // if FP16 vector arithmetic is not supported, we use FP32 instead
+ // and take advantage of the vcvt_ functions to convert to/from FP16
+
+ #define GGML_F16_STEP 16
+ #define GGML_F16_EPR 4
+
+ #define GGML_F32Cx4 float32x4_t
+ #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
+ #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
+ #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
+ #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
+ #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
+ #define GGML_F32Cx4_ADD vaddq_f32
+ #define GGML_F32Cx4_MUL vmulq_f32
+ #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
+
+ #define GGML_F16_VEC GGML_F32Cx4
+ #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
+ #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
+ #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
+ #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
+ #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
+#endif
+
+#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
+
+#define GGML_SIMD
+
+// F32 NEON
+
+#define GGML_F32_STEP 16
+#define GGML_F32_EPR 4
+
+#define GGML_F32x4 float32x4_t
+#define GGML_F32x4_ZERO vdupq_n_f32(0.0f)
+#define GGML_F32x4_SET1(x) vdupq_n_f32(x)
+#define GGML_F32x4_LOAD vld1q_f32
+#define GGML_F32x4_STORE vst1q_f32
+#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
+#define GGML_F32x4_ADD vaddq_f32
+#define GGML_F32x4_MUL vmulq_f32
+#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
+#define GGML_F32x4_REDUCE(res, x) \
+{ \
+ int offset = GGML_F32_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
+ } \
+ (res) = (ggml_float) GGML_F32x4_REDUCE_ONE((x)[0]); \
+}
+
+#define GGML_F32_VEC GGML_F32x4
+#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 NEON
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+ #define GGML_F16_STEP 32
+ #define GGML_F16_EPR 8
+
+ #define GGML_F16x8 float16x8_t
+ #define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
+ #define GGML_F16x8_SET1(x) vdupq_n_f16(x)
+ #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
+ #define GGML_F16x8_STORE vst1q_f16
+ #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
+ #define GGML_F16x8_ADD vaddq_f16
+ #define GGML_F16x8_MUL vmulq_f16
+ #define GGML_F16x8_REDUCE(res, x) \
+ do { \
+ int offset = GGML_F16_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
+ } \
+ const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
+ const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
+ (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
+ } while (0)
+
+ #define GGML_F16_VEC GGML_F16x8
+ #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
+ #define GGML_F16_VEC_SET1 GGML_F16x8_SET1
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
+ #define GGML_F16_VEC_FMA GGML_F16x8_FMA
+ #define GGML_F16_VEC_ADD GGML_F16x8_ADD
+ #define GGML_F16_VEC_MUL GGML_F16x8_MUL
+ #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE
+#else
+ // if FP16 vector arithmetic is not supported, we use FP32 instead
+ // and take advantage of the vcvt_ functions to convert to/from FP16
+
+ #define GGML_F16_STEP 16
+ #define GGML_F16_EPR 4
+
+ #define GGML_F32Cx4 float32x4_t
+ #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
+ #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
+ #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
+ #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
+ #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
+ #define GGML_F32Cx4_ADD vaddq_f32
+ #define GGML_F32Cx4_MUL vmulq_f32
+ #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
+
+ #define GGML_F16_VEC GGML_F32Cx4
+ #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
+ #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
+ #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
+ #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
+ #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
+#endif
+
+#elif defined(__AVX512F__)
+
+#define GGML_SIMD
+
+// F32 AVX512
+
+#define GGML_F32_STEP 64
+#define GGML_F32_EPR 16
+
+#define GGML_F32x16 __m512
+#define GGML_F32x16_ZERO _mm512_setzero_ps()
+#define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
+#define GGML_F32x16_LOAD _mm512_loadu_ps
+#define GGML_F32x16_STORE _mm512_storeu_ps
+// _mm512_fmadd_ps is defined in AVX512F so no guard is required
+#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+#define GGML_F32x16_ADD _mm512_add_ps
+#define GGML_F32x16_MUL _mm512_mul_ps
+#define GGML_F32x16_REDUCE(res, x) \
+do { \
+ int offset = GGML_F32_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+ } \
+ res = (ggml_float) _mm512_reduce_add_ps(x[0]); \
+} while (0)
+
+// TODO: is this optimal?
+
+#define GGML_F32_VEC GGML_F32x16
+#define GGML_F32_VEC_ZERO GGML_F32x16_ZERO
+#define GGML_F32_VEC_SET1 GGML_F32x16_SET1
+#define GGML_F32_VEC_LOAD GGML_F32x16_LOAD
+#define GGML_F32_VEC_STORE GGML_F32x16_STORE
+#define GGML_F32_VEC_FMA GGML_F32x16_FMA
+#define GGML_F32_VEC_ADD GGML_F32x16_ADD
+#define GGML_F32_VEC_MUL GGML_F32x16_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
+
+// F16 AVX512
+
+#define GGML_F16_STEP 64
+#define GGML_F16_EPR 16
+
+// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
+
+#define GGML_F32Cx16 __m512
+#define GGML_F32Cx16_ZERO _mm512_setzero_ps()
+#define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x)
+
+// unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
+// so F16C guard isn't required
+#define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x)))
+#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
+
+#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+#define GGML_F32Cx16_ADD _mm512_add_ps
+#define GGML_F32Cx16_MUL _mm512_mul_ps
+#define GGML_F32Cx16_REDUCE(res, x) \
+do { \
+ int offset = GGML_F32_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+ } \
+ res = (ggml_float) _mm512_reduce_add_ps(x[0]); \
+} while (0)
+
+#define GGML_F16_VEC GGML_F32Cx16
+#define GGML_F16_VEC_ZERO GGML_F32Cx16_ZERO
+#define GGML_F16_VEC_SET1 GGML_F32Cx16_SET1
+#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx16_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
+#define GGML_F16_VEC_FMA GGML_F32Cx16_FMA
+#define GGML_F16_VEC_ADD GGML_F32Cx16_ADD
+#define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
+
+#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
+#elif defined(__AVX__)
+
+#define GGML_SIMD
+
+// F32 AVX
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR 8
+
+#define GGML_F32x8 __m256
+#define GGML_F32x8_ZERO _mm256_setzero_ps()
+#define GGML_F32x8_SET1(x) _mm256_set1_ps(x)
+#define GGML_F32x8_LOAD _mm256_loadu_ps
+#define GGML_F32x8_STORE _mm256_storeu_ps
+#if defined(__FMA__)
+ #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)
+#else
+ #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)
+#endif
+#define GGML_F32x8_ADD _mm256_add_ps
+#define GGML_F32x8_MUL _mm256_mul_ps
+#define GGML_F32x8_REDUCE(res, x) \
+do { \
+ int offset = GGML_F32_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm256_add_ps(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm256_add_ps(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm256_add_ps(x[i], x[offset+i]); \
+ } \
+ const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
+ _mm256_extractf128_ps(x[0], 1)); \
+ const __m128 t1 = _mm_hadd_ps(t0, t0); \
+ res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
+} while (0)
+// TODO: is this optimal?
+
+#define GGML_F32_VEC GGML_F32x8
+#define GGML_F32_VEC_ZERO GGML_F32x8_ZERO
+#define GGML_F32_VEC_SET1 GGML_F32x8_SET1
+#define GGML_F32_VEC_LOAD GGML_F32x8_LOAD
+#define GGML_F32_VEC_STORE GGML_F32x8_STORE
+#define GGML_F32_VEC_FMA GGML_F32x8_FMA
+#define GGML_F32_VEC_ADD GGML_F32x8_ADD
+#define GGML_F32_VEC_MUL GGML_F32x8_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
+
+// F16 AVX
+
+#define GGML_F16_STEP 32
+#define GGML_F16_EPR 8
+
+// F16 arithmetic is not supported by AVX, so we use F32 instead
+
+#define GGML_F32Cx8 __m256
+#define GGML_F32Cx8_ZERO _mm256_setzero_ps()
+#define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x)
+
+#if defined(__F16C__)
+// the _mm256_cvt intrinsics require F16C
+#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
+#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
+#else
+static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
+ float tmp[8];
+
+ for (int i = 0; i < 8; i++) {
+ tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
+ }
+
+ return _mm256_loadu_ps(tmp);
+}
+static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
+ float arr[8];
+
+ _mm256_storeu_ps(arr, y);
+
+ for (int i = 0; i < 8; i++)
+ x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
+}
+#define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
+#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
+#endif
+
+#define GGML_F32Cx8_FMA GGML_F32x8_FMA
+#define GGML_F32Cx8_ADD _mm256_add_ps
+#define GGML_F32Cx8_MUL _mm256_mul_ps
+#define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE
+
+#define GGML_F16_VEC GGML_F32Cx8
+#define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO
+#define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1
+#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
+#define GGML_F16_VEC_FMA GGML_F32Cx8_FMA
+#define GGML_F16_VEC_ADD GGML_F32Cx8_ADD
+#define GGML_F16_VEC_MUL GGML_F32Cx8_MUL
+#define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE
+
+#elif defined(__POWER9_VECTOR__)
+
+#define GGML_SIMD
+
+// F32 POWER9
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR 4
+
+#define GGML_F32x4 vector float
+#define GGML_F32x4_ZERO {0.0f}
+#define GGML_F32x4_SET1 vec_splats
+#define GGML_F32x4_LOAD(p) vec_xl(0, p)
+#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
+#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
+#define GGML_F32x4_ADD vec_add
+#define GGML_F32x4_MUL vec_mul
+#define GGML_F32x4_REDUCE(res, x) \
+{ \
+ int offset = GGML_F32_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = vec_add(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = vec_add(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = vec_add(x[i], x[offset+i]); \
+ } \
+ res = vec_extract(x[0], 0) + \
+ vec_extract(x[0], 1) + \
+ vec_extract(x[0], 2) + \
+ vec_extract(x[0], 3); \
+}
+#define GGML_F32x4_REDUCE_4(res, s0, s1, s2, s3) \
+{ \
+ vector float v = vec_add(vec_add(s0, s1), \
+ vec_add(s2, s3)); \
+ v = vec_add(v, vec_sld(v, v, 8)); \
+ v = vec_add(v, vec_sld(v, v, 4)); \
+ res += (ggml_float) vec_extract(v, 0); \
+}
+
+#define GGML_F32_VEC GGML_F32x4
+#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 POWER9
+#define GGML_F16_STEP GGML_F32_STEP
+#define GGML_F16_EPR GGML_F32_EPR
+#define GGML_F16_VEC GGML_F32x4
+#define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
+#define GGML_F16_VEC_SET1 GGML_F32x4_SET1
+#define GGML_F16_VEC_FMA GGML_F32x4_FMA
+#define GGML_F16_VEC_ADD GGML_F32x4_ADD
+#define GGML_F16_VEC_MUL GGML_F32x4_MUL
+#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
+// Use vec_xl, not vec_ld, in case the load address is not aligned.
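+// Each vec_xl of halfwords pulls in 2*GGML_F16_EPR values, so the loads work in pairs: an
+// even register index i converts the low half of the block, while an odd index rewinds the
+// (already advanced) pointer by GGML_F16_EPR and converts the high half. The store mirrors
+// this and only runs for odd i, packing r[i-1] and r[i] back into one block of halfwords,
+// with GGML_ENDIAN_BYTE choosing the operand order at run time.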
+#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \
+ vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \
+ vec_extract_fp32_from_shortl(vec_xl(0, p))
+static inline unsigned char ggml_endian_byte(int i) {
+ uint16_t tmp_val = 1;
+ return ((unsigned char *)&tmp_val)[i];
+}
+#define GGML_ENDIAN_BYTE(i) ggml_endian_byte(i)
+#define GGML_F16_VEC_STORE(p, r, i) \
+ if (i & 0x1) \
+ vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)], \
+ r[i - GGML_ENDIAN_BYTE(0)]), \
+ 0, p - GGML_F16_EPR)
+
+//BF16 POWER9
+#define GGML_BF16_STEP 16
+#define GGML_BF16_EPR 8
+
+#define GGML_BF16x8 vector unsigned short
+#define GGML_BF16x8_ZERO vec_splats((unsigned short)0)
+#define GGML_BF16x8_LOAD(p) vec_xl(0, (const unsigned short *)(p))
+
+#define GGML_BF16_VEC GGML_BF16x8
+#define GGML_BF16_VEC_ZERO GGML_BF16x8_ZERO
+#define GGML_BF16_VEC_LOAD GGML_BF16x8_LOAD
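+// a bf16 value is the upper 16 bits of the corresponding f32, so interleaving each lane
+// with 16 zero bits (in the order required by the target's endianness) reconstructs the
+// f32 bit pattern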
+#if defined(__LITTLE_ENDIAN__)
+#define GGML_BF16_TO_F32_LO(v) ((vector float) vec_mergel(GGML_BF16_VEC_ZERO, (v)))
+#define GGML_BF16_TO_F32_HI(v) ((vector float) vec_mergeh(GGML_BF16_VEC_ZERO, (v)))
+#else
+#define GGML_BF16_TO_F32_LO(v) ((vector float) vec_mergel((v), GGML_BF16_VEC_ZERO))
+#define GGML_BF16_TO_F32_HI(v) ((vector float) vec_mergeh((v), GGML_BF16_VEC_ZERO))
+#endif
+#define GGML_BF16_FMA_LO(acc, x, y) \
+ (acc) = GGML_F32x4_FMA((acc), GGML_BF16_TO_F32_LO(x), GGML_BF16_TO_F32_LO(y))
+#define GGML_BF16_FMA_HI(acc, x, y) \
+ (acc) = GGML_F32x4_FMA((acc), GGML_BF16_TO_F32_HI(x), GGML_BF16_TO_F32_HI(y))
+
+#elif defined(__wasm_simd128__)
+
+#define GGML_SIMD
+
+// F32 WASM
+
+#define GGML_F32_STEP 16
+#define GGML_F32_EPR 4
+
+#define GGML_F32x4 v128_t
+#define GGML_F32x4_ZERO wasm_f32x4_splat(0.0f)
+#define GGML_F32x4_SET1(x) wasm_f32x4_splat(x)
+#define GGML_F32x4_LOAD wasm_v128_load
+#define GGML_F32x4_STORE wasm_v128_store
+#define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
+#define GGML_F32x4_ADD wasm_f32x4_add
+#define GGML_F32x4_MUL wasm_f32x4_mul
+#define GGML_F32x4_REDUCE(res, x) \
+{ \
+ int offset = GGML_F32_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
+ } \
+ res = wasm_f32x4_extract_lane(x[0], 0) + \
+ wasm_f32x4_extract_lane(x[0], 1) + \
+ wasm_f32x4_extract_lane(x[0], 2) + \
+ wasm_f32x4_extract_lane(x[0], 3); \
+}
+
+#define GGML_F32_VEC GGML_F32x4
+#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 WASM
+
+#define GGML_F16_STEP 16
+#define GGML_F16_EPR 4
+
+inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
+ float tmp[4];
+
+ tmp[0] = GGML_CPU_FP16_TO_FP32(p[0]);
+ tmp[1] = GGML_CPU_FP16_TO_FP32(p[1]);
+ tmp[2] = GGML_CPU_FP16_TO_FP32(p[2]);
+ tmp[3] = GGML_CPU_FP16_TO_FP32(p[3]);
+
+ return wasm_v128_load(tmp);
+}
+
+inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
+ float tmp[4];
+
+ wasm_v128_store(tmp, x);
+
+ p[0] = GGML_CPU_FP32_TO_FP16(tmp[0]);
+ p[1] = GGML_CPU_FP32_TO_FP16(tmp[1]);
+ p[2] = GGML_CPU_FP32_TO_FP16(tmp[2]);
+ p[3] = GGML_CPU_FP32_TO_FP16(tmp[3]);
+}
+
+#define GGML_F16x4 v128_t
+#define GGML_F16x4_ZERO wasm_f32x4_splat(0.0f)
+#define GGML_F16x4_SET1(x) wasm_f32x4_splat(x)
+#define GGML_F16x4_LOAD(x) __wasm_f16x4_load(x)
+#define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y)
+#define GGML_F16x4_FMA GGML_F32x4_FMA
+#define GGML_F16x4_ADD wasm_f32x4_add
+#define GGML_F16x4_MUL wasm_f32x4_mul
+#define GGML_F16x4_REDUCE(res, x) \
+{ \
+ int offset = GGML_F16_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
+ } \
+ res = (ggml_float) (wasm_f32x4_extract_lane(x[0], 0) + \
+ wasm_f32x4_extract_lane(x[0], 1) + \
+ wasm_f32x4_extract_lane(x[0], 2) + \
+ wasm_f32x4_extract_lane(x[0], 3)); \
+}
+
+#define GGML_F16_VEC GGML_F16x4
+#define GGML_F16_VEC_ZERO GGML_F16x4_ZERO
+#define GGML_F16_VEC_SET1 GGML_F16x4_SET1
+#define GGML_F16_VEC_LOAD(p, i) GGML_F16x4_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i])
+#define GGML_F16_VEC_FMA GGML_F16x4_FMA
+#define GGML_F16_VEC_ADD GGML_F16x4_ADD
+#define GGML_F16_VEC_MUL GGML_F16x4_MUL
+#define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE
+
+#elif defined(__SSE3__)
+
+#define GGML_SIMD
+
+// F32 SSE
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR 4
+
+#define GGML_F32x4 __m128
+#define GGML_F32x4_ZERO _mm_setzero_ps()
+#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
+#define GGML_F32x4_LOAD _mm_loadu_ps
+#define GGML_F32x4_STORE _mm_storeu_ps
+#if defined(__FMA__)
+ // TODO: Does this work?
+ #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
+#else
+ #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
+#endif
+#define GGML_F32x4_ADD _mm_add_ps
+#define GGML_F32x4_MUL _mm_mul_ps
+#define GGML_F32x4_REDUCE(res, x) \
+{ \
+ int offset = GGML_F32_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm_add_ps(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm_add_ps(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = _mm_add_ps(x[i], x[offset+i]); \
+ } \
+ const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
+ res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
+}
+// TODO: is this optimal?
+
+#define GGML_F32_VEC GGML_F32x4
+#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 SSE
+
+#define GGML_F16_STEP 32
+#define GGML_F16_EPR 4
+
+static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
+ float tmp[4];
+
+ tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]);
+ tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]);
+ tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
+ tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);
+
+ return _mm_loadu_ps(tmp);
+}
+
+static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
+ float arr[4];
+
+ _mm_storeu_ps(arr, y);
+
+ x[0] = GGML_CPU_FP32_TO_FP16(arr[0]);
+ x[1] = GGML_CPU_FP32_TO_FP16(arr[1]);
+ x[2] = GGML_CPU_FP32_TO_FP16(arr[2]);
+ x[3] = GGML_CPU_FP32_TO_FP16(arr[3]);
+}
+
+#define GGML_F32Cx4 __m128
+#define GGML_F32Cx4_ZERO _mm_setzero_ps()
+#define GGML_F32Cx4_SET1(x) _mm_set1_ps(x)
+#define GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x)
+#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
+#define GGML_F32Cx4_FMA GGML_F32x4_FMA
+#define GGML_F32Cx4_ADD _mm_add_ps
+#define GGML_F32Cx4_MUL _mm_mul_ps
+#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
+
+#define GGML_F16_VEC GGML_F32Cx4
+#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
+#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
+#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
+#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
+#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
+#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
+#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
+
+#elif defined(__loongarch_asx)
+
+#define GGML_SIMD
+
+// F32 LASX
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR 8
+
+#define GGML_F32x8 __m256
+#define GGML_F32x8_ZERO (__m256)__lasx_xvldi(0)
+#define GGML_F32x8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
+#define GGML_F32x8_LOAD(x) (__m256)__lasx_xvld((x), 0)
+#define GGML_F32x8_STORE(x,y) __lasx_xvst((y), (x), 0)
+#define GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a)
+#define GGML_F32x8_ADD __lasx_xvfadd_s
+#define GGML_F32x8_MUL __lasx_xvfmul_s
+#define GGML_F32x8_REDUCE(res, x) \
+do { \
+ int offset = GGML_F32_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
+ } \
+ float *tmp_p = (float *)&x[0]; \
+ res = tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] + tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7]; \
+} while (0)
+// TODO: is this optimal?
+
+#define GGML_F32_VEC GGML_F32x8
+#define GGML_F32_VEC_ZERO GGML_F32x8_ZERO
+#define GGML_F32_VEC_SET1 GGML_F32x8_SET1
+#define GGML_F32_VEC_LOAD GGML_F32x8_LOAD
+#define GGML_F32_VEC_STORE GGML_F32x8_STORE
+#define GGML_F32_VEC_FMA GGML_F32x8_FMA
+#define GGML_F32_VEC_ADD GGML_F32x8_ADD
+#define GGML_F32_VEC_MUL GGML_F32x8_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
+
+// F16 LASX
+
+#define GGML_F16_STEP 32
+#define GGML_F16_EPR 8
+
+// F16 arithmetic is not supported by LASX, so we use F32 instead
+
+#define GGML_F32Cx8 __m256
+#define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
+#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
+
+static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
+ __m256i a;
+ memcpy(&a, x, sizeof(ggml_fp16_t) * 8);
+ a = __lasx_xvpermi_d(a, 0 | (1 << 4));
+ return __lasx_xvfcvtl_s_h(a);
+}
+
+static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
+ __m256i a = __lasx_xvfcvt_h_s(y, y);
+ a = __lasx_xvpermi_d(a, 0 | (2 << 2));
+ memcpy(x, &a, sizeof(ggml_fp16_t) * 8);
+}
+#define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
+#define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
+
+#define GGML_F32Cx8_FMA GGML_F32x8_FMA
+#define GGML_F32Cx8_ADD __lasx_xvfadd_s
+#define GGML_F32Cx8_MUL __lasx_xvfmul_s
+#define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE
+
+#define GGML_F16_VEC GGML_F32Cx8
+#define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO
+#define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1
+#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
+#define GGML_F16_VEC_FMA GGML_F32Cx8_FMA
+#define GGML_F16_VEC_ADD GGML_F32Cx8_ADD
+#define GGML_F16_VEC_MUL GGML_F32Cx8_MUL
+#define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE
+
+#elif defined(__loongarch_sx)
+
+#define GGML_SIMD
+
+// F32 LSX
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR 4
+
+#define GGML_F32x4 __m128
+#define GGML_F32x4_ZERO (__m128)__lsx_vldi(0)
+#define GGML_F32x4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x))
+#define GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0)
+#define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
+#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
+#define GGML_F32x4_ADD __lsx_vfadd_s
+#define GGML_F32x4_MUL __lsx_vfmul_s
+
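+// the final horizontal sum below pairs even/odd lanes with vpickev/vpickod twice, leaving
+// the total in lane 0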
+#define GGML_F32x4_REDUCE(res, x) \
+{ \
+ int offset = GGML_F32_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
+ } \
+ __m128i t0 = __lsx_vpickev_w((__m128i)x[0], (__m128i)x[0]); \
+ __m128i t1 = __lsx_vpickod_w((__m128i)x[0], (__m128i)x[0]); \
+ __m128 t2 = __lsx_vfadd_s((__m128)t0, (__m128)t1); \
+ __m128i t3 = __lsx_vpickev_w((__m128i)t2, (__m128i)t2); \
+ __m128i t4 = __lsx_vpickod_w((__m128i)t2, (__m128i)t2); \
+ __m128 t5 = __lsx_vfadd_s((__m128)t3, (__m128)t4); \
+ res = (ggml_float) ((v4f32)t5)[0]; \
+}
+
+#define GGML_F32_VEC GGML_F32x4
+#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 LSX
+
+#define GGML_F16_STEP 32
+#define GGML_F16_EPR 4
+
+static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
+ float tmp[4];
+
+ tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]);
+ tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]);
+ tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
+ tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);
+
+ return (__m128)__lsx_vld(tmp, 0);
+}
+
+static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
+ float arr[4];
+
+ __lsx_vst(y, arr, 0);
+
+ x[0] = GGML_CPU_FP32_TO_FP16(arr[0]);
+ x[1] = GGML_CPU_FP32_TO_FP16(arr[1]);
+ x[2] = GGML_CPU_FP32_TO_FP16(arr[2]);
+ x[3] = GGML_CPU_FP32_TO_FP16(arr[3]);
+}
+
+#define GGML_F32Cx4 __m128
+#define GGML_F32Cx4_ZERO (__m128)__lsx_vldi(0)
+#define GGML_F32Cx4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x))
+#define GGML_F32Cx4_LOAD(x) (__m128)__lsx_f16x4_load(x)
+#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
+#define GGML_F32Cx4_FMA GGML_F32x4_FMA
+#define GGML_F32Cx4_ADD __lsx_vfadd_s
+#define GGML_F32Cx4_MUL __lsx_vfmul_s
+#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
+
+#define GGML_F16_VEC GGML_F32Cx4
+#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
+#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
+#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
+#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
+#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
+#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
+#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
+
+#elif defined(__VXE__) || defined(__VXE2__)
+
+#define GGML_SIMD
+
+// F32 s390x
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR 4
+
+#define GGML_F32x4 float32x4_t
+#define GGML_F32x4_ZERO vec_splats(0.0f)
+#define GGML_F32x4_SET1 vec_splats
+#define GGML_F32x4_LOAD(p) vec_xl(0, p)
+#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
+#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
+#define GGML_F32x4_ADD vec_add
+#define GGML_F32x4_MUL vec_mul
+#define GGML_F32x4_REDUCE(res, x) \
+{ \
+ int offset = GGML_F32_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = vec_add(x[i], x[offset + i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = vec_add(x[i], x[offset + i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = vec_add(x[i], x[offset + i]); \
+ } \
+ float32x4_t tmp = x[0] + vec_reve(x[0]); \
+ res = tmp[0] + tmp[1]; \
+}
+
+#define GGML_F32_VEC GGML_F32x4
+#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 s390x
+#define GGML_F16_STEP GGML_F32_STEP
+#define GGML_F16_EPR GGML_F32_EPR
+
+static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
+ float tmp[4];
+
+ for (int i = 0; i < 4; i++) {
+ tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
+ }
+
+ // note: keep type-cast here to prevent compiler bugs
+ // see: https://github.com/ggml-org/llama.cpp/issues/12846
+ return vec_xl(0, (const float *)(tmp));
+}
+
+static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
+ float arr[4];
+
+ // note: keep type-cast here to prevent compiler bugs
+ // see: https://github.com/ggml-org/llama.cpp/issues/12846
+ vec_xst(v_y, 0, (float *)(arr));
+
+ for (int i = 0; i < 4; i++) {
+ x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
+ }
+}
+
+#define GGML_F16_VEC GGML_F32x4
+#define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
+#define GGML_F16_VEC_SET1 GGML_F32x4_SET1
+#define GGML_F16_VEC_LOAD(p, i) __lzs_f16cx4_load(p)
+#define GGML_F16_VEC_STORE(p, r, i) __lzs_f16cx4_store(p, r[i])
+#define GGML_F16_VEC_FMA GGML_F32x4_FMA
+#define GGML_F16_VEC_ADD GGML_F32x4_ADD
+#define GGML_F16_VEC_MUL GGML_F32x4_MUL
+#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
+
+#elif defined(__riscv_v_intrinsic)
+
+// compatible with vlen >= 128
+
+#define GGML_SIMD
+
+// F32
+
+#define GGML_F32_STEP 16
+#define GGML_F32_EPR 4
+
+#define GGML_F32x4 vfloat32m1_t
+#define GGML_F32x4_ZERO __riscv_vfmv_v_f_f32m1(0.0f, GGML_F32_EPR)
+#define GGML_F32x4_SET1(x) __riscv_vfmv_v_f_f32m1(x, GGML_F32_EPR)
+#define GGML_F32x4_LOAD(x) __riscv_vle32_v_f32m1(x, GGML_F32_EPR)
+#define GGML_F32x4_STORE(b, v) __riscv_vse32_v_f32m1(b, v, GGML_F32_EPR)
+#define GGML_F32x4_FMA(a, b, c) __riscv_vfmacc_vv_f32m1(a, b, c, GGML_F32_EPR)
+#define GGML_F32x4_ADD(a, b) __riscv_vfadd_vv_f32m1(a, b, GGML_F32_EPR)
+#define GGML_F32x4_MUL(a, b) __riscv_vfmul_vv_f32m1(a, b, GGML_F32_EPR)
+
+#define GGML_F32_VEC GGML_F32x4
+#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+#endif
+
+// GGML_F32_ARR / GGML_F16_ARR
+// number of registers to use per step
+#ifdef GGML_SIMD
+#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
+#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
+#endif
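+
+// e.g. with the NEON mapping above (GGML_F32_STEP = 16, GGML_F32_EPR = 4) each step keeps
+// GGML_F32_ARR = 4 accumulator registers in flight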
+
+#ifdef __cplusplus
+}
+#endif