#pragma once

#include "ggml-cpu-impl.h"

#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif // __ARM_FEATURE_SVE

#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
//
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
//
#include <arm_neon.h>
#endif

#if defined(__riscv_v_intrinsic)
#include <riscv_vector.h>
#endif

#ifdef __cplusplus
extern "C" {
#endif

//
// simd mappings
//

// FP16 to FP32 conversion

// 16-bit float
// on Arm, we use __fp16
// on x86, we use uint16_t
//
// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616
// for MUSA compilers, we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
//
#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
    #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) neon_compute_fp16_to_fp32(x)
    #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) neon_compute_fp32_to_fp16(x)

    #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)

    static inline float neon_compute_fp16_to_fp32(ggml_fp16_t h) {
        __fp16 tmp;
        memcpy(&tmp, &h, sizeof(ggml_fp16_t));
        return (float)tmp;
    }

    static inline ggml_fp16_t neon_compute_fp32_to_fp16(float f) {
        ggml_fp16_t res;
        __fp16 tmp = f;
        memcpy(&res, &tmp, sizeof(ggml_fp16_t));
        return res;
    }
#elif defined(__F16C__)
    #ifdef _MSC_VER
        #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
        #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
    #else
        #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
        #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
    #endif
#elif defined(__POWER9_VECTOR__)
    #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) power_compute_fp16_to_fp32(x)
    #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) power_compute_fp32_to_fp16(x)
    /* the inline asm below is about 12% faster than the lookup method */
    #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
    #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)

    static inline float power_compute_fp16_to_fp32(ggml_fp16_t h) {
        float f;
        double d;
        __asm__(
            "mtfprd %0,%2\n"
            "xscvhpdp %0,%0\n"
            "frsp %1,%0\n" :
            /* temp */ "=d"(d),
            /* out */ "=f"(f):
            /* in */ "r"(h));
        return f;
    }

    static inline ggml_fp16_t power_compute_fp32_to_fp16(float f) {
        double d;
        ggml_fp16_t r;
        __asm__( /* xscvdphp can work on double or single precision */
            "xscvdphp %0,%2\n"
            "mffprd %1,%0\n" :
            /* temp */ "=d"(d),
            /* out */ "=r"(r):
            /* in */ "f"(f));
        return r;
    }
#elif defined(__riscv) && defined(__riscv_zfhmin)
    static inline float riscv_compute_fp16_to_fp32(ggml_fp16_t h) {
        _Float16 hf;
        memcpy(&hf, &h, sizeof(ggml_fp16_t));
        return hf;
    }

    static inline ggml_fp16_t riscv_compute_fp32_to_fp16(float f) {
        ggml_fp16_t res;
        _Float16 hf = (_Float16)f;
        memcpy(&res, &hf, sizeof(ggml_fp16_t));
        return res;
    }

    #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) riscv_compute_fp16_to_fp32(x)
    #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
    #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
    #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
#endif

// precomputed f32 table for f16 (256 KB)
// defined in ggml-cpu.c, initialized in ggml_cpu_init()
extern float ggml_table_f32_f16[1 << 16];

// precomputed f32 table for e8m0 half (1 KB)
// defined in ggml-cpu.c, initialized in ggml_cpu_init()
extern float ggml_table_f32_e8m0_half[1 << 8];

// Use lookup table for E8M0 on x86 (faster than bit manipulation)
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
#define GGML_CPU_E8M0_TO_FP32_HALF(x) ggml_table_f32_e8m0_half[(uint8_t)(x)]
#else
#define GGML_CPU_E8M0_TO_FP32_HALF(x) GGML_E8M0_TO_FP32_HALF(x)
#endif

// On ARM NEON, it is quicker to convert the FP16 value directly in hardware than to call into
// ggml_lookup_fp16_to_fp32, so we define GGML_CPU_FP16_TO_FP32 and GGML_CPU_FP32_TO_FP16 elsewhere for NEON.
// This is also true for POWER9.
#if !defined(GGML_CPU_FP16_TO_FP32)
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
    uint16_t s;
    memcpy(&s, &f, sizeof(uint16_t));
    return ggml_table_f32_f16[s];
}

#define GGML_CPU_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#endif

#if !defined(GGML_CPU_FP32_TO_FP16)
#define GGML_CPU_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
#endif
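
// example (illustrative only, not used elsewhere in this header): round-tripping a float
// through the CPU conversion macros defined above
//
//   ggml_fp16_t h = GGML_CPU_FP32_TO_FP16(3.14159f);
//   float       f = GGML_CPU_FP16_TO_FP32(h);   // ~3.1406f due to fp16 precision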


// we define a common set of C macros which map to specific intrinsics based on the current architecture
// we then implement the fundamental computation operations below using only these macros
// adding support for a new architecture requires defining the corresponding SIMD macros
//
// GGML_F32_STEP / GGML_F16_STEP
// number of elements to process in a single step
//
// GGML_F32_EPR / GGML_F16_EPR
// number of elements that fit in a single register
//
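// a minimal usage sketch (illustrative only; the function name is hypothetical and the pattern
// mirrors how the CPU backend vector routines, e.g. ggml_vec_mad_f32, consume these mappings on
// architectures that define GGML_F32_STEP; GGML_F32_ARR = GGML_F32_STEP/GGML_F32_EPR is defined
// at the end of this header):
//
//   void example_vec_mad_f32(const int n, float * y, const float * x, const float v) {
//       const int np = (n & ~(GGML_F32_STEP - 1));
//
//       GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
//
//       GGML_F32_VEC ax[GGML_F32_ARR];
//       GGML_F32_VEC ay[GGML_F32_ARR];
//
//       for (int i = 0; i < np; i += GGML_F32_STEP) {
//           for (int j = 0; j < GGML_F32_ARR; j++) {
//               ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
//               ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
//               ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);   // ay += ax*v
//
//               GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
//           }
//       }
//
//       // leftovers
//       for (int i = np; i < n; ++i) {
//           y[i] += x[i]*v;
//       }
//   }
//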

#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_FMA)

#define GGML_SIMD

// F32 SVE
#define GGML_F32_EPR 8
#define DEFAULT_PG svptrue_b32()

#define GGML_F32xt svfloat32_t
#define GGML_F32xt_ZERO svdup_n_f32(0.0f)
#define GGML_F32xt_SET1(x) svdup_n_f32(x)
#define GGML_F32xt_LOAD_IMPL(pg, a) svld1_f32(pg, a)
#define GGML_F32xt_LOAD(a) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, a)
#define GGML_F32xt_STORE_IMPL(pg, a, b) svst1_f32(pg, a, b)
#define GGML_F32xt_STORE(a, b) GGML_F32xt_STORE_IMPL(DEFAULT_PG, a, b)
#define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, b, c, a)
#define GGML_F32xt_FMA(a, b, c) GGML_F32xt_FMA_IMPL(DEFAULT_PG, a, b, c)
#define GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b)
#define GGML_F32xt_ADD(a, b) GGML_F32xt_ADD_IMPL(DEFAULT_PG, a, b)
#define GGML_F32xt_MUL_IMPL(pg, a, b) svmul_f32_m(pg, a, b)
#define GGML_F32xt_MUL(a, b) GGML_F32xt_MUL_IMPL(DEFAULT_PG, a, b)
#define GGML_F32xt_REDUCE_ONE_IMPL(pg, a) svaddv(pg, a)
#define GGML_F32xt_REDUCE_ONE(a) GGML_F32xt_REDUCE_ONE_IMPL(DEFAULT_PG, a)
#define GGML_F32xt_REDUCE_IMPL(pg, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) \
{ \
    sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum2); \
    sum3 = svadd_f32_m(DEFAULT_PG, sum3, sum4); \
    sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum6); \
    sum7 = svadd_f32_m(DEFAULT_PG, sum7, sum8); \
    sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum3); \
    sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum7); \
    sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum5); \
    (res) = (ggml_float) GGML_F32xt_REDUCE_ONE(sum1); \
}
#define GGML_F32xt_REDUCE(res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) \
    GGML_F32xt_REDUCE_IMPL(DEFAULT_PG, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8)

#define GGML_F32_VEC GGML_F32xt
#define GGML_F32_VEC_ZERO GGML_F32xt_ZERO
#define GGML_F32_VEC_SET1 GGML_F32xt_SET1
#define GGML_F32_VEC_LOAD GGML_F32xt_LOAD
#define GGML_F32_VEC_STORE GGML_F32xt_STORE
#define GGML_F32_VEC_FMA GGML_F32xt_FMA
#define GGML_F32_VEC_ADD GGML_F32xt_ADD
#define GGML_F32_VEC_MUL GGML_F32xt_MUL
#define GGML_F32_VEC_REDUCE GGML_F32xt_REDUCE

// F16 SVE
#define DEFAULT_PG32 svptrue_b32()
#define DEFAULT_PG16 svptrue_b16()

#define GGML_F32Cxt svfloat16_t
#define GGML_F32Cxt_ZERO svdup_n_f16(0.0f)
#define GGML_F32Cxt_SET1(x) svdup_n_f16(x)
#define GGML_F32Cxt_LOAD(p) svld1_f16(DEFAULT_PG16, (const __fp16 *)(p))
#define GGML_F32Cxt_STORE(dst_ptr, src_vec) svst1_f16(DEFAULT_PG16, (__fp16 *)(dst_ptr), (src_vec))

#define GGML_F32Cxt_FMA_IMPL(pg, a, b, c) svmad_f16_x(pg, b, c, a)
#define GGML_F32Cxt_FMA(a, b, c) GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, a, b, c)
#define GGML_F32Cxt_ADD_IMPL(pg, a, b) svadd_f16_x(pg, a, b)
#define GGML_F32Cxt_ADD(a, b) GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, a, b)
#define GGML_F32Cxt_MUL_IMPL(pg, a, b) svmul_f16_x(pg, a, b)
#define GGML_F32Cxt_MUL(a, b) GGML_F32Cxt_MUL_IMPL(DEFAULT_PG16, a, b)
#define GGML_F32Cxt_REDUCE GGML_F16xt_REDUCE_MIXED

#define GGML_F16x_VEC GGML_F32Cxt
#define GGML_F16x_VEC_ZERO GGML_F32Cxt_ZERO
#define GGML_F16x_VEC_SET1 GGML_F32Cxt_SET1
#define GGML_F16x_VEC_LOAD(p, i) GGML_F32Cxt_LOAD(p)
#define GGML_F16x_VEC_STORE(p, r, i) GGML_F32Cxt_STORE((__fp16 *)(p), r)
#define GGML_F16x_VEC_FMA GGML_F32Cxt_FMA
#define GGML_F16x_VEC_ADD GGML_F32Cxt_ADD
#define GGML_F16x_VEC_MUL GGML_F32Cxt_MUL
#define GGML_F16x_VEC_REDUCE GGML_F32Cxt_REDUCE

#define GGML_F16xt_REDUCE_ONE_IMPL(pg, a) svaddv_f16(pg, a)
#define GGML_F16xt_REDUCE_ONE(a) GGML_F16xt_REDUCE_ONE_IMPL(DEFAULT_PG16, a)

#define GGML_F16xt_REDUCE_MIXED_IMPL(pg16, res, sum1, sum2, sum3, sum4) \
{ \
    sum1 = svadd_f16_x(pg16, sum1, sum2); \
    sum3 = svadd_f16_x(pg16, sum3, sum4); \
    sum1 = svadd_f16_x(pg16, sum1, sum3); \
    __fp16 sum_f16 = svaddv_f16(pg16, sum1); \
    (res) = (ggml_float) sum_f16; \
}
#define GGML_F16xt_REDUCE_MIXED(res, sum1, sum2, sum3, sum4) \
    GGML_F16xt_REDUCE_MIXED_IMPL(DEFAULT_PG16, res, sum1, sum2, sum3, sum4)

// F16 NEON

#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
    #define GGML_F16_STEP 32
    #define GGML_F16_EPR 8

    #define GGML_F16x8 float16x8_t
    #define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
    #define GGML_F16x8_SET1(x) vdupq_n_f16(x)
    #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
    #define GGML_F16x8_STORE vst1q_f16
    #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
    #define GGML_F16x8_ADD vaddq_f16
    #define GGML_F16x8_MUL vmulq_f16
    #define GGML_F16x8_REDUCE(res, x) \
    do { \
        int offset = GGML_F16_ARR >> 1; \
        for (int i = 0; i < offset; ++i) { \
            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
        } \
        offset >>= 1; \
        for (int i = 0; i < offset; ++i) { \
            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
        } \
        offset >>= 1; \
        for (int i = 0; i < offset; ++i) { \
            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
        } \
        const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
        const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
        (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
    } while (0)

    #define GGML_F16_VEC GGML_F16x8
    #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
    #define GGML_F16_VEC_SET1 GGML_F16x8_SET1
    #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
    #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
    #define GGML_F16_VEC_FMA GGML_F16x8_FMA
    #define GGML_F16_VEC_ADD GGML_F16x8_ADD
    #define GGML_F16_VEC_MUL GGML_F16x8_MUL
    #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE
#else
    // if FP16 vector arithmetic is not supported, we use FP32 instead
    // and take advantage of the vcvt_ functions to convert to/from FP16

    #define GGML_F16_STEP 16
    #define GGML_F16_EPR 4

    #define GGML_F32Cx4 float32x4_t
    #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
    #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
    #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
    #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
    #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
    #define GGML_F32Cx4_ADD vaddq_f32
    #define GGML_F32Cx4_MUL vmulq_f32
    #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE

    #define GGML_F16_VEC GGML_F32Cx4
    #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
    #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
    #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
    #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
    #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
    #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
    #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
    #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
#endif

#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)

#define GGML_SIMD

// F32 NEON

#define GGML_F32_STEP 16
#define GGML_F32_EPR 4

#define GGML_F32x4 float32x4_t
#define GGML_F32x4_ZERO vdupq_n_f32(0.0f)
#define GGML_F32x4_SET1(x) vdupq_n_f32(x)
#define GGML_F32x4_LOAD vld1q_f32
#define GGML_F32x4_STORE vst1q_f32
#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
#define GGML_F32x4_ADD vaddq_f32
#define GGML_F32x4_MUL vmulq_f32
#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
#define GGML_F32x4_REDUCE(res, x) \
{ \
    int offset = GGML_F32_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
    } \
    (res) = (ggml_float) GGML_F32x4_REDUCE_ONE((x)[0]); \
}

#define GGML_F32_VEC GGML_F32x4
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 NEON

#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
    #define GGML_F16_STEP 32
    #define GGML_F16_EPR 8

    #define GGML_F16x8 float16x8_t
    #define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
    #define GGML_F16x8_SET1(x) vdupq_n_f16(x)
    #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
    #define GGML_F16x8_STORE vst1q_f16
    #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
    #define GGML_F16x8_ADD vaddq_f16
    #define GGML_F16x8_MUL vmulq_f16
    #define GGML_F16x8_REDUCE(res, x) \
    do { \
        int offset = GGML_F16_ARR >> 1; \
        for (int i = 0; i < offset; ++i) { \
            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
        } \
        offset >>= 1; \
        for (int i = 0; i < offset; ++i) { \
            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
        } \
        offset >>= 1; \
        for (int i = 0; i < offset; ++i) { \
            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
        } \
        const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
        const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
        (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
    } while (0)

    #define GGML_F16_VEC GGML_F16x8
    #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
    #define GGML_F16_VEC_SET1 GGML_F16x8_SET1
    #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
    #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
    #define GGML_F16_VEC_FMA GGML_F16x8_FMA
    #define GGML_F16_VEC_ADD GGML_F16x8_ADD
    #define GGML_F16_VEC_MUL GGML_F16x8_MUL
    #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE
#else
    // if FP16 vector arithmetic is not supported, we use FP32 instead
    // and take advantage of the vcvt_ functions to convert to/from FP16

    #define GGML_F16_STEP 16
    #define GGML_F16_EPR 4

    #define GGML_F32Cx4 float32x4_t
    #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
    #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
    #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
    #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
    #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
    #define GGML_F32Cx4_ADD vaddq_f32
    #define GGML_F32Cx4_MUL vmulq_f32
    #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE

    #define GGML_F16_VEC GGML_F32Cx4
    #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
    #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
    #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
    #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
    #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
    #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
    #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
    #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
#endif

#elif defined(__AVX512F__)

#define GGML_SIMD

// F32 AVX512

#define GGML_F32_STEP 64
#define GGML_F32_EPR 16

#define GGML_F32x16 __m512
#define GGML_F32x16_ZERO _mm512_setzero_ps()
#define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
#define GGML_F32x16_LOAD _mm512_loadu_ps
#define GGML_F32x16_STORE _mm512_storeu_ps
// _mm512_fmadd_ps is defined in AVX512F so no guard is required
#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
#define GGML_F32x16_ADD _mm512_add_ps
#define GGML_F32x16_MUL _mm512_mul_ps
#define GGML_F32x16_REDUCE(res, x) \
do { \
    int offset = GGML_F32_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm512_add_ps(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm512_add_ps(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm512_add_ps(x[i], x[offset+i]); \
    } \
    res = (ggml_float) _mm512_reduce_add_ps(x[0]); \
} while (0)

// TODO: is this optimal ?

#define GGML_F32_VEC GGML_F32x16
#define GGML_F32_VEC_ZERO GGML_F32x16_ZERO
#define GGML_F32_VEC_SET1 GGML_F32x16_SET1
#define GGML_F32_VEC_LOAD GGML_F32x16_LOAD
#define GGML_F32_VEC_STORE GGML_F32x16_STORE
#define GGML_F32_VEC_FMA GGML_F32x16_FMA
#define GGML_F32_VEC_ADD GGML_F32x16_ADD
#define GGML_F32_VEC_MUL GGML_F32x16_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE

// F16 AVX512

#define GGML_F16_STEP 64
#define GGML_F16_EPR 16

// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead

#define GGML_F32Cx16 __m512
#define GGML_F32Cx16_ZERO _mm512_setzero_ps()
#define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x)

// unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
// so F16C guard isn't required
#define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x)))
#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))

#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
#define GGML_F32Cx16_ADD _mm512_add_ps
#define GGML_F32Cx16_MUL _mm512_mul_ps
#define GGML_F32Cx16_REDUCE(res, x) \
do { \
    int offset = GGML_F32_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm512_add_ps(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm512_add_ps(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm512_add_ps(x[i], x[offset+i]); \
    } \
    res = (ggml_float) _mm512_reduce_add_ps(x[0]); \
} while (0)

#define GGML_F16_VEC GGML_F32Cx16
#define GGML_F16_VEC_ZERO GGML_F32Cx16_ZERO
#define GGML_F16_VEC_SET1 GGML_F32Cx16_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx16_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
#define GGML_F16_VEC_FMA GGML_F32Cx16_FMA
#define GGML_F16_VEC_ADD GGML_F32Cx16_ADD
#define GGML_F16_VEC_MUL GGML_F32Cx16_MUL

#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
#elif defined(__AVX__)

#define GGML_SIMD

// F32 AVX

#define GGML_F32_STEP 32
#define GGML_F32_EPR 8

#define GGML_F32x8 __m256
#define GGML_F32x8_ZERO _mm256_setzero_ps()
#define GGML_F32x8_SET1(x) _mm256_set1_ps(x)
#define GGML_F32x8_LOAD _mm256_loadu_ps
#define GGML_F32x8_STORE _mm256_storeu_ps
#if defined(__FMA__)
    #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)
#else
    #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)
#endif
#define GGML_F32x8_ADD _mm256_add_ps
#define GGML_F32x8_MUL _mm256_mul_ps
#define GGML_F32x8_REDUCE(res, x) \
do { \
    int offset = GGML_F32_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm256_add_ps(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm256_add_ps(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm256_add_ps(x[i], x[offset+i]); \
    } \
    const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
                                 _mm256_extractf128_ps(x[0], 1)); \
    const __m128 t1 = _mm_hadd_ps(t0, t0); \
    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
} while (0)
// TODO: is this optimal ?

#define GGML_F32_VEC GGML_F32x8
#define GGML_F32_VEC_ZERO GGML_F32x8_ZERO
#define GGML_F32_VEC_SET1 GGML_F32x8_SET1
#define GGML_F32_VEC_LOAD GGML_F32x8_LOAD
#define GGML_F32_VEC_STORE GGML_F32x8_STORE
#define GGML_F32_VEC_FMA GGML_F32x8_FMA
#define GGML_F32_VEC_ADD GGML_F32x8_ADD
#define GGML_F32_VEC_MUL GGML_F32x8_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE

// F16 AVX

#define GGML_F16_STEP 32
#define GGML_F16_EPR 8

// F16 arithmetic is not supported by AVX, so we use F32 instead

#define GGML_F32Cx8 __m256
#define GGML_F32Cx8_ZERO _mm256_setzero_ps()
#define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x)

#if defined(__F16C__)
// the _mm256_cvt intrinsics require F16C
#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
#else
static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
    float tmp[8];

    for (int i = 0; i < 8; i++) {
        tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
    }

    return _mm256_loadu_ps(tmp);
}
static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
    float arr[8];

    _mm256_storeu_ps(arr, y);

    for (int i = 0; i < 8; i++)
        x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
}
#define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
#endif

#define GGML_F32Cx8_FMA GGML_F32x8_FMA
#define GGML_F32Cx8_ADD _mm256_add_ps
#define GGML_F32Cx8_MUL _mm256_mul_ps
#define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE

#define GGML_F16_VEC GGML_F32Cx8
#define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO
#define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
#define GGML_F16_VEC_FMA GGML_F32Cx8_FMA
#define GGML_F16_VEC_ADD GGML_F32Cx8_ADD
#define GGML_F16_VEC_MUL GGML_F32Cx8_MUL
#define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE

#elif defined(__POWER9_VECTOR__)

#define GGML_SIMD

// F32 POWER9

#define GGML_F32_STEP 32
#define GGML_F32_EPR 4

#define GGML_F32x4 vector float
#define GGML_F32x4_ZERO {0.0f}
#define GGML_F32x4_SET1 vec_splats
#define GGML_F32x4_LOAD(p) vec_xl(0, p)
#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
#define GGML_F32x4_ADD vec_add
#define GGML_F32x4_MUL vec_mul
#define GGML_F32x4_REDUCE(res, x) \
{ \
    int offset = GGML_F32_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = vec_add(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = vec_add(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = vec_add(x[i], x[offset+i]); \
    } \
    res = vec_extract(x[0], 0) + \
          vec_extract(x[0], 1) + \
          vec_extract(x[0], 2) + \
          vec_extract(x[0], 3); \
}
#define GGML_F32x4_REDUCE_4(res, s0, s1, s2, s3) \
{ \
    vector float v = vec_add(vec_add(s0, s1), \
                             vec_add(s2, s3)); \
    v = vec_add(v, vec_sld(v, v, 8)); \
    v = vec_add(v, vec_sld(v, v, 4)); \
    res += (ggml_float) vec_extract(v, 0); \
}

#define GGML_F32_VEC GGML_F32x4
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 POWER9
#define GGML_F16_STEP GGML_F32_STEP
#define GGML_F16_EPR GGML_F32_EPR
#define GGML_F16_VEC GGML_F32x4
#define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
#define GGML_F16_VEC_SET1 GGML_F32x4_SET1
#define GGML_F16_VEC_FMA GGML_F32x4_FMA
#define GGML_F16_VEC_ADD GGML_F32x4_ADD
#define GGML_F16_VEC_MUL GGML_F32x4_MUL
#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
// Use vec_xl, not vec_ld, in case the load address is not aligned.
#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \
    vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \
    vec_extract_fp32_from_shortl(vec_xl(0, p))
static inline unsigned char ggml_endian_byte(int i) {
    uint16_t tmp_val = 1;
    return ((unsigned char *)&tmp_val)[i];
}
#define GGML_ENDIAN_BYTE(i) ggml_endian_byte(i)
#define GGML_F16_VEC_STORE(p, r, i) \
    if (i & 0x1) \
        vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)], \
                                       r[i - GGML_ENDIAN_BYTE(0)]), \
                0, p - GGML_F16_EPR)

// BF16 POWER9
#define GGML_BF16_STEP 16
#define GGML_BF16_EPR 8

#define GGML_BF16x8 vector unsigned short
#define GGML_BF16x8_ZERO vec_splats((unsigned short)0)
#define GGML_BF16x8_LOAD(p) vec_xl(0, (const unsigned short *)(p))

#define GGML_BF16_VEC GGML_BF16x8
#define GGML_BF16_VEC_ZERO GGML_BF16x8_ZERO
#define GGML_BF16_VEC_LOAD GGML_BF16x8_LOAD
#if defined(__LITTLE_ENDIAN__)
#define GGML_BF16_TO_F32_LO(v) ((vector float) vec_mergel(GGML_BF16_VEC_ZERO, (v)))
#define GGML_BF16_TO_F32_HI(v) ((vector float) vec_mergeh(GGML_BF16_VEC_ZERO, (v)))
#else
#define GGML_BF16_TO_F32_LO(v) ((vector float) vec_mergel((v), GGML_BF16_VEC_ZERO))
#define GGML_BF16_TO_F32_HI(v) ((vector float) vec_mergeh((v), GGML_BF16_VEC_ZERO))
#endif
#define GGML_BF16_FMA_LO(acc, x, y) \
    (acc) = GGML_F32x4_FMA((acc), GGML_BF16_TO_F32_LO(x), GGML_BF16_TO_F32_LO(y))
#define GGML_BF16_FMA_HI(acc, x, y) \
    (acc) = GGML_F32x4_FMA((acc), GGML_BF16_TO_F32_HI(x), GGML_BF16_TO_F32_HI(y))

#elif defined(__wasm_simd128__)

#define GGML_SIMD

// F32 WASM

#define GGML_F32_STEP 16
#define GGML_F32_EPR 4

#define GGML_F32x4 v128_t
#define GGML_F32x4_ZERO wasm_f32x4_splat(0.0f)
#define GGML_F32x4_SET1(x) wasm_f32x4_splat(x)
#define GGML_F32x4_LOAD wasm_v128_load
#define GGML_F32x4_STORE wasm_v128_store
#define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
#define GGML_F32x4_ADD wasm_f32x4_add
#define GGML_F32x4_MUL wasm_f32x4_mul
#define GGML_F32x4_REDUCE(res, x) \
{ \
    int offset = GGML_F32_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
    } \
    res = wasm_f32x4_extract_lane(x[0], 0) + \
          wasm_f32x4_extract_lane(x[0], 1) + \
          wasm_f32x4_extract_lane(x[0], 2) + \
          wasm_f32x4_extract_lane(x[0], 3); \
}

#define GGML_F32_VEC GGML_F32x4
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 WASM

#define GGML_F16_STEP 16
#define GGML_F16_EPR 4

inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
    float tmp[4];

    tmp[0] = GGML_CPU_FP16_TO_FP32(p[0]);
    tmp[1] = GGML_CPU_FP16_TO_FP32(p[1]);
    tmp[2] = GGML_CPU_FP16_TO_FP32(p[2]);
    tmp[3] = GGML_CPU_FP16_TO_FP32(p[3]);

    return wasm_v128_load(tmp);
}

inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
    float tmp[4];

    wasm_v128_store(tmp, x);

    p[0] = GGML_CPU_FP32_TO_FP16(tmp[0]);
    p[1] = GGML_CPU_FP32_TO_FP16(tmp[1]);
    p[2] = GGML_CPU_FP32_TO_FP16(tmp[2]);
    p[3] = GGML_CPU_FP32_TO_FP16(tmp[3]);
}

#define GGML_F16x4 v128_t
#define GGML_F16x4_ZERO wasm_f32x4_splat(0.0f)
#define GGML_F16x4_SET1(x) wasm_f32x4_splat(x)
#define GGML_F16x4_LOAD(x) __wasm_f16x4_load(x)
#define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y)
#define GGML_F16x4_FMA GGML_F32x4_FMA
#define GGML_F16x4_ADD wasm_f32x4_add
#define GGML_F16x4_MUL wasm_f32x4_mul
#define GGML_F16x4_REDUCE(res, x) \
{ \
    int offset = GGML_F16_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
    } \
    res = (ggml_float) (wasm_f32x4_extract_lane(x[0], 0) + \
                        wasm_f32x4_extract_lane(x[0], 1) + \
                        wasm_f32x4_extract_lane(x[0], 2) + \
                        wasm_f32x4_extract_lane(x[0], 3)); \
}

#define GGML_F16_VEC GGML_F16x4
#define GGML_F16_VEC_ZERO GGML_F16x4_ZERO
#define GGML_F16_VEC_SET1 GGML_F16x4_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i])
#define GGML_F16_VEC_FMA GGML_F16x4_FMA
#define GGML_F16_VEC_ADD GGML_F16x4_ADD
#define GGML_F16_VEC_MUL GGML_F16x4_MUL
#define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE

#elif defined(__SSE3__)

#define GGML_SIMD

// F32 SSE

#define GGML_F32_STEP 32
#define GGML_F32_EPR 4

#define GGML_F32x4 __m128
#define GGML_F32x4_ZERO _mm_setzero_ps()
#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
#define GGML_F32x4_LOAD _mm_loadu_ps
#define GGML_F32x4_STORE _mm_storeu_ps
#if defined(__FMA__)
    // TODO: Does this work?
    #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
#else
    #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
#endif
#define GGML_F32x4_ADD _mm_add_ps
#define GGML_F32x4_MUL _mm_mul_ps
#define GGML_F32x4_REDUCE(res, x) \
{ \
    int offset = GGML_F32_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm_add_ps(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm_add_ps(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm_add_ps(x[i], x[offset+i]); \
    } \
    const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
}
// TODO: is this optimal ?

#define GGML_F32_VEC GGML_F32x4
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 SSE

#define GGML_F16_STEP 32
#define GGML_F16_EPR 4

static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
    float tmp[4];

    tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]);
    tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]);
    tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
    tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);

    return _mm_loadu_ps(tmp);
}

static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
    float arr[4];

    _mm_storeu_ps(arr, y);

    x[0] = GGML_CPU_FP32_TO_FP16(arr[0]);
    x[1] = GGML_CPU_FP32_TO_FP16(arr[1]);
    x[2] = GGML_CPU_FP32_TO_FP16(arr[2]);
    x[3] = GGML_CPU_FP32_TO_FP16(arr[3]);
}

#define GGML_F32Cx4 __m128
#define GGML_F32Cx4_ZERO _mm_setzero_ps()
#define GGML_F32Cx4_SET1(x) _mm_set1_ps(x)
#define GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x)
#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
#define GGML_F32Cx4_FMA GGML_F32x4_FMA
#define GGML_F32Cx4_ADD _mm_add_ps
#define GGML_F32Cx4_MUL _mm_mul_ps
#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE

#define GGML_F16_VEC GGML_F32Cx4
#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE

#elif defined(__loongarch_asx)

#define GGML_SIMD

// F32 LASX
#define GGML_F32_STEP 32
#define GGML_F32_EPR 8

#define GGML_F32x8 __m256
#define GGML_F32x8_ZERO (__m256)__lasx_xvldi(0)
#define GGML_F32x8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
#define GGML_F32x8_LOAD(x) (__m256)__lasx_xvld((x), 0)
#define GGML_F32x8_STORE(x, y) __lasx_xvst((y), (x), 0)
#define GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a)
#define GGML_F32x8_ADD __lasx_xvfadd_s
#define GGML_F32x8_MUL __lasx_xvfmul_s
#define GGML_F32x8_REDUCE(res, x) \
do { \
    int offset = GGML_F32_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
    } \
    float *tmp_p = (float *)&x[0]; \
    res = tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] + tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7]; \
} while (0)
// TODO: is this optimal ?

#define GGML_F32_VEC GGML_F32x8
#define GGML_F32_VEC_ZERO GGML_F32x8_ZERO
#define GGML_F32_VEC_SET1 GGML_F32x8_SET1
#define GGML_F32_VEC_LOAD GGML_F32x8_LOAD
#define GGML_F32_VEC_STORE GGML_F32x8_STORE
#define GGML_F32_VEC_FMA GGML_F32x8_FMA
#define GGML_F32_VEC_ADD GGML_F32x8_ADD
#define GGML_F32_VEC_MUL GGML_F32x8_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE

// F16 LASX

#define GGML_F16_STEP 32
#define GGML_F16_EPR 8

// F16 arithmetic is not supported by LASX, so we use F32 instead

#define GGML_F32Cx8 __m256
#define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))

static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
    __m256i a;
    memcpy(&a, x, sizeof(ggml_fp16_t) * 8);
    a = __lasx_xvpermi_d(a, 0 | (1 << 4));
    return __lasx_xvfcvtl_s_h(a);
}

static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
    __m256i a = __lasx_xvfcvt_h_s(y, y);
    a = __lasx_xvpermi_d(a, 0 | (2 << 2));
    memcpy(x, &a, sizeof(ggml_fp16_t) * 8);
}
#define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
#define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)

#define GGML_F32Cx8_FMA GGML_F32x8_FMA
#define GGML_F32Cx8_ADD __lasx_xvfadd_s
#define GGML_F32Cx8_MUL __lasx_xvfmul_s
#define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE

#define GGML_F16_VEC GGML_F32Cx8
#define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO
#define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
#define GGML_F16_VEC_FMA GGML_F32Cx8_FMA
#define GGML_F16_VEC_ADD GGML_F32Cx8_ADD
#define GGML_F16_VEC_MUL GGML_F32Cx8_MUL
#define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE

#elif defined(__loongarch_sx)

#define GGML_SIMD

// F32 LSX

#define GGML_F32_STEP 32
#define GGML_F32_EPR 4

#define GGML_F32x4 __m128
#define GGML_F32x4_ZERO (__m128)__lsx_vldi(0)
#define GGML_F32x4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x))
#define GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0)
#define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
#define GGML_F32x4_ADD __lsx_vfadd_s
#define GGML_F32x4_MUL __lsx_vfmul_s

#define GGML_F32x4_REDUCE(res, x) \
{ \
    int offset = GGML_F32_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
    } \
    __m128i t0 = __lsx_vpickev_w((__m128i)x[0], (__m128i)x[0]); \
    __m128i t1 = __lsx_vpickod_w((__m128i)x[0], (__m128i)x[0]); \
    __m128 t2 = __lsx_vfadd_s((__m128)t0, (__m128)t1); \
    __m128i t3 = __lsx_vpickev_w((__m128i)t2, (__m128i)t2); \
    __m128i t4 = __lsx_vpickod_w((__m128i)t2, (__m128i)t2); \
    __m128 t5 = __lsx_vfadd_s((__m128)t3, (__m128)t4); \
    res = (ggml_float) ((v4f32)t5)[0]; \
}

#define GGML_F32_VEC GGML_F32x4
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 LSX

#define GGML_F16_STEP 32
#define GGML_F16_EPR 4

static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
    float tmp[4];

    tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]);
    tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]);
    tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
    tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);

    return (__m128)__lsx_vld(tmp, 0);
}

static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
    float arr[4];

    __lsx_vst(y, arr, 0);

    x[0] = GGML_CPU_FP32_TO_FP16(arr[0]);
    x[1] = GGML_CPU_FP32_TO_FP16(arr[1]);
    x[2] = GGML_CPU_FP32_TO_FP16(arr[2]);
    x[3] = GGML_CPU_FP32_TO_FP16(arr[3]);
}

#define GGML_F32Cx4 __m128
#define GGML_F32Cx4_ZERO (__m128)__lsx_vldi(0)
#define GGML_F32Cx4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x))
#define GGML_F32Cx4_LOAD(x) (__m128)__lsx_f16x4_load(x)
#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
#define GGML_F32Cx4_FMA GGML_F32x4_FMA
#define GGML_F32Cx4_ADD __lsx_vfadd_s
#define GGML_F32Cx4_MUL __lsx_vfmul_s
#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE

#define GGML_F16_VEC GGML_F32Cx4
#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE

#elif defined(__VXE__) || defined(__VXE2__)

#define GGML_SIMD

// F32 s390x

#define GGML_F32_STEP 32
#define GGML_F32_EPR 4

#define GGML_F32x4 float32x4_t
#define GGML_F32x4_ZERO vec_splats(0.0f)
#define GGML_F32x4_SET1 vec_splats
#define GGML_F32x4_LOAD(p) vec_xl(0, p)
#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
#define GGML_F32x4_ADD vec_add
#define GGML_F32x4_MUL vec_mul
#define GGML_F32x4_REDUCE(res, x) \
{ \
    int offset = GGML_F32_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = vec_add(x[i], x[offset + i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = vec_add(x[i], x[offset + i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = vec_add(x[i], x[offset + i]); \
    } \
    float32x4_t tmp = x[0] + vec_reve(x[0]); \
    res = tmp[0] + tmp[1]; \
}

#define GGML_F32_VEC GGML_F32x4
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 s390x
#define GGML_F16_STEP GGML_F32_STEP
#define GGML_F16_EPR GGML_F32_EPR

static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
    float tmp[4];

    for (int i = 0; i < 4; i++) {
        tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
    }

    // note: keep type-cast here to prevent compiler bugs
    // see: https://github.com/ggml-org/llama.cpp/issues/12846
    return vec_xl(0, (const float *)(tmp));
}

static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
    float arr[4];

    // note: keep type-cast here to prevent compiler bugs
    // see: https://github.com/ggml-org/llama.cpp/issues/12846
    vec_xst(v_y, 0, (float *)(arr));

    for (int i = 0; i < 4; i++) {
        x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
    }
}

#define GGML_F16_VEC GGML_F32x4
#define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
#define GGML_F16_VEC_SET1 GGML_F32x4_SET1
#define GGML_F16_VEC_LOAD(p, i) __lzs_f16cx4_load(p)
#define GGML_F16_VEC_STORE(p, r, i) __lzs_f16cx4_store(p, r[i])
#define GGML_F16_VEC_FMA GGML_F32x4_FMA
#define GGML_F16_VEC_ADD GGML_F32x4_ADD
#define GGML_F16_VEC_MUL GGML_F32x4_MUL
#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE

#elif defined(__riscv_v_intrinsic)

// compatible with vlen >= 128

#define GGML_SIMD

// F32

#define GGML_F32_STEP 16
#define GGML_F32_EPR 4

#define GGML_F32x4 vfloat32m1_t
#define GGML_F32x4_ZERO __riscv_vfmv_v_f_f32m1(0.0f, GGML_F32_EPR)
#define GGML_F32x4_SET1(x) __riscv_vfmv_v_f_f32m1(x, GGML_F32_EPR)
#define GGML_F32x4_LOAD(x) __riscv_vle32_v_f32m1(x, GGML_F32_EPR)
#define GGML_F32x4_STORE(b, v) __riscv_vse32_v_f32m1(b, v, GGML_F32_EPR)
#define GGML_F32x4_FMA(a, b, c) __riscv_vfmacc_vv_f32m1(a, b, c, GGML_F32_EPR)
#define GGML_F32x4_ADD(a, b) __riscv_vfadd_vv_f32m1(a, b, GGML_F32_EPR)
#define GGML_F32x4_MUL(a, b) __riscv_vfmul_vv_f32m1(a, b, GGML_F32_EPR)

#define GGML_F32_VEC GGML_F32x4
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

#endif

// GGML_F32_ARR / GGML_F16_ARR
// number of registers to use per step
#ifdef GGML_SIMD
#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
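// e.g. on AVX (a worked example of the arithmetic, not an extra definition):
// GGML_F32_STEP = 32 and GGML_F32_EPR = 8, so GGML_F32_ARR = 32/8 = 4 accumulator
// registers are processed per step of the generic loops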
#endif

#ifdef __cplusplus
}
#endif