summaryrefslogtreecommitdiff
path: root/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c
diff options
context:
space:
mode:
authorMitja Felicijan <mitja.felicijan@gmail.com>2026-02-12 20:57:17 +0100
committerMitja Felicijan <mitja.felicijan@gmail.com>2026-02-12 20:57:17 +0100
commitb333b06772c89d96aacb5490d6a219fba7c09cc6 (patch)
tree211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c
downloadllmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz
Engage!
Diffstat (limited to 'llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c')
-rw-r--r--llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c3820
1 files changed, 3820 insertions, 0 deletions
diff --git a/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c b/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c
new file mode 100644
index 0000000..74d699f
--- /dev/null
+++ b/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c
@@ -0,0 +1,3820 @@
+#define GGML_COMMON_IMPL_C
+#include "ggml-common.h"
+#include "ggml-quants.h"
+#include "ggml-impl.h"
+#include "ggml-cpu.h"
+#include "simd-mappings.h"
+
+#include "../../quants.h"
+#include "../../ggml-cpu-impl.h"
+
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+#include <stdlib.h> // for qsort
+#include <stdio.h> // for GGML_ASSERT
+
+#define GROUP_MAX_EPS 1e-15f
+#define GROUP_MAX_EPS_IQ3_XXS 1e-8f
+#define GROUP_MAX_EPS_IQ2_S 1e-8f
+#define GROUP_MAX_EPS_IQ1_M 1e-7f
+#define GROUP_MAX_EPS_IQ1_S 1e-12f
+
+#define UNUSED GGML_UNUSED
+
+// some compilers don't provide _mm256_set_m128i, e.g. gcc 7
+#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
+// multiply the int8 lanes of x and y, then add neighbouring results twice:
+// the return value holds four int32 lanes, each the sum of four adjacent products
+static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
+    // _mm_maddubs_epi16 takes its first operand as unsigned bytes,
+    // so hand it |x| and move the sign of x over to y
+    const __m128i x_abs    = _mm_sign_epi8(x, x);
+    const __m128i y_signed = _mm_sign_epi8(y, x);
+    // byte products, neighbouring pairs summed into int16 lanes
+    const __m128i prod16 = _mm_maddubs_epi16(x_abs, y_signed);
+    // neighbouring int16 lanes summed into int32 lanes
+    return _mm_madd_epi16(_mm_set1_epi16(1), prod16);
+}
+
+#if __AVX__ || __AVX2__ || __AVX512F__
+// horizontal sum of the 8 float lanes of x
+static inline float hsum_float_8(const __m256 x) {
+    const __m128 upper = _mm256_extractf128_ps(x, 1);
+    const __m128 lower = _mm256_castps256_ps128(x);
+    // 8 -> 4 partial sums
+    const __m128 sum4 = _mm_add_ps(upper, lower);
+    // 4 -> 2 partial sums
+    const __m128 sum2 = _mm_add_ps(sum4, _mm_movehl_ps(sum4, sum4));
+    // 2 -> 1, only lane 0 is meaningful from here on
+    const __m128 sum1 = _mm_add_ss(sum2, _mm_movehdup_ps(sum2));
+    return _mm_cvtss_f32(sum1);
+}
+
+// horizontal sum of the 8 int32 lanes of a
+static inline int hsum_i32_8(const __m256i a) {
+    // fold the upper 128 bits onto the lower ones: 8 -> 4 lanes
+    const __m128i lower = _mm256_castsi256_si128(a);
+    const __m128i upper = _mm256_extractf128_si256(a, 1);
+    const __m128i s4    = _mm_add_epi32(lower, upper);
+    // fold the upper 64 bits onto the lower ones: 4 -> 2 lanes
+    const __m128i s2    = _mm_add_epi32(_mm_unpackhi_epi64(s4, s4), s4);
+    // swap neighbouring lanes and add: lane 0 now holds the total
+    const __m128i s2_sw = _mm_shuffle_epi32(s2, _MM_SHUFFLE(2, 3, 0, 1));
+    return _mm_cvtsi128_si32(_mm_add_epi32(s2, s2_sw));
+}
+
+// horizontal sum of the 4 int32 lanes of a
+static inline int hsum_i32_4(const __m128i a) {
+    // fold the upper 64 bits onto the lower ones: 4 -> 2 lanes
+    const __m128i s2    = _mm_add_epi32(_mm_unpackhi_epi64(a, a), a);
+    // swap neighbouring lanes and add: lane 0 now holds the total
+    const __m128i s2_sw = _mm_shuffle_epi32(s2, _MM_SHUFFLE(2, 3, 0, 1));
+    return _mm_cvtsi128_si32(_mm_add_epi32(s2, s2_sw));
+}
+
+#if defined(__AVX2__) || defined(__AVX512F__)
+// multiply the int8 lanes of x and y and add neighbouring products into int16 lanes
+static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
+    // the first operand of maddubs is read as unsigned bytes: pass |x| and move the sign of x onto y
+    const __m256i x_abs    = _mm256_sign_epi8(x, x);
+    const __m256i y_signed = _mm256_sign_epi8(y, x);
+    return _mm256_maddubs_epi16(x_abs, y_signed);
+}
+
+// expand the 32 bits stored at x into 32 bytes: 0xFF where the bit is set, 0x00 where it is clear
+static inline __m256i bytes_from_bits_32(const uint8_t * x) {
+    uint32_t bits;
+    memcpy(&bits, x, sizeof(bits));
+
+    // source byte k is replicated into output bytes 8k .. 8k+7
+    const __m256i replicate = _mm256_set_epi64x(
+            0x0303030303030303, 0x0202020202020202,
+            0x0101010101010101, 0x0000000000000000);
+    // within every group of 8 output bytes, byte j has all bits set except bit j
+    const __m256i all_but_own_bit = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
+    const __m256i all_ones        = _mm256_set1_epi64x(-1);
+
+    const __m256i spread = _mm256_shuffle_epi8(_mm256_set1_epi32(bits), replicate);
+    // after the OR a byte is 0xFF exactly when its own bit was set in the source
+    return _mm256_cmpeq_epi8(_mm256_or_si256(spread, all_but_own_bit), all_ones);
+}
+
+// Expand the 32 4-bit fields stored in the 16 bytes at rsi into 32 bytes, each in [ 0 .. 15 ]
+// The low nibbles land in the lower 128 bits of the result, the high nibbles in the upper 128 bits
+static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
+{
+    const __m128i packed  = _mm_loadu_si128((const __m128i *)rsi);
+    const __m128i shifted = _mm_srli_epi16(packed, 4);
+    const __m256i both    = MM256_SET_M128I(shifted, packed);
+    // keep only the low 4 bits of every byte
+    return _mm256_and_si256(_mm256_set1_epi8( 0xF ), both);
+}
+
+// sum neighbouring int16 lanes of x into int32 lanes and convert those to float
+static inline __m256 sum_i16_pairs_float(const __m256i x) {
+    // multiplying by 1 inside madd leaves just the pairwise addition
+    const __m256i pair_sums = _mm256_madd_epi16(_mm256_set1_epi16(1), x);
+    return _mm256_cvtepi32_ps(pair_sums);
+}
+
+// multiply the bytes of ax (read as unsigned) with the bytes of sy (read as signed),
+// sum every group of four adjacent products into an int32 lane and return the lanes as floats
+static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
+    __m256i sums32;
+#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
+    sums32 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), ax, sy);
+#elif defined(__AVXVNNI__)
+    sums32 = _mm256_dpbusd_avx_epi32(_mm256_setzero_si256(), ax, sy);
+#else
+    // without VNNI: byte products summed pairwise into int16, then pairwise again into int32
+    const __m256i prod16 = _mm256_maddubs_epi16(ax, sy);
+    sums32 = _mm256_madd_epi16(_mm256_set1_epi16(1), prod16);
+#endif
+    return _mm256_cvtepi32_ps(sums32);
+}
+
+// signed int8 multiply of x and y, neighbouring results added twice, returned as a float vector
+static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
+#if __AVXVNNIINT8__
+    // signed x signed dot product instruction, accumulating onto zero
+    return _mm256_cvtepi32_ps(_mm256_dpbssd_epi32(_mm256_setzero_si256(), x, y));
+#else
+    // reuse the unsigned x signed helper: pass |x| together with y carrying the sign of x
+    const __m256i x_abs    = _mm256_sign_epi8(x, x);
+    const __m256i y_signed = _mm256_sign_epi8(y, x);
+    return mul_sum_us8_pairs_float(x_abs, y_signed);
+#endif
+}
+
+// Pack 32 nibbles, given one per byte in `bytes`, into 16 bytes.
+// Each 16-bit lane 0000_abcd_0000_efgh of the input becomes the single output byte abcd_efgh.
+static inline __m128i packNibbles( __m256i bytes )
+{
+    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
+    // _mm256_cvtepi16_epi8 (vpmovwb) needs AVX512BW + AVX512VL, so AVX512F alone must not select this path
+#if defined(__AVX512BW__) && defined(__AVX512VL__)
+    const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4);   // 0000_0000_abcd_0000
+    bytes = _mm256_or_si256(bytes, bytes_srli_4);               // 0000_abcd_abcd_efgh
+    return _mm256_cvtepi16_epi8(bytes);                         // abcd_efgh (truncation drops the upper byte)
+#else
+    const __m256i lowByte = _mm256_set1_epi16( 0xFF );
+    __m256i high = _mm256_andnot_si256( lowByte, bytes );       // 0000_abcd_0000_0000
+    __m256i low = _mm256_and_si256( lowByte, bytes );           // 0000_0000_0000_efgh
+    high = _mm256_srli_epi16( high, 4 );                        // 0000_0000_abcd_0000
+    bytes = _mm256_or_si256( low, high );                       // 0000_0000_abcd_efgh
+
+    // Compress uint16_t lanes into bytes
+    // every lane is <= 0xFF at this point, so the unsigned saturation of packus never triggers
+    __m128i r0 = _mm256_castsi256_si128( bytes );
+    __m128i r1 = _mm256_extracti128_si256( bytes, 1 );
+    return _mm_packus_epi16( r0, r1 );
+#endif
+}
+#elif defined(__AVX__)
+// 128-bit variant: pack the nibbles of bytes1 and bytes2 (one nibble per byte) into 16 bytes,
+// bytes1 providing the lower 8 output bytes and bytes2 the upper 8
+static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
+{
+    // Within every 16-bit lane: 0000_abcd_0000_efgh -> 0000_0000_abcd_efgh
+    const __m128i low_byte_mask = _mm_set1_epi16( 0xFF );
+
+    const __m128i hi1     = _mm_srli_epi16( _mm_andnot_si128( low_byte_mask, bytes1 ), 4 );
+    const __m128i lo1     = _mm_and_si128( low_byte_mask, bytes1 );
+    const __m128i merged1 = _mm_or_si128( lo1, hi1 );
+
+    const __m128i hi2     = _mm_srli_epi16( _mm_andnot_si128( low_byte_mask, bytes2 ), 4 );
+    const __m128i lo2     = _mm_and_si128( low_byte_mask, bytes2 );
+    const __m128i merged2 = _mm_or_si128( lo2, hi2 );
+
+    // narrow the 16-bit lanes to bytes
+    return _mm_packus_epi16( merged1, merged2 );
+}
+
+// 128-bit variant: multiply the int8 lanes of x and y and add neighbouring products into int16 lanes
+static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
+    // the first operand of maddubs is read as unsigned bytes: pass |x| and move the sign of x onto y
+    const __m128i x_abs    = _mm_sign_epi8(x, x);
+    const __m128i y_signed = _mm_sign_epi8(y, x);
+    return _mm_maddubs_epi16(x_abs, y_signed);
+}
+
+// expand the 32 bits stored at x into 32 bytes: 0xFF where the bit is set, 0x00 where it is clear
+// (built from two 128-bit halves)
+static inline __m256i bytes_from_bits_32(const uint8_t * x) {
+    uint32_t bits;
+    memcpy(&bits, x, sizeof(bits));
+
+    const __m128i word            = _mm_set1_epi32(bits);
+    const __m128i all_ones        = _mm_set1_epi64x(-1);
+    // within every group of 8 bytes, byte j has all bits set except bit j
+    const __m128i all_but_own_bit = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe);
+    // source bytes 0 and 1 are replicated into the low half, bytes 2 and 3 into the high half
+    const __m128i replicate_lo    = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
+    const __m128i replicate_hi    = _mm_set_epi64x(0x0303030303030303, 0x0202020202020202);
+
+    const __m128i spread_lo = _mm_or_si128(_mm_shuffle_epi8(word, replicate_lo), all_but_own_bit);
+    const __m128i spread_hi = _mm_or_si128(_mm_shuffle_epi8(word, replicate_hi), all_but_own_bit);
+
+    // a byte equals 0xFF exactly when its own bit was set in the source
+    const __m128i mask_lo = _mm_cmpeq_epi8(spread_lo, all_ones);
+    const __m128i mask_hi = _mm_cmpeq_epi8(spread_hi, all_ones);
+    return MM256_SET_M128I(mask_hi, mask_lo);
+}
+
+// Expand the 32 4-bit fields stored in the 16 bytes at rsi into 32 bytes, each in [ 0 .. 15 ]
+// The low nibbles land in the lower 128 bits of the result, the high nibbles in the upper 128 bits
+static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
+{
+    const __m128i nibble_mask = _mm_set1_epi8(0xF);
+    // 16 packed source bytes
+    const __m128i packed = _mm_loadu_si128((const __m128i *)rsi);
+    const __m128i high   = _mm_and_si128(nibble_mask, _mm_srli_epi16(packed, 4));
+    const __m128i low    = _mm_and_si128(nibble_mask, packed);
+    return MM256_SET_M128I(high, low);
+}
+
+// sum neighbouring int16 lanes of xl (lower half of the result) and xh (upper half) into int32 lanes
+// and return the eight sums as floats
+static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) {
+    // multiplying by 1 inside madd leaves just the pairwise addition
+    const __m128i unit    = _mm_set1_epi16(1);
+    const __m128i sums_lo = _mm_madd_epi16(unit, xl);
+    const __m128i sums_hi = _mm_madd_epi16(unit, xh);
+    return _mm256_cvtepi32_ps(MM256_SET_M128I(sums_hi, sums_lo));
+}
+
+// multiply the bytes of ax (read as unsigned) with the bytes of sy (read as signed),
+// sum every group of four adjacent products and return the sums as floats; works on 128-bit halves
+static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
+    // lower halves
+    const __m128i a_lo = _mm256_castsi256_si128(ax);
+    const __m128i s_lo = _mm256_castsi256_si128(sy);
+    // upper halves
+    const __m128i a_hi = _mm256_extractf128_si256(ax, 1);
+    const __m128i s_hi = _mm256_extractf128_si256(sy, 1);
+    // byte products, neighbouring pairs summed into int16 lanes
+    const __m128i prod_lo = _mm_maddubs_epi16(a_lo, s_lo);
+    const __m128i prod_hi = _mm_maddubs_epi16(a_hi, s_hi);
+    return sum_i16_pairs_float(prod_hi, prod_lo);
+}
+
+// signed int8 multiply of x and y, neighbouring results added twice, returned as a float vector
+// (works on 128-bit halves)
+static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
+    // lower halves
+    const __m128i x_lo = _mm256_castsi256_si128(x);
+    const __m128i y_lo = _mm256_castsi256_si128(y);
+    // upper halves
+    const __m128i x_hi = _mm256_extractf128_si256(x, 1);
+    const __m128i y_hi = _mm256_extractf128_si256(y, 1);
+    // maddubs reads its first operand as unsigned: use |x| and move the sign of x onto y
+    const __m128i x_abs_lo    = _mm_sign_epi8(x_lo, x_lo);
+    const __m128i x_abs_hi    = _mm_sign_epi8(x_hi, x_hi);
+    const __m128i y_signed_lo = _mm_sign_epi8(y_lo, x_lo);
+    const __m128i y_signed_hi = _mm_sign_epi8(y_hi, x_hi);
+    // byte products, neighbouring pairs summed into int16 lanes
+    const __m128i prod_lo = _mm_maddubs_epi16(x_abs_lo, y_signed_lo);
+    const __m128i prod_hi = _mm_maddubs_epi16(x_abs_hi, y_signed_hi);
+    return sum_i16_pairs_float(prod_hi, prod_lo);
+}
+
+// larger version of mul_sum_i8_pairs_float where x and y are each represented by four 128-bit vectors
+// the *_1_* inputs are reduced into the lower four float lanes, the *_2_* inputs into the upper four
+static inline __m256 mul_sum_i8_quad_float(const __m128i x_1_0, const __m128i x_1_1, const __m128i x_2_0, const __m128i x_2_1,
+                                           const __m128i y_1_0, const __m128i y_1_1, const __m128i y_2_0, const __m128i y_2_1) {
+    const __m128i unit = _mm_set1_epi16(1);
+
+    // int8 products summed pairwise into int16 lanes
+    const __m128i prod16_1_0 = mul_add_epi8_sse(x_1_0, y_1_0);
+    const __m128i prod16_1_1 = mul_add_epi8_sse(x_1_1, y_1_1);
+    const __m128i prod16_2_0 = mul_add_epi8_sse(x_2_0, y_2_0);
+    const __m128i prod16_2_1 = mul_add_epi8_sse(x_2_1, y_2_1);
+
+    // int16 lanes summed pairwise into int32 lanes
+    const __m128i sum32_1_0 = _mm_madd_epi16(prod16_1_0, unit);
+    const __m128i sum32_1_1 = _mm_madd_epi16(prod16_1_1, unit);
+    const __m128i sum32_2_0 = _mm_madd_epi16(prod16_2_0, unit);
+    const __m128i sum32_2_1 = _mm_madd_epi16(prod16_2_1, unit);
+
+    // merge the two vectors belonging to each group
+    const __m128i group_1 = _mm_add_epi32(sum32_1_0, sum32_1_1);
+    const __m128i group_2 = _mm_add_epi32(sum32_2_0, sum32_2_1);
+    return _mm256_cvtepi32_ps(MM256_SET_M128I(group_2, group_1));
+}
+
+// quad fp16 delta calculation
+// lower four lanes: FP16_TO_FP32(x0) * FP16_TO_FP32(y0), upper four lanes: FP16_TO_FP32(x1) * FP16_TO_FP32(y1)
+// NOTE(review): the parameters are declared float although they are fed to the fp16 conversion macro - confirm this is intended
+static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const float x1, const float y1) {
+    // GGML_CPU_FP16_TO_FP32 is faster than Intel F16C
+    const float delta_0 = GGML_CPU_FP16_TO_FP32(x0) * GGML_CPU_FP16_TO_FP32(y0);
+    const float delta_1 = GGML_CPU_FP16_TO_FP32(x1) * GGML_CPU_FP16_TO_FP32(y1);
+    return _mm256_set_m128(_mm_set1_ps(delta_1), _mm_set1_ps(delta_0));
+}
+
+// same layout as quad_fp16_delta_float, but the x factors go through GGML_CPU_E8M0_TO_FP32_HALF
+// lower four lanes: E8M0_HALF(x0) * FP16_TO_FP32(y0), upper four lanes: E8M0_HALF(x1) * FP16_TO_FP32(y1)
+static inline __m256 quad_mx_delta_float(const uint8_t x0, const float y0, const uint8_t x1, const float y1) {
+    const float delta_0 = GGML_CPU_E8M0_TO_FP32_HALF(x0) * GGML_CPU_FP16_TO_FP32(y0);
+    const float delta_1 = GGML_CPU_E8M0_TO_FP32_HALF(x1) * GGML_CPU_FP16_TO_FP32(y1);
+    return _mm256_set_m128(_mm_set1_ps(delta_1), _mm_set1_ps(delta_0));
+}
+#endif
+#elif defined(__SSSE3__)
+// horizontal sum of the 16 floats held in a, b, c and d
+static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) {
+    // every hadd adds neighbouring lanes, halving the number of partial sums
+    const __m128 ab    = _mm_hadd_ps(a, b);
+    const __m128 cd    = _mm_hadd_ps(c, d);
+    const __m128 abcd  = _mm_hadd_ps(ab, cd);
+    const __m128 pairs = _mm_hadd_ps(abcd, abcd);
+    const __m128 total = _mm_hadd_ps(pairs, pairs);
+
+    return _mm_cvtss_f32(total);
+}
+#endif // __AVX__ || __AVX2__ || __AVX512F__
+#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
+
+// Quantize k floats from x into q8_0 blocks of 32 values written to vy.
+// Per block: the scale d = max(|x|) / 127 is stored in y[i].d (converted with GGML_CPU_FP32_TO_FP16)
+// and the values, multiplied by 127 / max(|x|) (0 for an all-zero block) and rounded to nearest,
+// are stored as bytes in y[i].qs.
+// k must be a multiple of QK8_0. Without AVX / AVX2 the work is handed to quantize_row_q8_0_ref.
+void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
+    assert(QK8_0 == 32);
+    assert(k % QK8_0 == 0);
+    // k is 64-bit, keep the block count 64-bit as well instead of narrowing it to int
+    const int64_t nb = k / QK8_0;
+
+    block_q8_0 * GGML_RESTRICT y = vy;
+
+#if defined(__AVX2__) || defined(__AVX__)
+    for (int64_t i = 0; i < nb; i++) {
+        // Load elements into 4 AVX vectors
+        __m256 v0 = _mm256_loadu_ps( x );
+        __m256 v1 = _mm256_loadu_ps( x + 8 );
+        __m256 v2 = _mm256_loadu_ps( x + 16 );
+        __m256 v3 = _mm256_loadu_ps( x + 24 );
+        x += 32;
+
+        // Compute max(abs(e)) for the block
+        // andnot with the sign bit clears it, i.e. takes the absolute value
+        const __m256 signBit = _mm256_set1_ps( -0.0f );
+        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
+
+        // horizontal max: 8 -> 4 -> 2 -> 1 lanes
+        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
+        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
+        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
+        const float maxScalar = _mm_cvtss_f32( max4 );
+
+        // Quantize these floats
+        const float d = maxScalar / 127.f;
+        y[i].d = GGML_CPU_FP32_TO_FP16(d);
+        // an all-zero block gets multiplier 0 instead of dividing by zero
+        const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
+        const __m256 mul = _mm256_set1_ps( id );
+
+        // Apply the multiplier
+        v0 = _mm256_mul_ps( v0, mul );
+        v1 = _mm256_mul_ps( v1, mul );
+        v2 = _mm256_mul_ps( v2, mul );
+        v3 = _mm256_mul_ps( v3, mul );
+
+        // Round to nearest integer
+        // _mm256_round_ps takes an _MM_FROUND_* immediate (the _MM_ROUND_* macros are MXCSR values)
+        v0 = _mm256_round_ps( v0, _MM_FROUND_TO_NEAREST_INT );
+        v1 = _mm256_round_ps( v1, _MM_FROUND_TO_NEAREST_INT );
+        v2 = _mm256_round_ps( v2, _MM_FROUND_TO_NEAREST_INT );
+        v3 = _mm256_round_ps( v3, _MM_FROUND_TO_NEAREST_INT );
+
+        // Convert floats to integers
+        __m256i i0 = _mm256_cvtps_epi32( v0 );
+        __m256i i1 = _mm256_cvtps_epi32( v1 );
+        __m256i i2 = _mm256_cvtps_epi32( v2 );
+        __m256i i3 = _mm256_cvtps_epi32( v3 );
+
+#if defined(__AVX2__)
+        // Convert int32 to int16
+        i0 = _mm256_packs_epi32( i0, i1 );  // 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
+        i2 = _mm256_packs_epi32( i2, i3 );  // 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31
+        // Convert int16 to int8
+        i0 = _mm256_packs_epi16( i0, i2 );  // 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+
+        // We got our precious signed bytes, but the order is now wrong
+        // These AVX2 pack instructions process 16-byte pieces independently
+        // The following instruction is fixing the order
+        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
+        i0 = _mm256_permutevar8x32_epi32( i0, perm );
+
+        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
+#else
+        // Since we don't have in AVX some necessary functions,
+        // we split the registers in half and call AVX2 analogs from SSE
+        __m128i ni0 = _mm256_castsi256_si128( i0 );
+        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
+        __m128i ni2 = _mm256_castsi256_si128( i1 );
+        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
+        __m128i ni4 = _mm256_castsi256_si128( i2 );
+        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
+        __m128i ni6 = _mm256_castsi256_si128( i3 );
+        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
+
+        // Convert int32 to int16
+        ni0 = _mm_packs_epi32( ni0, ni1 );
+        ni2 = _mm_packs_epi32( ni2, ni3 );
+        ni4 = _mm_packs_epi32( ni4, ni5 );
+        ni6 = _mm_packs_epi32( ni6, ni7 );
+        // Convert int16 to int8
+        ni0 = _mm_packs_epi16( ni0, ni2 );
+        ni4 = _mm_packs_epi16( ni4, ni6 );
+
+        // the 128-bit packs keep the values in order, so no permutation is needed here
+        _mm_storeu_si128((__m128i *)(y[i].qs +  0), ni0);
+        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
+#endif
+    }
+#else
+    GGML_UNUSED(nb);
+    // scalar
+    quantize_row_q8_0_ref(x, y, k);
+#endif
+}
+
+// Quantize k floats from x into q8_1 blocks of 32 values written to vy.
+// Per block: the scale d = max(|x|) / 127 is stored in y[i].d, the rounded values x * (127 / max(|x|))
+// (multiplier 0 for an all-zero block) are stored as bytes in y[i].qs, and d times the sum of the
+// rounded values is stored in y[i].s (d and s are converted with GGML_CPU_FP32_TO_FP16).
+// k must be a multiple of QK8_1. Without AVX / AVX2 the work is handed to quantize_row_q8_1_ref.
+void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
+    // the SIMD loop below consumes exactly 32 floats per block
+    assert(QK8_1 == 32);
+    assert(k % QK8_1 == 0);
+    // k is 64-bit, keep the block count 64-bit as well instead of narrowing it to int
+    const int64_t nb = k / QK8_1;
+
+    block_q8_1 * GGML_RESTRICT y = vy;
+#if defined(__AVX2__) || defined(__AVX__)
+    for (int64_t i = 0; i < nb; i++) {
+        // Load elements into 4 AVX vectors
+        __m256 v0 = _mm256_loadu_ps( x );
+        __m256 v1 = _mm256_loadu_ps( x + 8 );
+        __m256 v2 = _mm256_loadu_ps( x + 16 );
+        __m256 v3 = _mm256_loadu_ps( x + 24 );
+        x += 32;
+
+        // Compute max(abs(e)) for the block
+        // andnot with the sign bit clears it, i.e. takes the absolute value
+        const __m256 signBit = _mm256_set1_ps( -0.0f );
+        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
+
+        // horizontal max: 8 -> 4 -> 2 -> 1 lanes
+        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
+        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
+        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
+        const float max_scalar = _mm_cvtss_f32( max4 );
+
+        // Quantize these floats
+        const float d = max_scalar / 127.f;
+        y[i].d = GGML_CPU_FP32_TO_FP16(d);
+        // an all-zero block gets multiplier 0 instead of dividing by zero
+        const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
+        const __m256 mul = _mm256_set1_ps( id );
+
+        // Apply the multiplier
+        v0 = _mm256_mul_ps( v0, mul );
+        v1 = _mm256_mul_ps( v1, mul );
+        v2 = _mm256_mul_ps( v2, mul );
+        v3 = _mm256_mul_ps( v3, mul );
+
+        // Round to nearest integer
+        // _mm256_round_ps takes an _MM_FROUND_* immediate (the _MM_ROUND_* macros are MXCSR values)
+        v0 = _mm256_round_ps( v0, _MM_FROUND_TO_NEAREST_INT );
+        v1 = _mm256_round_ps( v1, _MM_FROUND_TO_NEAREST_INT );
+        v2 = _mm256_round_ps( v2, _MM_FROUND_TO_NEAREST_INT );
+        v3 = _mm256_round_ps( v3, _MM_FROUND_TO_NEAREST_INT );
+
+        // Convert floats to integers
+        __m256i i0 = _mm256_cvtps_epi32( v0 );
+        __m256i i1 = _mm256_cvtps_epi32( v1 );
+        __m256i i2 = _mm256_cvtps_epi32( v2 );
+        __m256i i3 = _mm256_cvtps_epi32( v3 );
+
+#if defined(__AVX2__)
+        // Compute the sum of the quants and set y[i].s
+        y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))));
+
+        // Convert int32 to int16
+        i0 = _mm256_packs_epi32( i0, i1 );  // 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
+        i2 = _mm256_packs_epi32( i2, i3 );  // 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31
+        // Convert int16 to int8
+        i0 = _mm256_packs_epi16( i0, i2 );  // 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+
+        // We got our precious signed bytes, but the order is now wrong
+        // These AVX2 pack instructions process 16-byte pieces independently
+        // The following instruction is fixing the order
+        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
+        i0 = _mm256_permutevar8x32_epi32( i0, perm );
+
+        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
+#else
+        // Since we don't have in AVX some necessary functions,
+        // we split the registers in half and call AVX2 analogs from SSE
+        __m128i ni0 = _mm256_castsi256_si128( i0 );
+        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
+        __m128i ni2 = _mm256_castsi256_si128( i1 );
+        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
+        __m128i ni4 = _mm256_castsi256_si128( i2 );
+        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
+        __m128i ni6 = _mm256_castsi256_si128( i3 );
+        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
+
+        // Compute the sum of the quants and set y[i].s
+        const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
+        const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
+        y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1)));
+
+        // Convert int32 to int16
+        ni0 = _mm_packs_epi32( ni0, ni1 );
+        ni2 = _mm_packs_epi32( ni2, ni3 );
+        ni4 = _mm_packs_epi32( ni4, ni5 );
+        ni6 = _mm_packs_epi32( ni6, ni7 );
+        // Convert int16 to int8
+        ni0 = _mm_packs_epi16( ni0, ni2 );
+        ni4 = _mm_packs_epi16( ni4, ni6 );
+
+        // the 128-bit packs keep the values in order, so no permutation is needed here
+        _mm_storeu_si128((__m128i *)(y[i].qs +  0), ni0);
+        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
+#endif
+    }
+#else
+    GGML_UNUSED(nb);
+    // scalar
+    quantize_row_q8_1_ref(x, y, k);
+#endif
+}
+
+/* placeholder: no SIMD variant is provided here, the call is forwarded unchanged to quantize_row_q8_K_ref */
+void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
+ quantize_row_q8_K_ref(x, y, k);
+}
+
+//===================================== Dot products =================================
+
+//
+// Helper functions
+//
+
+#if __AVX__ || __AVX2__ || __AVX512F__
+
+// shuffles to pick the required scales in dot products
+// returns the i-th 32-byte row of the table below; i is not range-checked (the table has 4 rows)
+static inline __m256i get_scale_shuffle_q3k(int i) {
+    static const uint8_t shuffle_rows[128] = {
+         0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,     2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+         4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,     6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
+         8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9,    10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
+        12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,    14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,
+    };
+    const __m256i * rows = (const __m256i *)shuffle_rows;
+    return _mm256_loadu_si256(rows + i);
+}
+// returns the i-th 32-byte row of the table below: row r repeats the byte index pair (2r, 2r+1) sixteen times
+// i is not range-checked (the table has 8 rows)
+static inline __m256i get_scale_shuffle_k4(int i) {
+    static const uint8_t shuffle_rows[256] = {
+         0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+         2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
+         4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,
+         6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
+         8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9,
+        10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
+        12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,
+        14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15
+    };
+    const __m256i * rows = (const __m256i *)shuffle_rows;
+    return _mm256_loadu_si256(rows + i);
+}
+// returns the i-th 16-byte row of the table below: row r holds byte index 2r eight times, then 2r+1 eight times
+// i is not range-checked (the table has 8 rows)
+static inline __m128i get_scale_shuffle(int i) {
+    static const uint8_t shuffle_rows[128] = {
+         0, 0, 0, 0, 0, 0, 0, 0,  1, 1, 1, 1, 1, 1, 1, 1,
+         2, 2, 2, 2, 2, 2, 2, 2,  3, 3, 3, 3, 3, 3, 3, 3,
+         4, 4, 4, 4, 4, 4, 4, 4,  5, 5, 5, 5, 5, 5, 5, 5,
+         6, 6, 6, 6, 6, 6, 6, 6,  7, 7, 7, 7, 7, 7, 7, 7,
+         8, 8, 8, 8, 8, 8, 8, 8,  9, 9, 9, 9, 9, 9, 9, 9,
+        10,10,10,10,10,10,10,10, 11,11,11,11,11,11,11,11,
+        12,12,12,12,12,12,12,12, 13,13,13,13,13,13,13,13,
+        14,14,14,14,14,14,14,14, 15,15,15,15,15,15,15,15
+    };
+    const __m128i * rows = (const __m128i *)shuffle_rows;
+    return _mm_loadu_si128(rows + i);
+}
+#endif
+
+// Dot product of a row of q4_0 blocks (vx) with a row of q8_0 blocks (vy) covering n values; the result is written to *s.
+// n must be a multiple of QK8_0 and nrc must be 1; bs, bx and by are not used.
+// Per block the 4-bit quants are offset from [ 0 .. 15 ] to [ -8 .. +7 ], multiplied with the int8 quants of y,
+// summed, and scaled by the product of the two block scales.
+void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+
+    assert(n % qk == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q4_0 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
+
+    int ib = 0;
+    float sumf = 0;
+
+#if defined(__AVX2__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+
+    // Main loop
+    for (; ib < nb; ++ib) {
+        /* Compute combined scale for the block */
+        const __m256 d = _mm256_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );
+
+        __m256i qx = bytes_from_nibbles_32(x[ib].qs);
+
+        // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
+        const __m256i off = _mm256_set1_epi8( 8 );
+        qx = _mm256_sub_epi8( qx, off );
+
+        __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
+
+        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
+
+        /* Multiply q with scale and accumulate */
+        acc = _mm256_fmadd_ps( d, q, acc );
+    }
+
+    sumf = hsum_float_8(acc);
+#elif defined(__AVX__)
+    __m256 accum = _mm256_setzero_ps();
+    // two blocks per iteration; an odd last block is left to the scalar loop at the end
+    for (; ib + 1 < nb; ib += 2) {
+        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
+        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
+        const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs);
+        const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1);
+        const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
+        const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
+
+        // low and high nibbles, each offset into [ -8 .. +7 ]
+        const __m128i q4b_1_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_1), _mm_set1_epi8(8));
+        const __m128i q4b_1_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_1, 4)), _mm_set1_epi8(8));
+        const __m128i q4b_2_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_2), _mm_set1_epi8(8));
+        const __m128i q4b_2_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_2, 4)), _mm_set1_epi8(8));
+
+        const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
+        const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
+        const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
+        const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
+        const __m128i p_1 = _mm_add_epi16(p16_1_0, p16_1_1);
+        const __m128i p_2 = _mm_add_epi16(p16_2_0, p16_2_1);
+        // block ib ends up in the lower four lanes, block ib + 1 in the upper four
+        const __m256 p =  sum_i16_pairs_float(p_2, p_1);
+
+        const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
+        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
+    }
+
+    sumf = hsum_float_8(accum);
+#elif defined(__SSSE3__)
+    // set constants
+    const __m128i lowMask = _mm_set1_epi8(0xF);
+    const __m128i off = _mm_set1_epi8(8);
+
+    // Initialize accumulator with zeros
+    __m128 acc_0 = _mm_setzero_ps();
+    __m128 acc_1 = _mm_setzero_ps();
+    __m128 acc_2 = _mm_setzero_ps();
+    __m128 acc_3 = _mm_setzero_ps();
+
+    // two blocks per iteration; an odd last block is left to the scalar loop at the end
+    for (; ib + 1 < nb; ib += 2) {
+        // prefetch one block ahead; the offset is applied to a byte pointer so that it is
+        // sizeof(block) bytes and not sizeof(block) elements
+        _mm_prefetch((const char *)&x[ib] + sizeof(block_q4_0), _MM_HINT_T0);
+        _mm_prefetch((const char *)&y[ib] + sizeof(block_q8_0), _MM_HINT_T0);
+
+        // Compute combined scale for the block 0 and 1
+        const __m128 d_0_1 = _mm_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );
+
+        const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[ib].qs);
+
+        __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1);
+        __m128i by_0 = _mm_loadu_si128((const __m128i *)y[ib].qs);
+        bx_0 = _mm_sub_epi8(bx_0, off);
+        const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
+
+        __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4));
+        __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[ib].qs + 16));
+        bx_1 = _mm_sub_epi8(bx_1, off);
+        const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
+
+        // prefetch two blocks ahead (byte offsets, see above)
+        _mm_prefetch((const char *)&x[ib] + 2 * sizeof(block_q4_0), _MM_HINT_T0);
+        _mm_prefetch((const char *)&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
+
+        // Compute combined scale for the block 2 and 3
+        const __m128 d_2_3 = _mm_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) );
+
+        const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
+
+        __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3);
+        __m128i by_2 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
+        bx_2 = _mm_sub_epi8(bx_2, off);
+        const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2);
+
+        __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4));
+        __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[ib + 1].qs + 16));
+        bx_3 = _mm_sub_epi8(bx_3, off);
+        const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3);
+
+        // Convert int32_t to float
+        __m128 p0 = _mm_cvtepi32_ps(i32_0);
+        __m128 p1 = _mm_cvtepi32_ps(i32_1);
+        __m128 p2 = _mm_cvtepi32_ps(i32_2);
+        __m128 p3 = _mm_cvtepi32_ps(i32_3);
+
+        // Apply the scale
+        __m128 p0_d = _mm_mul_ps( d_0_1, p0 );
+        __m128 p1_d = _mm_mul_ps( d_0_1, p1 );
+        __m128 p2_d = _mm_mul_ps( d_2_3, p2 );
+        __m128 p3_d = _mm_mul_ps( d_2_3, p3 );
+
+        // Accumulate
+        acc_0 = _mm_add_ps(p0_d, acc_0);
+        acc_1 = _mm_add_ps(p1_d, acc_1);
+        acc_2 = _mm_add_ps(p2_d, acc_2);
+        acc_3 = _mm_add_ps(p3_d, acc_3);
+    }
+
+    sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
+
+#endif
+    // scalar loop: handles the blocks the SIMD code above did not consume
+    // (all of them when no SIMD path is compiled in)
+    for (; ib < nb; ++ib) {
+        int sumi0 = 0;
+        int sumi1 = 0;
+
+        for (int j = 0; j < qk/2; ++j) {
+            // byte j of x carries value j in its low nibble and value j + qk/2 in its high nibble
+            const int v0 = (x[ib].qs[j] & 0x0F) - 8;
+            const int v1 = (x[ib].qs[j] >>   4) - 8;
+
+            sumi0 += (v0 * y[ib].qs[j]);
+            sumi1 += (v1 * y[ib].qs[j + qk/2]);
+        }
+
+        int sumi = sumi0 + sumi1;
+        sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
+    }
+
+    *s = sumf;
+}
+
+// Dot product of a row of q4_1 blocks (vx) with a row of q8_1 blocks (vy) covering n values; the result is written to *s.
+// n must be a multiple of QK8_1 and nrc must be 1. With AVX / AVX2 the sum is built here from two parts:
+// the scaled quant products and the x[ib].m * y[ib].s terms. Otherwise the generic implementation is called.
+void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    const int qk = QK8_1;
+    const int nb = n / qk;
+
+    assert(n % qk == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q4_1 * GGML_RESTRICT x = vx;
+    const block_q8_1 * GGML_RESTRICT y = vy;
+
+    int ib = 0;
+
+#if defined(__AVX2__) || defined(__AVX__)
+    // vector accumulator for d0 * d1 * (qx . qy)
+    __m256 acc_dot = _mm256_setzero_ps();
+    // scalar accumulator for the m * s terms
+    float acc_ms = 0;
+
+    while (ib < nb) {
+        acc_ms += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
+
+        // combined scale of the two blocks, broadcast to all lanes
+        const float scale_x = GGML_CPU_FP16_TO_FP32(x[ib].d);
+        const float scale_y = GGML_CPU_FP16_TO_FP32(y[ib].d);
+        const __m256 scale  = _mm256_mul_ps( _mm256_set1_ps( scale_x ), _mm256_set1_ps( scale_y ) );
+
+        // 16 bytes of 4-bit fields expanded to 32 bytes in [ 0 .. 15 ], and the 32 quants of y
+        const __m256i quants_x = bytes_from_nibbles_32(x[ib].qs);
+        const __m256i quants_y = _mm256_loadu_si256( (const __m256i *)y[ib].qs );
+
+        const __m256 dot = mul_sum_us8_pairs_float(quants_x, quants_y);
+
+        // acc_dot += scale * dot
+#if defined(__AVX2__)
+        acc_dot = _mm256_fmadd_ps( scale, dot, acc_dot );
+#else
+        acc_dot = _mm256_add_ps( _mm256_mul_ps( scale, dot ), acc_dot );
+#endif
+        ++ib;
+    }
+
+    *s = hsum_float_8(acc_dot) + acc_ms;
+#else
+    UNUSED(nb);
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(ib);
+    ggml_vec_dot_q4_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+
+// Dot product of one MXFP4-quantized row (vx) with one Q8_0-quantized row (vy).
+// n is the element count and must be a multiple of QK_MXFP4; the result goes to *s.
+// Only nrc == 1 is supported; bs, bx and by are not used.
+void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    assert(n % QK_MXFP4 == 0);
+    static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_mxfp4 * GGML_RESTRICT xq = vx;
+    const block_q8_0  * GGML_RESTRICT yq = vy;
+
+    const int nblocks = n / QK_MXFP4;
+
+    // blk is shared by the SIMD loop and the scalar tail loop at the bottom.
+    int   blk   = 0;
+    float total = 0;
+
+#if defined __AVX2__
+
+    // 16-entry byte table used to translate 4-bit codes via a byte shuffle
+    const __m128i lut      = _mm_loadu_si128((const __m128i*)kvalues_mxfp4);
+    const __m128i nib_mask = _mm_set1_epi8(0x0f);
+    const __m256i ones16   = _mm256_set1_epi16(1);
+
+    // Two independent accumulators, one per block of each pair
+    __m256 acc_a = _mm256_setzero_ps();
+    __m256 acc_b = _mm256_setzero_ps();
+
+    // Two blocks per iteration
+    for (; blk + 1 < nblocks; blk += 2) {
+        const __m128i packed_a = _mm_loadu_si128((const __m128i*)xq[blk + 0].qs);
+        const __m128i packed_b = _mm_loadu_si128((const __m128i*)xq[blk + 1].qs);
+        const __m256i act_a    = _mm256_loadu_si256((const __m256i *)yq[blk + 0].qs);
+        const __m256i act_b    = _mm256_loadu_si256((const __m256i *)yq[blk + 1].qs);
+
+        // Low nibbles end up in the lower 128-bit half, high nibbles in the upper half
+        const __m128i lo_a = _mm_shuffle_epi8(lut, _mm_and_si128(packed_a, nib_mask));
+        const __m128i hi_a = _mm_shuffle_epi8(lut, _mm_and_si128(_mm_srli_epi16(packed_a, 4), nib_mask));
+        const __m128i lo_b = _mm_shuffle_epi8(lut, _mm_and_si128(packed_b, nib_mask));
+        const __m128i hi_b = _mm_shuffle_epi8(lut, _mm_and_si128(_mm_srli_epi16(packed_b, 4), nib_mask));
+        const __m256i vals_a = MM256_SET_M128I(hi_a, lo_a);
+        const __m256i vals_b = MM256_SET_M128I(hi_b, lo_b);
+
+        // 8-bit products summed to 16 bits, then pairwise widened to 32 bits
+        const __m256i dot_a = _mm256_madd_epi16(mul_add_epi8(vals_a, act_a), ones16);
+        const __m256i dot_b = _mm256_madd_epi16(mul_add_epi8(vals_b, act_b), ones16);
+
+        // Per-block scale: Q8_0 fp16 delta times the MXFP4 shared exponent
+        const __m256 scale_a = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(yq[blk + 0].d)*GGML_CPU_E8M0_TO_FP32_HALF(xq[blk + 0].e));
+        const __m256 scale_b = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(yq[blk + 1].d)*GGML_CPU_E8M0_TO_FP32_HALF(xq[blk + 1].e));
+
+        acc_a = _mm256_fmadd_ps(scale_a, _mm256_cvtepi32_ps(dot_a), acc_a);
+        acc_b = _mm256_fmadd_ps(scale_b, _mm256_cvtepi32_ps(dot_b), acc_b);
+    }
+
+    total = hsum_float_8(_mm256_add_ps(acc_a, acc_b));
+
+#elif defined __AVX__
+    // 16-entry byte table used to translate 4-bit codes via a byte shuffle
+    const __m128i lut      = _mm_loadu_si128((const __m128i*)kvalues_mxfp4);
+    const __m128i nib_mask = _mm_set1_epi8(0x0f);
+
+    __m256 vacc = _mm256_setzero_ps();
+
+    // Two blocks per iteration, processed as four 128-bit halves
+    for (; blk + 1 < nblocks; blk += 2) {
+        const __m128i packed_a = _mm_loadu_si128((const __m128i *)xq[blk + 0].qs);
+        const __m128i packed_b = _mm_loadu_si128((const __m128i *)xq[blk + 1].qs);
+
+        const __m128i act_a_lo = _mm_loadu_si128((const __m128i *)yq[blk + 0].qs);
+        const __m128i act_a_hi = _mm_loadu_si128((const __m128i *)yq[blk + 0].qs + 1);
+        const __m128i act_b_lo = _mm_loadu_si128((const __m128i *)yq[blk + 1].qs);
+        const __m128i act_b_hi = _mm_loadu_si128((const __m128i *)yq[blk + 1].qs + 1);
+
+        const __m128i val_a_lo = _mm_shuffle_epi8(lut, _mm_and_si128(packed_a, nib_mask));
+        const __m128i val_a_hi = _mm_shuffle_epi8(lut, _mm_and_si128(_mm_srli_epi16(packed_a, 4), nib_mask));
+        const __m128i val_b_lo = _mm_shuffle_epi8(lut, _mm_and_si128(packed_b, nib_mask));
+        const __m128i val_b_hi = _mm_shuffle_epi8(lut, _mm_and_si128(_mm_srli_epi16(packed_b, 4), nib_mask));
+
+        const __m256 dots   = mul_sum_i8_quad_float(val_a_lo, val_a_hi, val_b_lo, val_b_hi, act_a_lo, act_a_hi, act_b_lo, act_b_hi);
+        const __m256 scales = quad_mx_delta_float(xq[blk].e, yq[blk].d, xq[blk + 1].e, yq[blk + 1].d);
+        vacc = _mm256_add_ps(_mm256_mul_ps(scales, dots), vacc);
+    }
+
+    total = hsum_float_8(vacc);
+
+#endif
+    // Scalar loop: handles the blocks left over by the SIMD loops above,
+    // or every block when neither SIMD path is compiled in.
+    for (; blk < nblocks; ++blk) {
+        const float d = GGML_CPU_FP16_TO_FP32(yq[blk].d)*GGML_CPU_E8M0_TO_FP32_HALF(xq[blk].e);
+        int isum = 0;
+        for (int j = 0; j < QK_MXFP4/2; ++j) {
+            // low nibble pairs with the first half of y, high nibble with the second half
+            isum += yq[blk].qs[j +          0] * kvalues_mxfp4[xq[blk].qs[j] & 0xf];
+            isum += yq[blk].qs[j + QK_MXFP4/2] * kvalues_mxfp4[xq[blk].qs[j] >>  4];
+        }
+        total += d * (isum);
+    }
+    *s = total;
+}
+
+// Dot product of one Q5_0-quantized row (vx) with one Q8_0-quantized row (vy).
+// n is the element count and must be a multiple of QK8_0; the result goes to *s.
+// Only nrc == 1 is supported. Without AVX/AVX2 the work is forwarded to the
+// generic implementation.
+void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    const int qk = QK8_0;
+
+    assert(n % qk == 0);
+    assert(qk == QK5_0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const int nblocks = n / qk;
+
+    const block_q5_0 * GGML_RESTRICT xq = vx;
+    const block_q8_0 * GGML_RESTRICT yq = vy;
+
+#if defined(__AVX2__)
+    // OR-ed into every byte whose mask byte from qh is clear (see loop body)
+    const __m256i hi_fill = _mm256_set1_epi8((char)0xF0);
+
+    __m256 vsum = _mm256_setzero_ps();
+
+    for (int i = 0; i < nblocks; ++i) {
+        // Product of the two per-block fp16 scales, broadcast to all lanes
+        const __m256 scale = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(xq[i].d) * GGML_CPU_FP16_TO_FP32(yq[i].d));
+
+        const __m256i nibbles = bytes_from_nibbles_32(xq[i].qs);
+        // NOTE(review): presumably one 0x00/0xFF byte per bit of qh - confirm in bytes_from_bits_32
+        const __m256i hmask   = bytes_from_bits_32(xq[i].qh);
+        const __m256i xvals   = _mm256_or_si256(nibbles, _mm256_andnot_si256(hmask, hi_fill));
+
+        const __m256i yvals = _mm256_loadu_si256((const __m256i *)yq[i].qs);
+
+        // vsum += scale * (pairwise int8 dot products as floats)
+        vsum = _mm256_fmadd_ps(scale, mul_sum_i8_pairs_float(xvals, yvals), vsum);
+    }
+
+    *s = hsum_float_8(vsum);
+#elif defined(__AVX__)
+    // OR-ed into every byte whose mask byte from qh is clear (see loop body)
+    const __m128i hi_fill = _mm_set1_epi8((char)0xF0);
+
+    __m256 vsum = _mm256_setzero_ps();
+
+    for (int i = 0; i < nblocks; ++i) {
+        // Product of the two per-block fp16 scales, broadcast to all lanes
+        const __m256 scale = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(xq[i].d) * GGML_CPU_FP16_TO_FP32(yq[i].d));
+
+        const __m256i nibbles = bytes_from_nibbles_32(xq[i].qs);
+        const __m256i hmask   = bytes_from_bits_32(xq[i].qh);
+
+        // The integer logic is done on each 128-bit half separately in this path
+        const __m128i fill_lo = _mm_andnot_si128(_mm256_castsi256_si128(hmask), hi_fill);
+        const __m128i fill_hi = _mm_andnot_si128(_mm256_extractf128_si256(hmask, 1), hi_fill);
+        const __m128i x_lo    = _mm_or_si128(_mm256_castsi256_si128(nibbles), fill_lo);
+        const __m128i x_hi    = _mm_or_si128(_mm256_extractf128_si256(nibbles, 1), fill_hi);
+        const __m256i xvals   = MM256_SET_M128I(x_hi, x_lo);
+
+        const __m256i yvals = _mm256_loadu_si256((const __m256i *)yq[i].qs);
+
+        // vsum += scale * (pairwise int8 dot products as floats)
+        vsum = _mm256_add_ps(_mm256_mul_ps(scale, mul_sum_i8_pairs_float(xvals, yvals)), vsum);
+    }
+
+    *s = hsum_float_8(vsum);
+#else
+    UNUSED(nblocks);
+    UNUSED(xq);
+    UNUSED(yq);
+    ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+
+// Dot product of one Q5_1-quantized row (vx) with one Q8_1-quantized row (vy).
+// n is the element count and must be a multiple of QK8_1; the result goes to *s.
+// Only nrc == 1 is supported. Without AVX/AVX2 the work is forwarded to the
+// generic implementation.
+void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    const int qk = QK8_1;
+
+    assert(n % qk == 0);
+    assert(qk == QK5_1);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const int nblocks = n / qk;
+
+    const block_q5_1 * GGML_RESTRICT xq = vx;
+    const block_q8_1 * GGML_RESTRICT yq = vy;
+
+#if defined(__AVX2__)
+    // Keeps only bit 4 of each mask byte derived from qh
+    const __m256i bit4 = _mm256_set1_epi8(0x10);
+
+    __m256 vsum = _mm256_setzero_ps();
+
+    // Scalar accumulator for the x[i].m * y[i].s terms
+    float offset_sum = 0.0f;
+
+    for (int i = 0; i < nblocks; ++i) {
+        const __m256 dx = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(xq[i].d));
+
+        offset_sum += GGML_CPU_FP16_TO_FP32(xq[i].m) * GGML_CPU_FP16_TO_FP32(yq[i].s);
+
+        const __m256i nibbles = bytes_from_nibbles_32(xq[i].qs);
+        const __m256i hmask   = bytes_from_bits_32(xq[i].qh);
+        // low 4 bits from qs, bit 4 from qh
+        const __m256i xvals   = _mm256_or_si256(nibbles, _mm256_and_si256(hmask, bit4));
+
+        const __m256  dy    = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(yq[i].d));
+        const __m256i yvals = _mm256_loadu_si256((const __m256i *)yq[i].qs);
+
+        const __m256 dots = mul_sum_us8_pairs_float(xvals, yvals);
+
+        vsum = _mm256_fmadd_ps(dots, _mm256_mul_ps(dx, dy), vsum);
+    }
+
+    *s = hsum_float_8(vsum) + offset_sum;
+#elif defined(__AVX__)
+    // Keeps only bit 4 of each mask byte derived from qh
+    const __m128i bit4 = _mm_set1_epi8(0x10);
+
+    __m256 vsum = _mm256_setzero_ps();
+
+    // Scalar accumulator for the x[i].m * y[i].s terms
+    float offset_sum = 0.0f;
+
+    for (int i = 0; i < nblocks; ++i) {
+        const __m256 dx = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(xq[i].d));
+
+        offset_sum += GGML_CPU_FP16_TO_FP32(xq[i].m) * GGML_CPU_FP16_TO_FP32(yq[i].s);
+
+        const __m256i nibbles = bytes_from_nibbles_32(xq[i].qs);
+        const __m256i hmask   = bytes_from_bits_32(xq[i].qh);
+
+        // The integer logic is done on each 128-bit half separately in this path
+        const __m128i h_lo  = _mm_and_si128(_mm256_castsi256_si128(hmask), bit4);
+        const __m128i h_hi  = _mm_and_si128(_mm256_extractf128_si256(hmask, 1), bit4);
+        const __m128i x_lo  = _mm_or_si128(_mm256_castsi256_si128(nibbles), h_lo);
+        const __m128i x_hi  = _mm_or_si128(_mm256_extractf128_si256(nibbles, 1), h_hi);
+        const __m256i xvals = MM256_SET_M128I(x_hi, x_lo);
+
+        const __m256  dy    = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(yq[i].d));
+        const __m256i yvals = _mm256_loadu_si256((const __m256i *)yq[i].qs);
+
+        const __m256 dots = mul_sum_us8_pairs_float(xvals, yvals);
+
+        vsum = _mm256_add_ps(_mm256_mul_ps(dots, _mm256_mul_ps(dx, dy)), vsum);
+    }
+
+    *s = hsum_float_8(vsum) + offset_sum;
+#else
+    UNUSED(nblocks);
+    UNUSED(xq);
+    UNUSED(yq);
+    ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+
+// Dot product of two Q8_0-quantized rows (vx and vy).
+// n is the element count and must be a multiple of QK8_0; the result goes to *s.
+// Only nrc == 1 is supported; bs, bx and by are not used.
+void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    const int qk = QK8_0;
+
+    assert(n % qk == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const int nblocks = n / qk;
+
+    const block_q8_0 * GGML_RESTRICT xq = vx;
+    const block_q8_0 * GGML_RESTRICT yq = vy;
+
+    // blk is shared by the SIMD loop and the scalar loop at the bottom.
+    int   blk   = 0;
+    float total = 0;
+
+#if defined(__AVX2__)
+    __m256 vsum = _mm256_setzero_ps();
+
+    // One block per iteration
+    for (; blk < nblocks; ++blk) {
+        // Product of the two per-block fp16 scales, broadcast to all lanes
+        const __m256 scale = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(xq[blk].d) * GGML_CPU_FP16_TO_FP32(yq[blk].d));
+
+        const __m256i xvals = _mm256_loadu_si256((const __m256i *)xq[blk].qs);
+        const __m256i yvals = _mm256_loadu_si256((const __m256i *)yq[blk].qs);
+
+        // vsum += scale * (pairwise int8 dot products as floats)
+        vsum = _mm256_fmadd_ps( scale, mul_sum_i8_pairs_float(xvals, yvals), vsum );
+    }
+
+    total = hsum_float_8(vsum);
+#elif defined(__AVX__)
+    __m256 vsum = _mm256_setzero_ps();
+
+    // Two blocks per iteration, each loaded as two 128-bit halves
+    for (; blk + 1 < nblocks; blk += 2) {
+        const __m128i xa_lo = _mm_loadu_si128((const __m128i *)xq[blk].qs);
+        const __m128i xa_hi = _mm_loadu_si128((const __m128i *)xq[blk].qs + 1);
+        const __m128i xb_lo = _mm_loadu_si128((const __m128i *)xq[blk + 1].qs);
+        const __m128i xb_hi = _mm_loadu_si128((const __m128i *)xq[blk + 1].qs + 1);
+
+        const __m128i ya_lo = _mm_loadu_si128((const __m128i *)yq[blk].qs);
+        const __m128i ya_hi = _mm_loadu_si128((const __m128i *)yq[blk].qs + 1);
+        const __m128i yb_lo = _mm_loadu_si128((const __m128i *)yq[blk + 1].qs);
+        const __m128i yb_hi = _mm_loadu_si128((const __m128i *)yq[blk + 1].qs + 1);
+
+        const __m256 dots   = mul_sum_i8_quad_float(xa_lo, xa_hi, xb_lo, xb_hi, ya_lo, ya_hi, yb_lo, yb_hi);
+        const __m256 scales = quad_fp16_delta_float(xq[blk].d, yq[blk].d, xq[blk + 1].d, yq[blk + 1].d);
+        vsum = _mm256_add_ps(_mm256_mul_ps(scales, dots), vsum);
+    }
+
+    total = hsum_float_8(vsum);
+#endif
+    // Scalar loop: handles whatever blocks the SIMD loops above left over,
+    // or every block when neither SIMD path is compiled in.
+    for (; blk < nblocks; ++blk) {
+        int isum = 0;
+
+        for (int j = 0; j < qk; j++) {
+            isum += xq[blk].qs[j]*yq[blk].qs[j];
+        }
+
+        total += isum*(GGML_CPU_FP16_TO_FP32(xq[blk].d)*GGML_CPU_FP16_TO_FP32(yq[blk].d));
+    }
+
+    *s = total;
+}
+
+// Dot product of one TQ1_0-quantized row (vx) with one Q8_K-quantized row (vy).
+// n is the element count and must be a multiple of QK_K; the result goes to *s.
+// Only nrc == 1 is supported. Without AVX2 the work is forwarded to the
+// generic implementation.
+void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    // nb below is n / QK_K, so a partial trailing block would be silently ignored;
+    // the sibling kernels in this file assert the same precondition.
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_tq1_0 * GGML_RESTRICT x = vx;
+    const block_q8_K  * GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_K;
+
+#if defined(__AVX2__)
+    __m256 sumf = _mm256_setzero_ps();
+
+    for (int i = 0; i < nb; ++i) {
+        // 16-bit sums
+        __m256i sumi0 = _mm256_setzero_si256();
+        __m256i sumi1 = _mm256_setzero_si256();
+        __m256i sumi2 = _mm256_setzero_si256();
+
+        // first 32 bytes of 5 elements
+        {
+            __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].qs));
+            // 8-bit multiplies with shifts, masks and adds
+            __m256i qx1 = _mm256_add_epi8(qx0, _mm256_add_epi8(qx0, qx0)); // 1 * 3
+            __m256i qx2 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx0, 3), _mm256_set1_epi8(-8)), qx0); // 1 * 9
+            __m256i qx3 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx1, 3), _mm256_set1_epi8(-8)), qx1); // 3 * 9
+            __m256i qx4 = _mm256_add_epi8(_mm256_and_si256(_mm256_slli_epi16(qx2, 3), _mm256_set1_epi8(-8)), qx2); // 9 * 9
+
+            // TODO: can _mm256_mulhi_epu16 be faster even if 16-bits?
+
+            // Cancel the +1 from avg so that it behaves like a halving add
+            qx0 = _mm256_subs_epu8(qx0, _mm256_set1_epi8(1));
+            qx1 = _mm256_subs_epu8(qx1, _mm256_set1_epi8(1));
+            qx2 = _mm256_subs_epu8(qx2, _mm256_set1_epi8(1));
+            qx3 = _mm256_subs_epu8(qx3, _mm256_set1_epi8(1));
+            qx4 = _mm256_subs_epu8(qx4, _mm256_set1_epi8(1));
+            // Multiply by 3 and get the top 2 bits
+            qx0 = _mm256_avg_epu8(qx0, _mm256_avg_epu8(qx0, _mm256_setzero_si256()));
+            qx1 = _mm256_avg_epu8(qx1, _mm256_avg_epu8(qx1, _mm256_setzero_si256()));
+            qx2 = _mm256_avg_epu8(qx2, _mm256_avg_epu8(qx2, _mm256_setzero_si256()));
+            qx3 = _mm256_avg_epu8(qx3, _mm256_avg_epu8(qx3, _mm256_setzero_si256()));
+            qx4 = _mm256_avg_epu8(qx4, _mm256_avg_epu8(qx4, _mm256_setzero_si256()));
+            qx0 = _mm256_and_si256(_mm256_srli_epi16(qx0, 6), _mm256_set1_epi8(3));
+            qx1 = _mm256_and_si256(_mm256_srli_epi16(qx1, 6), _mm256_set1_epi8(3));
+            qx2 = _mm256_and_si256(_mm256_srli_epi16(qx2, 6), _mm256_set1_epi8(3));
+            qx3 = _mm256_and_si256(_mm256_srli_epi16(qx3, 6), _mm256_set1_epi8(3));
+            qx4 = _mm256_and_si256(_mm256_srli_epi16(qx4, 6), _mm256_set1_epi8(3));
+
+            // The five digit planes pair with five consecutive 32-byte chunks of y
+            const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs +   0));
+            const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs +  32));
+            const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs +  64));
+            const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs +  96));
+            const __m256i qy4 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 128));
+
+            // unsigned (x) times signed (y) bytes, adjacent pairs summed to 16 bits
+            qx0 = _mm256_maddubs_epi16(qx0, qy0);
+            qx1 = _mm256_maddubs_epi16(qx1, qy1);
+            qx2 = _mm256_maddubs_epi16(qx2, qy2);
+            qx3 = _mm256_maddubs_epi16(qx3, qy3);
+            qx4 = _mm256_maddubs_epi16(qx4, qy4);
+
+            sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1));
+            sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3));
+            sumi2 = _mm256_add_epi16(sumi2, qx4);
+        }
+
+        // last 16 bytes of 5-element, along with the 4 bytes of 4 elements
+        {
+            __m128i qx0 = _mm_loadu_si128((const __m128i *) (x[i].qs + 32));
+            uint32_t qh;
+            memcpy(&qh, x[i].qh, sizeof(qh)); // potentially unaligned
+            __m256i qx5_l = _mm256_cvtepu8_epi16(_mm_set1_epi32(qh));
+            __m128i qx1 = _mm_add_epi8(qx0, _mm_add_epi8(qx0, qx0)); // 1 * 3
+            __m128i qx2 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx0, 3), _mm_set1_epi8(-8)), qx0); // 1 * 9
+            __m128i qx3 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx1, 3), _mm_set1_epi8(-8)), qx1); // 3 * 9
+            __m128i qx4 = _mm_add_epi8(_mm_and_si128(_mm_slli_epi16(qx2, 3), _mm_set1_epi8(-8)), qx2); // 9 * 9
+            __m256i qx01 = MM256_SET_M128I(qx1, qx0);
+            __m256i qx23 = MM256_SET_M128I(qx3, qx2);
+
+            // avx2 does not have 8-bit multiplies, so 16-bit it is.
+            qx5_l = _mm256_mullo_epi16(qx5_l, _mm256_set_epi16(27, 27, 27, 27, 9, 9, 9, 9, 3, 3, 3, 3, 1, 1, 1, 1));
+            // keep only the low byte of each 16-bit product before packing back to bytes
+            qx5_l = _mm256_and_si256(qx5_l, _mm256_set1_epi16(0xFF));
+            __m128i qx5 = _mm_packus_epi16(_mm256_castsi256_si128(qx5_l), _mm256_extracti128_si256(qx5_l, 1));
+
+            __m256i qx45 = MM256_SET_M128I(qx5, qx4);
+
+            // Cancel the +1 from avg so that it behaves like a halving add
+            qx01 = _mm256_subs_epu8(qx01, _mm256_set1_epi8(1));
+            qx23 = _mm256_subs_epu8(qx23, _mm256_set1_epi8(1));
+            qx45 = _mm256_subs_epu8(qx45, _mm256_set1_epi8(1));
+            // Multiply by 3 and get the top 2 bits
+            qx01 = _mm256_avg_epu8(qx01, _mm256_avg_epu8(qx01, _mm256_setzero_si256()));
+            qx23 = _mm256_avg_epu8(qx23, _mm256_avg_epu8(qx23, _mm256_setzero_si256()));
+            qx45 = _mm256_avg_epu8(qx45, _mm256_avg_epu8(qx45, _mm256_setzero_si256()));
+            qx01 = _mm256_and_si256(_mm256_srli_epi16(qx01, 6), _mm256_set1_epi8(3));
+            qx23 = _mm256_and_si256(_mm256_srli_epi16(qx23, 6), _mm256_set1_epi8(3));
+            qx45 = _mm256_and_si256(_mm256_srli_epi16(qx45, 6), _mm256_set1_epi8(3));
+
+            const __m256i qy01 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 160));
+            const __m256i qy23 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 192));
+            const __m256i qy45 = _mm256_loadu_si256((const __m256i *) (y[i].qs + 224));
+
+            qx01 = _mm256_maddubs_epi16(qx01, qy01);
+            qx23 = _mm256_maddubs_epi16(qx23, qy23);
+            qx45 = _mm256_maddubs_epi16(qx45, qy45);
+
+            sumi0 = _mm256_add_epi16(sumi0, qx01);
+            sumi1 = _mm256_add_epi16(sumi1, qx23);
+            sumi2 = _mm256_add_epi16(sumi2, qx45);
+        }
+
+        const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums);
+        // combined scale of the two blocks
+        const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d));
+
+        // NOTE(review): subtracting the y block sums presumably removes the +1 bias of
+        // the unsigned digits (0..2 standing for -1..1) - confirm against the format.
+        sumi0 = _mm256_sub_epi16(sumi0, ysum);
+        sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(sumi1, sumi2));
+        // widen adjacent 16-bit pairs to 32 bits
+        sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1));
+
+        sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf);
+    }
+
+    *s = hsum_float_8(sumf);
+
+#else
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_tq1_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+
+// Dot product of one TQ2_0-quantized row (vx) with one Q8_K-quantized row (vy).
+// n is the element count and must be a multiple of QK_K; the result goes to *s.
+// Only nrc == 1 is supported. Without AVX2 the work is forwarded to the
+// generic implementation.
+void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    // nb below is n / QK_K, so a partial trailing block would be silently ignored;
+    // the sibling kernels in this file assert the same precondition.
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_tq2_0 * GGML_RESTRICT x = vx;
+    const block_q8_K  * GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_K;
+
+#if defined(__AVX2__)
+    __m256 sumf = _mm256_setzero_ps();
+
+    for (int i = 0; i < nb; ++i) {
+        // 16-bit sums, because 256*127 still fits
+        __m256i sumi0 = _mm256_setzero_si256();
+        __m256i sumi1 = _mm256_setzero_si256();
+
+        // Each 32-byte chunk of x holds four 2-bit values per byte and pairs
+        // with four consecutive 32-byte chunks of y (hence the j*4 offsets).
+        for (size_t j = 0; j < sizeof(x->qs); j += 32) {
+            __m256i qx0 = _mm256_loadu_si256((const __m256i *) (x[i].qs + j));
+            __m256i qx1 = _mm256_srli_epi16(qx0, 2);
+            __m256i qx2 = _mm256_srli_epi16(qx0, 4);
+            __m256i qx3 = _mm256_srli_epi16(qx0, 6);
+
+            // 0, 1, 2 (should not be 3)
+            qx0 = _mm256_and_si256(qx0, _mm256_set1_epi8(3));
+            qx1 = _mm256_and_si256(qx1, _mm256_set1_epi8(3));
+            qx2 = _mm256_and_si256(qx2, _mm256_set1_epi8(3));
+            qx3 = _mm256_and_si256(qx3, _mm256_set1_epi8(3));
+
+            const __m256i qy0 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 +  0));
+            const __m256i qy1 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 32));
+            const __m256i qy2 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 64));
+            const __m256i qy3 = _mm256_loadu_si256((const __m256i *) (y[i].qs + j*4 + 96));
+
+            // unsigned (x) times signed (y) bytes, adjacent pairs summed to 16 bits
+            qx0 = _mm256_maddubs_epi16(qx0, qy0);
+            qx1 = _mm256_maddubs_epi16(qx1, qy1);
+            qx2 = _mm256_maddubs_epi16(qx2, qy2);
+            qx3 = _mm256_maddubs_epi16(qx3, qy3);
+
+            sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(qx0, qx1));
+            sumi1 = _mm256_add_epi16(sumi1, _mm256_add_epi16(qx2, qx3));
+        }
+
+        const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums);
+        // combined scale of the two blocks
+        const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d));
+
+        sumi0 = _mm256_add_epi16(sumi0, sumi1);
+        // NOTE(review): subtracting the y block sums presumably removes the +1 bias of
+        // the unsigned 2-bit values (0..2 standing for -1..1) - confirm against the format.
+        sumi0 = _mm256_sub_epi16(sumi0, ysum);
+        // widen adjacent 16-bit pairs to 32 bits
+        sumi0 = _mm256_madd_epi16(sumi0, _mm256_set1_epi16(1));
+
+        sumf = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(sumi0), d), sumf);
+    }
+
+    *s = hsum_float_8(sumf);
+
+#else
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_tq2_0_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+
+// Dot product of one Q2_K-quantized row (vx) with one Q8_K-quantized row (vy).
+// n is the element count and must be a multiple of QK_K; the result goes to *s.
+// Only nrc == 1 is supported. Without AVX/AVX2 the work is forwarded to the
+// generic implementation.
+void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    // nb below is n / QK_K, so a partial trailing block would be silently ignored;
+    // the sibling kernels in this file assert the same precondition.
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_q2_K * GGML_RESTRICT x = vx;
+    const block_q8_K * GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_K;
+
+#if defined __AVX2__
+
+    const __m256i m3 = _mm256_set1_epi8(3);
+    const __m128i m4 = _mm_set1_epi8(0xF);
+
+    __m256 acc = _mm256_setzero_ps();
+
+    for (int i = 0; i < nb; ++i) {
+
+        // dmin is negated here so that the min term can simply be added to acc
+        const float d    =  y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
+
+        const uint8_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
+
+        // each byte of x[i].scales: low nibble -> scale, high nibble -> min
+        const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
+        const __m128i scales8 = _mm_and_si128(mins_and_scales, m4);
+        const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4);
+        const __m256i mins = _mm256_cvtepi8_epi16(mins8);
+        // mins * y[i].bsums, adjacent pairs summed to 32 bits
+        const __m256i prod = _mm256_madd_epi16(mins, _mm256_loadu_si256((const __m256i*)y[i].bsums));
+
+        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(prod), acc);
+
+        // 16-bit scales; each 128-bit half is duplicated into both lanes so that the
+        // per-lane byte shuffle below can reach it
+        const __m256i all_scales = _mm256_cvtepi8_epi16(scales8);
+        const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
+        const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
+        const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
+
+        __m256i sumi = _mm256_setzero_si256();
+
+        // 128 elements per iteration: 32 bytes of 2-bit quants against 4 x 32 bytes of q8
+        for (int j = 0; j < QK_K/128; ++j) {
+
+            const __m256i q2bits = _mm256_loadu_si256((const __m256i*)q2); q2 += 32;
+
+            const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+            const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+
+            // split every byte into its four 2-bit fields
+            const __m256i q2_0 = _mm256_and_si256(q2bits, m3);
+            const __m256i q2_1 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 2), m3);
+            const __m256i q2_2 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 4), m3);
+            const __m256i q2_3 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 6), m3);
+
+            // unsigned (q2) times signed (q8) bytes, adjacent pairs summed to 16 bits
+            __m256i p0 = _mm256_maddubs_epi16(q2_0, q8_0);
+            __m256i p1 = _mm256_maddubs_epi16(q2_1, q8_1);
+            __m256i p2 = _mm256_maddubs_epi16(q2_2, q8_2);
+            __m256i p3 = _mm256_maddubs_epi16(q2_3, q8_3);
+
+            // multiply by the scales and widen to 32 bits
+            // NOTE(review): get_scale_shuffle_q3k(k) presumably yields the shuffle mask that
+            // broadcasts the scales of sub-block pair k - confirm in its definition.
+            p0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(0)), p0);
+            p1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(1)), p1);
+            p2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(2)), p2);
+            p3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(3)), p3);
+
+            p0 = _mm256_add_epi32(p0, p1);
+            p2 = _mm256_add_epi32(p2, p3);
+
+            sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p0, p2));
+        }
+
+        // acc += d * sumi
+        acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
+
+    }
+
+    *s = hsum_float_8(acc);
+
+#elif defined __AVX__
+
+    const __m128i m3 = _mm_set1_epi8(0x3);
+    const __m128i m4 = _mm_set1_epi8(0xF);
+    const __m128i m2 = _mm_set1_epi8(0x2);
+
+    __m256 acc = _mm256_setzero_ps();
+
+    for (int i = 0; i < nb; ++i) {
+
+        // dmin is negated here so that the min term can simply be added to acc
+        const float dall =  y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+        const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
+
+        const uint8_t * GGML_RESTRICT q2 = x[i].qs;
+        const int8_t  * GGML_RESTRICT q8 = y[i].qs;
+
+        // load mins and scales from block_q2_K.scales[QK_K/16]
+        // (low nibble -> scale, high nibble -> min)
+        const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales);
+        const __m128i scales16 = _mm_and_si128(mins_and_scales, m4);
+        const __m128i mins16 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4);
+        const __m128i mins_0 = _mm_cvtepi8_epi16(mins16);
+        const __m128i mins_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(mins16, mins16));
+
+        // summs = y[i].bsums * (x[i].scales >> 4) in 16bits*8*2 to 32bits*4*2
+        const __m128i summs_0 = _mm_madd_epi16(mins_0, _mm_loadu_si128((const __m128i*)&y[i].bsums[0]));
+        const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8]));
+
+        // acc += dmin * summs in 32bits*8 (dmin already carries the minus sign)
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc);
+
+        const __m128i scales_0 = _mm_cvtepi8_epi16(scales16);
+        const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16));
+        const __m128i scales[2] = { scales_0, scales_1 };
+
+        __m128i sumi_0 = _mm_setzero_si128();
+        __m128i sumi_1 = _mm_setzero_si128();
+
+        // 128 elements per iteration
+        for (int j = 0; j < QK_K/128; ++j) {
+
+            // load Q8 quants int8*16*8 from block_q8_K.qs[QK_K]
+            const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+
+            // load 2bits*16*8 from block_q2_K.qs[QK_K/4]
+            // (first 16 bytes feed the even-numbered vectors, next 16 the odd-numbered ones)
+            __m128i q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16;
+            const __m128i q2_0 = _mm_and_si128(q2bits, m3);
+            const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3);
+            const __m128i q2_4 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3);
+            const __m128i q2_6 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3);
+            q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16;
+            const __m128i q2_1 = _mm_and_si128(q2bits, m3);
+            const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3);
+            const __m128i q2_5 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3);
+            const __m128i q2_7 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3);
+
+            // isuml = q8[l] * ((q2[l] >> shift) & 3) in 8bits*16*8 to 16bits*8*8
+            __m128i p0 = _mm_maddubs_epi16(q2_0, q8_0);
+            __m128i p1 = _mm_maddubs_epi16(q2_1, q8_1);
+            __m128i p2 = _mm_maddubs_epi16(q2_2, q8_2);
+            __m128i p3 = _mm_maddubs_epi16(q2_3, q8_3);
+            __m128i p4 = _mm_maddubs_epi16(q2_4, q8_4);
+            __m128i p5 = _mm_maddubs_epi16(q2_5, q8_5);
+            __m128i p6 = _mm_maddubs_epi16(q2_6, q8_6);
+            __m128i p7 = _mm_maddubs_epi16(q2_7, q8_7);
+
+            // isum += (x[i].scales[is++] & 0xF) * isuml in 16bits*8*8 to 32bits*4*8
+            // The shuffle mask starts at bytes {0,1} (the first 16-bit scale) and is
+            // advanced by 2 bytes before each following product.
+            __m128i shuffle = _mm_set1_epi16(0x0100);
+            p0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p0);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p1);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p2);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p3);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p4);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p5);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p6);
+            shuffle = _mm_add_epi16(shuffle, m2);
+            p7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p7);
+
+            p0 = _mm_add_epi32(p0, p1);
+            p2 = _mm_add_epi32(p2, p3);
+            p4 = _mm_add_epi32(p4, p5);
+            p6 = _mm_add_epi32(p6, p7);
+
+            // isum in 32bits*4*2
+            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p0, p2));
+            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p4, p6));
+        }
+
+        // acc += dall * isum in 32bits*8 (the dmin * summs term was added above)
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc);
+    }
+
+    *s = hsum_float_8(acc);
+
+#else
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_q2_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+
+void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+ assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);
+
+ const uint32_t kmask1 = 0x03030303;
+ const uint32_t kmask2 = 0x0f0f0f0f;
+
+ const block_q3_K * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
+
+ const int nb = n / QK_K;
+
+#if defined __AVX2__
+
+ const __m256i m3 = _mm256_set1_epi8(3);
+ const __m256i mone = _mm256_set1_epi8(1);
+ const __m128i m32 = _mm_set1_epi8(32);
+
+ __m256 acc = _mm256_setzero_ps();
+
+ uint32_t aux[3];
+
+ for (int i = 0; i < nb; ++i) {
+
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
+
+ // Set up scales
+ memcpy(aux, x[i].scales, 12);
+ __m128i scales128 = _mm_set_epi32(
+ ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
+ ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
+ (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4),
+ (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4));
+ scales128 = _mm_sub_epi8(scales128, m32);
+ const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
+ const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
+ const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
+ const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
+
+ // high bit
+ const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask);
+
+ // integer accumulator
+ __m256i sumi = _mm256_setzero_si256();
+
+ int bit = 0;
+ int is = 0;
+
+ for (int j = 0; j < QK_K/128; ++j) {
+ // load low 2 bits
+ const __m256i q3bits = _mm256_loadu_si256((const __m256i*)q3); q3 += 32;
+
+ // prepare low and high bits
+ const __m256i q3l_0 = _mm256_and_si256(q3bits, m3);
+ const __m256i q3h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
+ ++bit;
+
+ const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 2), m3);
+ const __m256i q3h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
+ ++bit;
+
+ const __m256i q3l_2 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 4), m3);
+ const __m256i q3h_2 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
+ ++bit;
+
+ const __m256i q3l_3 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 6), m3);
+ const __m256i q3h_3 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2);
+ ++bit;
+
+ // load Q8 quants
+ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+ const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+ const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+
+ // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16,
+ // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
+ // and 2 if the high bit was set)
+ __m256i q8s_0 = _mm256_maddubs_epi16(q3h_0, q8_0);
+ __m256i q8s_1 = _mm256_maddubs_epi16(q3h_1, q8_1);
+ __m256i q8s_2 = _mm256_maddubs_epi16(q3h_2, q8_2);
+ __m256i q8s_3 = _mm256_maddubs_epi16(q3h_3, q8_3);
+
+ __m256i p16_0 = _mm256_maddubs_epi16(q3l_0, q8_0);
+ __m256i p16_1 = _mm256_maddubs_epi16(q3l_1, q8_1);
+ __m256i p16_2 = _mm256_maddubs_epi16(q3l_2, q8_2);
+ __m256i p16_3 = _mm256_maddubs_epi16(q3l_3, q8_3);
+
+ p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
+ p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
+ p16_2 = _mm256_sub_epi16(p16_2, q8s_2);
+ p16_3 = _mm256_sub_epi16(p16_3, q8s_3);
+
+ // multiply with scales
+ p16_0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 0)), p16_0);
+ p16_1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 1)), p16_1);
+ p16_2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 2)), p16_2);
+ p16_3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 3)), p16_3);
+
+ // accumulate
+ p16_0 = _mm256_add_epi32(p16_0, p16_1);
+ p16_2 = _mm256_add_epi32(p16_2, p16_3);
+ sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_2));
+
+ }
+
+ // multiply with block scale and accumulate
+ acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
+
+ }
+
+ *s = hsum_float_8(acc);
+
+#elif defined __AVX__
+
+ const __m128i m3 = _mm_set1_epi8(3);
+ const __m128i mone = _mm_set1_epi8(1);
+ const __m128i m32 = _mm_set1_epi8(32);
+ const __m128i m2 = _mm_set1_epi8(2);
+
+ __m256 acc = _mm256_setzero_ps();
+
+ const uint32_t *aux;
+
+ for (int i = 0; i < nb; ++i) {
+
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
+
+ // Set up scales
+ aux = (const uint32_t *)x[i].scales;
+ __m128i scales128 = _mm_set_epi32(
+ ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
+ ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
+ (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4),
+ (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4));
+ scales128 = _mm_sub_epi8(scales128, m32);
+ const __m128i scales_0 = _mm_cvtepi8_epi16(scales128);
+ const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales128, scales128));
+ const __m128i scales[2] = { scales_0, scales_1 };
+
+ // high bit *128*2 from block_q3_K.hmask[QK_K/8]
+ const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].hmask[0]);
+ const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].hmask[16]);
+
+ // integer accumulator
+ __m128i sumi_0 = _mm_setzero_si128();
+ __m128i sumi_1 = _mm_setzero_si128();
+
+ for (int j = 0; j < QK_K/128; ++j) {
+ // load low 2 bits *64*2 from block_q3_K.qs[QK_K/4]
+ const __m128i q3bits_0 = _mm_loadu_si128((const __m128i*)q3); q3 += 16;
+ const __m128i q3bits_1 = _mm_loadu_si128((const __m128i*)q3); q3 += 16;
+
+ // prepare low and high bits
+ const int bit = j << 2;
+
+ const __m128i q3l_0 = _mm_and_si128(q3bits_0, m3);
+ const __m128i q3l_1 = _mm_and_si128(q3bits_1, m3);
+ const __m128i q3h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit)), bit), 2);
+ const __m128i q3h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit)), bit), 2);
+
+ const __m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 2), m3);
+ const __m128i q3l_3 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 2), m3);
+ const __m128i q3h_2 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+1)), bit+1), 2);
+ const __m128i q3h_3 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+1)), bit+1), 2);
+
+ const __m128i q3l_4 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 4), m3);
+ const __m128i q3l_5 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 4), m3);
+ const __m128i q3h_4 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+2)), bit+2), 2);
+ const __m128i q3h_5 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+2)), bit+2), 2);
+
+ const __m128i q3l_6 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 6), m3);
+ const __m128i q3l_7 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 6), m3);
+ const __m128i q3h_6 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+3)), bit+3), 2);
+ const __m128i q3h_7 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+3)), bit+3), 2);
+
+ // load Q8 quants from block_q8_K.qs[QK_K]
+ const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+ const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+ const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+ const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+ const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+ const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+ const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+ const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+
+ // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16,
+ // and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
+ // and 2 if the high bit was set)
+ __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, q8_0);
+ __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, q8_1);
+ __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, q8_2);
+ __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, q8_3);
+ __m128i q8s_4 = _mm_maddubs_epi16(q3h_4, q8_4);
+ __m128i q8s_5 = _mm_maddubs_epi16(q3h_5, q8_5);
+ __m128i q8s_6 = _mm_maddubs_epi16(q3h_6, q8_6);
+ __m128i q8s_7 = _mm_maddubs_epi16(q3h_7, q8_7);
+
+ __m128i p16_0 = _mm_maddubs_epi16(q3l_0, q8_0);
+ __m128i p16_1 = _mm_maddubs_epi16(q3l_1, q8_1);
+ __m128i p16_2 = _mm_maddubs_epi16(q3l_2, q8_2);
+ __m128i p16_3 = _mm_maddubs_epi16(q3l_3, q8_3);
+ __m128i p16_4 = _mm_maddubs_epi16(q3l_4, q8_4);
+ __m128i p16_5 = _mm_maddubs_epi16(q3l_5, q8_5);
+ __m128i p16_6 = _mm_maddubs_epi16(q3l_6, q8_6);
+ __m128i p16_7 = _mm_maddubs_epi16(q3l_7, q8_7);
+
+ p16_0 = _mm_sub_epi16(p16_0, q8s_0);
+ p16_1 = _mm_sub_epi16(p16_1, q8s_1);
+ p16_2 = _mm_sub_epi16(p16_2, q8s_2);
+ p16_3 = _mm_sub_epi16(p16_3, q8s_3);
+ p16_4 = _mm_sub_epi16(p16_4, q8s_4);
+ p16_5 = _mm_sub_epi16(p16_5, q8s_5);
+ p16_6 = _mm_sub_epi16(p16_6, q8s_6);
+ p16_7 = _mm_sub_epi16(p16_7, q8s_7);
+
+ // multiply with scales
+ __m128i shuffle = _mm_set1_epi16(0x0100);
+ p16_0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_0);
+ shuffle = _mm_add_epi16(shuffle, m2);
+ p16_1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_1);
+ shuffle = _mm_add_epi16(shuffle, m2);
+ p16_2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_2);
+ shuffle = _mm_add_epi16(shuffle, m2);
+ p16_3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_3);
+ shuffle = _mm_add_epi16(shuffle, m2);
+ p16_4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_4);
+ shuffle = _mm_add_epi16(shuffle, m2);
+ p16_5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_5);
+ shuffle = _mm_add_epi16(shuffle, m2);
+ p16_6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_6);
+ shuffle = _mm_add_epi16(shuffle, m2);
+ p16_7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_7);
+
+ // accumulate
+ p16_0 = _mm_add_epi32(p16_0, p16_1);
+ p16_2 = _mm_add_epi32(p16_2, p16_3);
+ p16_4 = _mm_add_epi32(p16_4, p16_5);
+ p16_6 = _mm_add_epi32(p16_6, p16_7);
+ sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
+ sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_4, p16_6));
+
+ }
+
+ // multiply with block scale and accumulate
+ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
+
+ }
+
+ *s = hsum_float_8(acc);
+
+#else
+ UNUSED(kmask1);
+ UNUSED(kmask2);
+ UNUSED(x);
+ UNUSED(y);
+ UNUSED(nb);
+ ggml_vec_dot_q3_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+// Dot product of a Q4_K row (vx) with a Q8_K row (vy) over n values; the float result is stored in *s. n must be a multiple of QK_K and nrc must be 1 (both asserted); bs, bx and by are not used. Without AVX/AVX2 the call is forwarded to ggml_vec_dot_q4_K_q8_K_generic.
+void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+ assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);
+ // vx and vy are reinterpreted as block arrays; nb = n / QK_K blocks are processed from each.
+ const block_q4_K * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
+
+ const int nb = n / QK_K;
+ // Byte-wise masks used to unpack the packed scales: low 6 bits, low 4 bits and low 2 bits of every byte.
+ static const uint32_t kmask1 = 0x3f3f3f3f;
+ static const uint32_t kmask2 = 0x0f0f0f0f;
+ static const uint32_t kmask3 = 0x03030303;
+ // Scratch for the unpacked bytes: after unpacking, utmp[0..1] are consumed as scales and utmp[2..3] as mins.
+ uint32_t utmp[4];
+
+#if defined __AVX2__
+ // AVX2 path: 256-bit integer vectors, 64 quants per inner-loop iteration.
+ const __m256i m4 = _mm256_set1_epi8(0xF); // selects the low nibble of every byte
+
+ __m256 acc = _mm256_setzero_ps(); // running sum of d * (scale-weighted integer dot products)
+ __m128 acc_m = _mm_setzero_ps(); // running sum of the min correction, kept as 4 partial lanes
+
+ for (int i = 0; i < nb; ++i) {
+ // Combined block scales; dmin is negated so that accumulating it subtracts the min term.
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+ const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
+ // Expand the 12 packed bytes of x[i].scales into 16 bytes, each holding a value of at most 6 bits.
+ memcpy(utmp, x[i].scales, 12);
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+ const uint32_t uaux = utmp[1] & kmask1;
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+ utmp[2] = uaux;
+ utmp[0] &= kmask1;
+
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
+ // Zero-extend the 16 bytes to 16-bit lanes: low 128 bits = utmp[0..1] (scales), high 128 bits = utmp[2..3] (mins).
+ const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
+ // Min term: add adjacent pairs of the 16 bsums (hadd), multiply by the 8 mins from the high half, scale by dmin.
+ const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums);
+ const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
+ const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s);
+ acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m);
+ // Duplicate the low half (the 8 scales) into both 128-bit lanes for the per-sub-block shuffles below.
+ const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
+ const __m256i scales = MM256_SET_M128I(sc128, sc128);
+
+ __m256i sumi = _mm256_setzero_si256();
+
+ for (int j = 0; j < QK_K/64; ++j) {
+ // get_scale_shuffle_k4 is defined elsewhere; presumably it returns the shuffle mask that broadcasts the requested scale - confirm.
+ const __m256i scale_l = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0));
+ const __m256i scale_h = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1));
+ // One 32-byte load holds 64 quants: low nibbles pair with the first 32 q8 bytes, high nibbles with the next 32.
+ const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
+ const __m256i q4l = _mm256_and_si256(q4bits, m4);
+ const __m256i q4h = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), m4);
+ // maddubs: unsigned q4 * signed q8, adjacent pairs summed to 16-bit; madd: times scale, adjacent pairs summed to 32-bit.
+ const __m256i q8l = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+ __m256i p16l = _mm256_maddubs_epi16(q4l, q8l);
+ p16l = _mm256_madd_epi16(scale_l, p16l);
+
+ const __m256i q8h = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+ __m256i p16h = _mm256_maddubs_epi16(q4h, q8h);
+ p16h = _mm256_madd_epi16(scale_h, p16h);
+ const __m256i sumj = _mm256_add_epi32(p16l, p16h);
+
+ sumi = _mm256_add_epi32(sumi, sumj);
+ }
+ // Convert the integer sums of this block to float and accumulate them scaled by d.
+ __m256 vd = _mm256_set1_ps(d);
+ acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc);
+
+ }
+ // Fold the 4 lanes of acc_m into lane 0.
+ acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m));
+ acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m));
+
+ *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m); // hsum_float_8 is defined elsewhere (presumably the horizontal sum of the 8 lanes)
+
+#elif defined __AVX__
+ // AVX path: same computation with 128-bit integer vectors; two integer accumulators are joined into one 256-bit value per block.
+ const __m128i m4 = _mm_set1_epi8(0xF); // selects the low nibble of every byte
+ const __m128i m2 = _mm_set1_epi8(0x2); // added to every byte of the shuffle mask to step to the next 16-bit scale
+
+ __m256 acc = _mm256_setzero_ps();
+ __m128 acc_m = _mm_setzero_ps();
+
+ for (int i = 0; i < nb; ++i) {
+ // Combined block scales; dmin is negated so that accumulating it subtracts the min term.
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+ const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
+
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
+ // Expand the 12 packed bytes of x[i].scales into 16 bytes, each holding a value of at most 6 bits.
+ memcpy(utmp, x[i].scales, 12);
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+ const uint32_t uaux = utmp[1] & kmask1;
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+ utmp[2] = uaux;
+ utmp[0] &= kmask1;
+ // Low 8 bytes (utmp[0..1]) widen to the 8 scales, high 8 bytes (utmp[2..3]) widen to the 8 mins.
+ const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]);
+ const __m128i scales = _mm_cvtepu8_epi16(utmps);
+ const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps));
+ // Min term: add adjacent pairs of the 16 bsums (hadd), multiply by the 8 mins, scale by dmin.
+ const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]);
+ const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]);
+ const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1);
+ const __m128i prod = _mm_madd_epi16(mins, q8s);
+ acc_m = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod)), acc_m);
+
+ __m128i sumi_0 = _mm_setzero_si128();
+ __m128i sumi_1 = _mm_setzero_si128();
+ // Shuffle mask with byte indices (0,1) repeated: broadcasts the first 16-bit scale; advanced by m2 after each use.
+ __m128i shuffle = _mm_set1_epi16(0x0100);
+ for (int j = 0; j < QK_K/64; ++j) {
+
+ const __m128i scale_l = _mm_shuffle_epi8(scales, shuffle);
+ shuffle = _mm_add_epi16(shuffle, m2);
+ const __m128i scale_h = _mm_shuffle_epi8(scales, shuffle);
+ shuffle = _mm_add_epi16(shuffle, m2);
+ // Two 16-byte loads hold 64 quants; each is split into its low and high nibbles.
+ __m128i q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
+ const __m128i q4l_0 = _mm_and_si128(q4bits, m4);
+ const __m128i q4h_0 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4);
+ q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
+ const __m128i q4l_1 = _mm_and_si128(q4bits, m4);
+ const __m128i q4h_1 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4);
+ // Low nibbles pair with the first 32 q8 bytes and use scale_l.
+ const __m128i q8l_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+ __m128i p16l = _mm_maddubs_epi16(q4l_0, q8l_0);
+ p16l = _mm_madd_epi16(scale_l, p16l);
+ sumi_0 = _mm_add_epi32(sumi_0, p16l);
+ const __m128i q8l_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+ p16l = _mm_maddubs_epi16(q4l_1, q8l_1);
+ p16l = _mm_madd_epi16(scale_l, p16l);
+ sumi_1 = _mm_add_epi32(sumi_1, p16l);
+ // High nibbles pair with the next 32 q8 bytes and use scale_h.
+ const __m128i q8h_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+ __m128i p16h = _mm_maddubs_epi16(q4h_0, q8h_0);
+ p16h = _mm_madd_epi16(scale_h, p16h);
+ sumi_0 = _mm_add_epi32(sumi_0, p16h);
+ const __m128i q8h_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+ p16h = _mm_maddubs_epi16(q4h_1, q8h_1);
+ p16h = _mm_madd_epi16(scale_h, p16h);
+ sumi_1 = _mm_add_epi32(sumi_1, p16h);
+
+ }
+ // Join the two 128-bit integer sums, convert to float and accumulate scaled by d.
+ __m256 vd = _mm256_set1_ps(d);
+ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
+ acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
+
+ }
+ // Fold the 4 lanes of acc_m into lane 0.
+ acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m));
+ acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m));
+
+ *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m); // hsum_float_8 is defined elsewhere (presumably the horizontal sum of the 8 lanes)
+
+#else
+ UNUSED(x);
+ UNUSED(y);
+ UNUSED(nb);
+ UNUSED(kmask1);
+ UNUSED(kmask2);
+ UNUSED(kmask3);
+ UNUSED(utmp);
+ ggml_vec_dot_q4_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+// Dot product of a Q5_K row (vx) with a Q8_K row (vy) over n values; the float result is stored in *s. n must be a multiple of QK_K and nrc must be 1 (both asserted); bs, bx and by are not used. Without AVX/AVX2 the call is forwarded to ggml_vec_dot_q5_K_q8_K_generic.
+void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+ assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);
+ // vx and vy are reinterpreted as block arrays; nb = n / QK_K blocks are processed from each.
+ const block_q5_K * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
+
+ const int nb = n / QK_K;
+ // Byte-wise masks used to unpack the packed scales: low 6 bits, low 4 bits and low 2 bits of every byte.
+ static const uint32_t kmask1 = 0x3f3f3f3f;
+ static const uint32_t kmask2 = 0x0f0f0f0f;
+ static const uint32_t kmask3 = 0x03030303;
+ // Scratch for the unpacked bytes: after unpacking, utmp[0..1] are consumed as scales and utmp[2..3] as mins.
+ uint32_t utmp[4];
+
+#if defined __AVX2__
+ // AVX2 path: 256-bit integer vectors, 64 quants per inner-loop iteration.
+ const __m256i m4 = _mm256_set1_epi8(0xF); // selects the low nibble of every byte
+ const __m128i mzero = _mm_setzero_si128();
+ const __m256i mone = _mm256_set1_epi8(1); // initial high-bit selector: bit 0 of every byte
+
+ __m256 acc = _mm256_setzero_ps();
+ // Scalar accumulator for the min correction (already multiplied by the negated dmin).
+ float summs = 0.f;
+
+ for (int i = 0; i < nb; ++i) {
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
+ // Combined block scales; dmin is negated so that accumulating it subtracts the min term.
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+ const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
+ // Expand the 12 packed bytes of x[i].scales into 16 bytes, each holding a value of at most 6 bits.
+ memcpy(utmp, x[i].scales, 12);
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+ const uint32_t uaux = utmp[1] & kmask1;
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+ utmp[2] = uaux;
+ utmp[0] &= kmask1;
+ // Zero-extend the 16 bytes to 16-bit lanes: low 128 bits = utmp[0..1] (scales), high 128 bits = utmp[2..3] (mins).
+ const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]));
+ // Min term: add adjacent pairs of the 16 bsums, multiply by the 8 mins, reduce to one int32 and scale by dmin.
+ const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums);
+ const __m128i q8s = _mm_hadd_epi16(_mm256_extracti128_si256(q8sums, 0), _mm256_extracti128_si256(q8sums, 1));
+ const __m128i prod = _mm_madd_epi16(_mm256_extracti128_si256(mins_and_scales, 1), q8s);
+ const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero);
+ summs += dmin * _mm_extract_epi32(hsum, 0);
+ // Duplicate the low half (the 8 scales) into both 128-bit lanes for the per-sub-block shuffles below.
+ const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
+ const __m256i scales = MM256_SET_M128I(sc128, sc128);
+ // 32 bytes of high bits; hmask selects one bit position per byte and is shifted left after every use.
+ const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh);
+ __m256i hmask = mone;
+
+ __m256i sumi = _mm256_setzero_si256();
+ // bit is the position currently selected by hmask; it is the right-shift needed to bring that bit down to bit 0.
+ int bit = 0;
+
+ for (int j = 0; j < QK_K/64; ++j) {
+ // get_scale_shuffle_k4 is defined elsewhere; presumably it returns the shuffle mask that broadcasts the requested scale - confirm.
+ const __m256i scale_0 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+0));
+ const __m256i scale_1 = _mm256_shuffle_epi8(scales, get_scale_shuffle_k4(2*j+1));
+
+ const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5); q5 += 32;
+ // First 32 quants: low nibble plus the selected high bit moved to bit 4 (adds 16 when set).
+ const __m256i q5l_0 = _mm256_and_si256(q5bits, m4);
+ const __m256i q5h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4);
+ const __m256i q5_0 = _mm256_add_epi8(q5l_0, q5h_0);
+ hmask = _mm256_slli_epi16(hmask, 1);
+ // Next 32 quants: high nibble plus the next high bit.
+ const __m256i q5l_1 = _mm256_and_si256(_mm256_srli_epi16(q5bits, 4), m4);
+ const __m256i q5h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_and_si256(hbits, hmask), bit++), 4);
+ const __m256i q5_1 = _mm256_add_epi8(q5l_1, q5h_1);
+ hmask = _mm256_slli_epi16(hmask, 1);
+
+ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+ // maddubs: unsigned q5 * signed q8, adjacent pairs summed to 16-bit.
+ __m256i p16_0 = _mm256_maddubs_epi16(q5_0, q8_0);
+ __m256i p16_1 = _mm256_maddubs_epi16(q5_1, q8_1);
+ // madd: multiply by the sub-block scale, adjacent pairs summed to 32-bit.
+ p16_0 = _mm256_madd_epi16(scale_0, p16_0);
+ p16_1 = _mm256_madd_epi16(scale_1, p16_1);
+
+ sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1));
+
+ }
+ // Convert the integer sums of this block to float and accumulate them scaled by d.
+ __m256 vd = _mm256_set1_ps(d);
+ acc = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi), acc);
+
+ }
+
+ *s = hsum_float_8(acc) + summs; // hsum_float_8 is defined elsewhere (presumably the horizontal sum of the 8 lanes)
+
+#elif defined __AVX__
+ // AVX path: same computation with 128-bit integer vectors; two integer accumulators are joined into one 256-bit value per block.
+ const __m128i m4 = _mm_set1_epi8(0xF); // selects the low nibble of every byte
+ const __m128i mzero = _mm_setzero_si128();
+ const __m128i mone = _mm_set1_epi8(1); // initial high-bit selector: bit 0 of every byte
+ const __m128i m2 = _mm_set1_epi8(2); // added to every byte of the shuffle mask to step to the next 16-bit scale
+
+ __m256 acc = _mm256_setzero_ps();
+ // Scalar accumulator for the min correction (already multiplied by the negated dmin).
+ float summs = 0.f;
+
+ for (int i = 0; i < nb; ++i) {
+ // Combined block scales; dmin is negated so that accumulating it subtracts the min term.
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+ const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
+
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
+ // Expand the 12 packed bytes of x[i].scales into 16 bytes, each holding a value of at most 6 bits.
+ memcpy(utmp, x[i].scales, 12);
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+ const uint32_t uaux = utmp[1] & kmask1;
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+ utmp[2] = uaux;
+ utmp[0] &= kmask1;
+ // Low 8 bytes (utmp[0..1]) widen to the 8 scales, high 8 bytes (utmp[2..3]) widen to the 8 mins.
+ const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]);
+ const __m128i scales = _mm_cvtepu8_epi16(utmps);
+ const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps));
+ // Min term: add adjacent pairs of the 16 bsums, multiply by the 8 mins, reduce to one int32 and scale by dmin.
+ const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]);
+ const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]);
+ const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1);
+ const __m128i prod = _mm_madd_epi16(mins, q8s);
+ const __m128i hsum = _mm_hadd_epi32(_mm_hadd_epi32(prod, mzero), mzero);
+ summs += dmin * _mm_extract_epi32(hsum, 0);
+ // 32 bytes of high bits in two halves; hmask selects one bit position per byte and is shifted left after every use.
+ const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].qh[0]);
+ const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].qh[16]);
+ __m128i hmask = mone;
+
+ __m128i sumi_0 = _mm_setzero_si128();
+ __m128i sumi_1 = _mm_setzero_si128();
+ // bit is the position currently selected by hmask; it is the right-shift needed to bring that bit down to bit 0.
+ int bit = 0;
+ // Shuffle mask with byte indices (0,1) repeated: broadcasts the first 16-bit scale; advanced by m2 after each use.
+ __m128i shuffle = _mm_set1_epi16(0x0100);
+ for (int j = 0; j < QK_K/64; ++j) {
+
+ const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle);
+ shuffle = _mm_add_epi16(shuffle, m2);
+ const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle);
+ shuffle = _mm_add_epi16(shuffle, m2);
+
+ const __m128i q5bits_0 = _mm_loadu_si128((const __m128i*)q5); q5 += 16;
+ const __m128i q5bits_1 = _mm_loadu_si128((const __m128i*)q5); q5 += 16;
+ // First 32 quants: low nibbles plus the selected high bit moved to bit 4; bit is incremented only after both halves used it.
+ __m128i q5l_0 = _mm_and_si128(q5bits_0, m4);
+ __m128i q5l_1 = _mm_and_si128(q5bits_1, m4);
+ __m128i q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4);
+ __m128i q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4);
+ __m128i q5_0 = _mm_add_epi8(q5l_0, q5h_0);
+ __m128i q5_1 = _mm_add_epi8(q5l_1, q5h_1);
+ hmask = _mm_slli_epi16(hmask, 1);
+ // Both 16-byte halves of these 32 quants are weighted by the same scale_0.
+ __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+ __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+ __m128i p16_0 = _mm_maddubs_epi16(q5_0, q8_0);
+ __m128i p16_1 = _mm_maddubs_epi16(q5_1, q8_1);
+ p16_0 = _mm_madd_epi16(scale_0, p16_0);
+ p16_1 = _mm_madd_epi16(scale_0, p16_1);
+ // Next 32 quants: high nibbles plus the next high bit, weighted by scale_1.
+ q5l_0 = _mm_and_si128(_mm_srli_epi16(q5bits_0, 4), m4);
+ q5l_1 = _mm_and_si128(_mm_srli_epi16(q5bits_1, 4), m4);
+ q5h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_0, hmask), bit), 4);
+ q5h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_and_si128(hbits_1, hmask), bit++), 4);
+ q5_0 = _mm_add_epi8(q5l_0, q5h_0);
+ q5_1 = _mm_add_epi8(q5l_1, q5h_1);
+ hmask = _mm_slli_epi16(hmask, 1);
+
+ q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+ q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+ __m128i p16_2 = _mm_maddubs_epi16(q5_0, q8_0);
+ __m128i p16_3 = _mm_maddubs_epi16(q5_1, q8_1);
+ p16_2 = _mm_madd_epi16(scale_1, p16_2);
+ p16_3 = _mm_madd_epi16(scale_1, p16_3);
+
+ sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
+ sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
+
+ }
+ // Join the two 128-bit integer sums, convert to float and accumulate scaled by d.
+ __m256 vd = _mm256_set1_ps(d);
+ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
+ acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
+
+ }
+
+ *s = hsum_float_8(acc) + summs; // hsum_float_8 is defined elsewhere (presumably the horizontal sum of the 8 lanes)
+
+#else
+ UNUSED(x);
+ UNUSED(y);
+ UNUSED(nb);
+ UNUSED(kmask1);
+ UNUSED(kmask2);
+ UNUSED(kmask3);
+ UNUSED(utmp);
+ ggml_vec_dot_q5_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+// Dot product of a Q6_K row (vx) with a Q8_K row (vy) over n values; the float result is stored in *s. n must be a multiple of QK_K and nrc must be 1 (both asserted); bs, bx and by are not used. Without AVX/AVX2 the call is forwarded to ggml_vec_dot_q6_K_q8_K_generic.
+void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+ assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);
+ // vx and vy are reinterpreted as block arrays; nb = n / QK_K blocks are processed from each.
+ const block_q6_K * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
+
+ const int nb = n / QK_K;
+
+#if defined __AVX2__
+ // AVX2 path: 256-bit integer vectors, 128 quants per inner-loop iteration.
+ const __m256i m4 = _mm256_set1_epi8(0xF); // selects the low nibble of every byte
+ const __m256i m2 = _mm256_set1_epi8(3); // 2-bit mask (value 3) for the high-bit pairs
+ const __m256i m32s = _mm256_set1_epi8(32); // the offset that is subtracted from every 6-bit quant
+
+ __m256 acc = _mm256_setzero_ps();
+
+ for (int i = 0; i < nb; ++i) {
+ // Combined scale of the two blocks.
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
+ // 16 bytes of scales; they are sign-extended (cvtepi8) before use below.
+ const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
+
+ __m256i sumi = _mm256_setzero_si256();
+ // Index of the next scale shuffle; advanced by 4 per inner iteration.
+ int is = 0;
+
+ for (int j = 0; j < QK_K/128; ++j) {
+ // get_scale_shuffle is defined elsewhere; presumably it returns the shuffle mask that spreads the scales for group is+k - confirm.
+ const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
+ const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
+ const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
+ const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
+ is += 4;
+ // 64 bytes of low nibbles (two loads) and 32 bytes of high bits cover 128 quants.
+ const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
+ const __m256i q4bits2 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
+ const __m256i q4bitsH = _mm256_loadu_si256((const __m256i*)qh); qh += 32;
+ // Each qh byte carries four 2-bit fields; field k is moved to bits 4..5 for quant group k.
+ const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, m2), 4);
+ const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 2), m2), 4);
+ const __m256i q4h_2 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 4), m2), 4);
+ const __m256i q4h_3 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 6), m2), 4);
+ // Combine 4 low bits with 2 high bits into unsigned 6-bit values (offset not yet removed).
+ const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
+ const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(q4bits2, m4), q4h_1);
+ const __m256i q4_2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_2);
+ const __m256i q4_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits2, 4), m4), q4h_3);
+
+ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+ const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+ const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+ // 32 * (sum of each adjacent q8 pair): the contribution of the -32 offset, subtracted from the products below.
+ __m256i q8s_0 = _mm256_maddubs_epi16(m32s, q8_0);
+ __m256i q8s_1 = _mm256_maddubs_epi16(m32s, q8_1);
+ __m256i q8s_2 = _mm256_maddubs_epi16(m32s, q8_2);
+ __m256i q8s_3 = _mm256_maddubs_epi16(m32s, q8_3);
+ // Unsigned 6-bit quant * signed q8, adjacent pairs summed to 16-bit.
+ __m256i p16_0 = _mm256_maddubs_epi16(q4_0, q8_0);
+ __m256i p16_1 = _mm256_maddubs_epi16(q4_1, q8_1);
+ __m256i p16_2 = _mm256_maddubs_epi16(q4_2, q8_2);
+ __m256i p16_3 = _mm256_maddubs_epi16(q4_3, q8_3);
+ // After this subtraction each lane equals the pair sum of (quant - 32) * q8.
+ p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
+ p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
+ p16_2 = _mm256_sub_epi16(p16_2, q8s_2);
+ p16_3 = _mm256_sub_epi16(p16_3, q8s_3);
+ // Multiply by the sign-extended scales, adjacent pairs summed to 32-bit.
+ p16_0 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_0), p16_0);
+ p16_1 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_1), p16_1);
+ p16_2 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_2), p16_2);
+ p16_3 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_3), p16_3);
+
+ sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1));
+ sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_2, p16_3));
+
+ }
+ // Convert the integer sums of this block to float and accumulate them scaled by d.
+ acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
+ }
+
+ *s = hsum_float_8(acc); // hsum_float_8 is defined elsewhere (presumably the horizontal sum of the 8 lanes)
+
+#elif defined __AVX__
+ // AVX path: 128-bit integer vectors; the -32 offset is removed once per block via bsums instead of per product.
+ const __m128i m3 = _mm_set1_epi8(3); // 2-bit mask for the lowest high-bit pair
+ const __m128i m15 = _mm_set1_epi8(15); // selects the low nibble of every byte
+
+ __m256 acc = _mm256_setzero_ps();
+
+ for (int i = 0; i < nb; ++i) {
+ // Combined scale of the two blocks.
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
+
+ // handle the q6_k -32 offset separately using bsums
+ const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)y[i].bsums);
+ const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)y[i].bsums + 1);
+ const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
+ const __m128i scales_16_0 = _mm_cvtepi8_epi16(scales);
+ const __m128i scales_16_1 = _mm_cvtepi8_epi16(_mm_bsrli_si128(scales, 8));
+ const __m128i q8sclsub_0 = _mm_slli_epi32(_mm_madd_epi16(q8sums_0, scales_16_0), 5); // << 5 multiplies the scale-weighted bsums by 32
+ const __m128i q8sclsub_1 = _mm_slli_epi32(_mm_madd_epi16(q8sums_1, scales_16_1), 5);
+
+ __m128i sumi_0 = _mm_setzero_si128();
+ __m128i sumi_1 = _mm_setzero_si128();
+ // Index of the next scale shuffle; advanced by 4 per inner iteration.
+ int is = 0;
+
+ for (int j = 0; j < QK_K/128; ++j) {
+ // 32 bytes of high bits in two halves; each byte carries four 2-bit fields.
+ const __m128i q4bitsH_0 = _mm_loadu_si128((const __m128i*)qh); qh += 16;
+ const __m128i q4bitsH_1 = _mm_loadu_si128((const __m128i*)qh); qh += 16;
+ // Move each 2-bit field to bits 4..5: bits 0..1 shift left by 4, bits 2..3 left by 2, bits 4..5 stay, bits 6..7 shift right by 2.
+ const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4);
+ const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4);
+ const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(12)), 2);
+ const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(12)), 2);
+ const __m128i q4h_4 = _mm_and_si128(q4bitsH_0, _mm_set1_epi8(48));
+ const __m128i q4h_5 = _mm_and_si128(q4bitsH_1, _mm_set1_epi8(48));
+ const __m128i q4h_6 = _mm_srli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(-64)), 2);
+ const __m128i q4h_7 = _mm_srli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(-64)), 2);
+ // 64 bytes of low nibbles in four loads.
+ const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
+ const __m128i q4bits1_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
+ const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
+ const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
+ // Combine 4 low bits with 2 high bits into unsigned 6-bit values (offset handled via q8sclsub).
+ const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m15), q4h_0);
+ const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m15), q4h_1);
+ const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m15), q4h_2);
+ const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m15), q4h_3);
+ const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m15), q4h_4);
+ const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m15), q4h_5);
+ const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m15), q4h_6);
+ const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m15), q4h_7);
+
+ const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+ const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+ const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+ const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+ const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+ const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+ const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+ const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+ // Unsigned 6-bit quant * signed q8, adjacent pairs summed to 16-bit.
+ __m128i p16_0 = _mm_maddubs_epi16(q4_0, q8_0);
+ __m128i p16_1 = _mm_maddubs_epi16(q4_1, q8_1);
+ __m128i p16_2 = _mm_maddubs_epi16(q4_2, q8_2);
+ __m128i p16_3 = _mm_maddubs_epi16(q4_3, q8_3);
+ __m128i p16_4 = _mm_maddubs_epi16(q4_4, q8_4);
+ __m128i p16_5 = _mm_maddubs_epi16(q4_5, q8_5);
+ __m128i p16_6 = _mm_maddubs_epi16(q4_6, q8_6);
+ __m128i p16_7 = _mm_maddubs_epi16(q4_7, q8_7);
+ // get_scale_shuffle is defined elsewhere; presumably it returns the shuffle mask that spreads the scales for group is+k - confirm.
+ const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
+ const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
+ const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
+ const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
+ is += 4;
+ // Low 8 bytes of each shuffled scale vector weight the even product, the upper 8 bytes (bsrli by 8) the odd one.
+ p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0);
+ p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_0, 8)), p16_1);
+ p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2);
+ p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_1, 8)), p16_3);
+ p16_4 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_2), p16_4);
+ p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_2, 8)), p16_5);
+ p16_6 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_3), p16_6);
+ p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_3, 8)), p16_7);
+
+ sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
+ sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
+ sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_4, p16_6));
+ sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_5, p16_7));
+
+ }
+ // Remove the offset contribution (32 * scale-weighted bsums), then join, convert to float and accumulate scaled by d.
+ sumi_0 = _mm_sub_epi32(sumi_0, q8sclsub_0);
+ sumi_1 = _mm_sub_epi32(sumi_1, q8sclsub_1);
+ const __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi)), acc);
+ }
+
+ *s = hsum_float_8(acc); // hsum_float_8 is defined elsewhere (presumably the horizontal sum of the 8 lanes)
+
+#else
+ UNUSED(x);
+ UNUSED(y);
+ UNUSED(nb);
+ ggml_vec_dot_q6_K_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+// Sign table: 128 rows of 8 int8 values (+1/-1). Row k has -1 at position b (0..6) when bit b of k is set, and the eighth entry is chosen so that every row holds an even number of -1. Only compiled for AVX/AVX2; ggml_vec_dot_iq2_xxs_q8_K reads it as uint64_t rows indexed by 7-bit values.
+#if defined (__AVX__) || defined (__AVX2__)
+static const int8_t keven_signs_q2xs[1024] = {
+ 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1,
+ 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
+ 1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, -1,
+ 1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, 1,
+ 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1,
+ 1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, 1,
+ 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, 1,
+ 1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1,
+ 1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1,
+ 1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, 1,
+ 1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1,
+ 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1,
+ 1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, 1,
+ 1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, -1,
+ 1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, -1,
+ 1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 1,
+ 1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1,
+ 1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1,
+ 1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, 1,
+ 1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1,
+ 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, 1,
+ 1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, -1,
+ 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, -1,
+ 1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1,
+ 1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, 1,
+ 1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, -1,
+ 1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, -1,
+ 1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1,
+ 1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, -1,
+ 1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1,
+ 1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1,
+ 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
+};
+#endif
+
+void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+ assert(n % QK_K == 0); // n must be a whole number of QK_K-element super-blocks
+ assert(nrc == 1); // only nrc == 1 is handled here
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);
+ // Dot product over n elements: vx is read as block_iq2_xxs, vy as block_q8_K; the scalar result is written to *s.
+ const block_iq2_xxs * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
+
+ const int nb = n / QK_K; // number of super-blocks to process
+
+#if defined(__AVX2__)
+ // AVX2 path: each inner iteration consumes 64 q8 values (two groups of 32) and 16 bytes of x[i].qs.
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; // NOTE(review): presumably each 64-bit entry packs 8 one-byte +1/-1 signs; the table's element type is declared outside this chunk
+
+ uint32_t aux32[4]; // staging buffer for 16 bytes of packed x data
+ const uint8_t * aux8 = (const uint8_t *)aux32; // byte view of aux32, used for the grid indices
+
+ __m256 accumf = _mm256_setzero_ps();
+ for (int i = 0; i < nb; ++i) {
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; // product of the two per-block scales
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
+ __m256i sumi1 = _mm256_setzero_si256();
+ __m256i sumi2 = _mm256_setzero_si256();
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+ const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+ memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8; // aux8[0..3] / aux8[8..11]: grid indices; aux32[1] / aux32[3]: sign indices and scale
+ const __m256i q2_1 = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]); // _mm256_set_epi64x lists the highest lane first
+ const __m256i q2_2 = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
+ const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127], // four 7-bit sign-table indices taken from bits 0..27
+ signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
+ const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127],
+ signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]);
+ const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1); // signs are applied to q8: maddubs below takes its first operand as unsigned bytes
+ const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
+ const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); // 16-bit sums of adjacent (grid byte * signed q8) products
+ const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
+ const uint16_t ls1 = aux32[1] >> 28; // 4-bit scale stored in the top bits
+ const uint16_t ls2 = aux32[3] >> 28;
+ const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1)); // weight by 2*ls+1 and widen to 32-bit sums
+ const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
+ sumi1 = _mm256_add_epi32(sumi1, p1);
+ sumi2 = _mm256_add_epi32(sumi2, p2);
+ }
+ // Fold this super-block's integer sums into the float accumulator, scaled by d.
+ accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
+
+ }
+
+ *s = 0.125f * hsum_float_8(accumf); // NOTE(review): hsum_float_8 is defined outside this chunk; presumably it sums the 8 float lanes
+
+#elif defined(__AVX__)
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; // AVX path: same computation as the AVX2 branch on 128-bit integer halves (_0 = first 16 values of a group, _1 = next 16)
+
+ uint32_t aux32[4];
+ const uint8_t * aux8 = (const uint8_t *)aux32;
+
+ __m256 accumf = _mm256_setzero_ps();
+ for (int i = 0; i < nb; ++i) {
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
+ __m128i sumi1_0 = _mm_setzero_si128();
+ __m128i sumi1_1 = _mm_setzero_si128();
+ __m128i sumi2_0 = _mm_setzero_si128();
+ __m128i sumi2_1 = _mm_setzero_si128();
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+ const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8; // same 16-byte layout as in the AVX2 branch
+ const __m128i q2_1_0 = _mm_set_epi64x(iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
+ const __m128i q2_1_1 = _mm_set_epi64x(iq2xxs_grid[aux8[3]], iq2xxs_grid[aux8[2]]);
+ const __m128i q2_2_0 = _mm_set_epi64x(iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
+ const __m128i q2_2_1 = _mm_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]]);
+ const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
+ const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]);
+ const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]);
+ const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127]);
+ const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0);
+ const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1);
+ const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0);
+ const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1);
+ const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
+ const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
+ const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
+ const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
+ const uint16_t ls1 = aux32[1] >> 28;
+ const uint16_t ls2 = aux32[3] >> 28;
+ const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
+ const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
+ const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
+ const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
+ sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
+ sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
+ sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
+ sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
+ }
+ // Recombine the 128-bit halves and accumulate; a separate multiply and add is used here where the AVX2 branch uses fmadd.
+ accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
+
+ }
+
+ *s = 0.125f * hsum_float_8(accumf);
+
+#else
+ UNUSED(x); // neither AVX2 nor AVX: defer to the generic implementation
+ UNUSED(y);
+ UNUSED(nb);
+ ggml_vec_dot_iq2_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+
+void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+ assert(n % QK_K == 0); // n must be a whole number of QK_K-element super-blocks
+ assert(nrc == 1); // only nrc == 1 is handled here
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);
+ // Dot product over n elements: vx is read as block_iq2_xs, vy as block_q8_K; the scalar result is written to *s.
+ const block_iq2_xs * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
+
+ const int nb = n / QK_K; // number of super-blocks to process
+
+#if defined(__AVX2__)
+ // AVX2 path: each inner iteration consumes 128 q8 values (four groups of 32) and 16 uint16_t entries of x[i].qs.
+ const __m256i mone = _mm256_set1_epi8(1); // OR-ed into 0x00/0xFF sign masks to obtain +1/-1 operands for _mm256_sign_epi8
+ static const char block_sign_shuffle_mask_1[32] = { // replicates bytes 0, 2, 4, 6 of a 128-bit lane eight times each
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+ 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+ };
+ static const char block_sign_shuffle_mask_2[32] = { // same for bytes 8, 10, 12, 14
+ 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
+ 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
+ };
+ static const uint8_t bit_selector_mask_bytes[32] = { // byte k of every 8-byte group selects bit k
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ };
+
+ const __m256i bit_selector_mask = _mm256_loadu_si256((const __m256i*)bit_selector_mask_bytes);
+ const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_1);
+ const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2);
+ // 16-entry parity lookup (repeated for both lanes): 0x80 when the 4-bit index has an odd number of set bits, 0x00 otherwise.
+ static const uint8_t k_bit_helper[32] = {
+ 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+ 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+ };
+ const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper);
+ const __m256i m511 = _mm256_set1_epi16(511); // keeps the low 9 bits of each 16-bit entry (the grid index)
+ const __m128i m4 = _mm_set1_epi8(0xf); // nibble mask
+ const __m128i m1 = _mm_set1_epi8(1);
+
+ uint64_t aux64; // receives the 8 packed scale bytes of a super-block
+
+ // somewhat hacky, but gives a significant boost in performance
+ __m256i aux_gindex;
+ const uint16_t * gindex = (const uint16_t *)&aux_gindex; // scalar view of the 16 grid indices held in aux_gindex
+
+ __m256 accumf = _mm256_setzero_ps();
+ for (int i = 0; i < nb; ++i) {
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; // product of the two per-block scales
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
+ // Unpack the 8 scale bytes into 16 nibbles (low and high nibble interleaved) and map each nibble v to 2*v+1.
+ memcpy(&aux64, x[i].scales, 8);
+ __m128i stmp = _mm_set1_epi64x(aux64);
+ stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4));
+ const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1);
+
+ __m256i sumi1 = _mm256_setzero_si256();
+ __m256i sumi2 = _mm256_setzero_si256();
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) {
+ // The low 9 bits of every 16-bit entry index iq2xs_grid; the scalar lookups below read them back through gindex.
+ const __m256i q2_data = _mm256_loadu_si256((const __m256i*)q2); q2 += 16;
+ aux_gindex = _mm256_and_si256(q2_data, m511);
+ // The upper 7 bits are sign bits; XOR-ing them with their own top 3 bits folds the parity of all 7 into the low nibble.
+ const __m256i partial_sign_bits = _mm256_srli_epi16(q2_data, 9);
+ const __m256i partial_sign_bits_upper = _mm256_srli_epi16(q2_data, 13);
+ const __m256i partial_sign_bits_for_counting = _mm256_xor_si256(partial_sign_bits, partial_sign_bits_upper);
+ // bit_helper turns that parity into bit 7, which is OR-ed in as the eighth sign bit (each sign byte then has an even number of set bits).
+ const __m256i odd_bits = _mm256_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
+ const __m256i full_sign_bits = _mm256_or_si256(partial_sign_bits, odd_bits);
+
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+ const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+ const __m256i q8_3 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+ const __m256i q8_4 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+ // One 64-bit grid entry per 8 values; _mm256_set_epi64x lists the highest lane first.
+ const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[ 3]], iq2xs_grid[gindex[ 2]],
+ iq2xs_grid[gindex[ 1]], iq2xs_grid[gindex[ 0]]);
+ const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[ 7]], iq2xs_grid[gindex[ 6]],
+ iq2xs_grid[gindex[ 5]], iq2xs_grid[gindex[ 4]]);
+ const __m256i q2_3 = _mm256_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]],
+ iq2xs_grid[gindex[ 9]], iq2xs_grid[gindex[ 8]]);
+ const __m256i q2_4 = _mm256_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]],
+ iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
+ // Broadcast each 128-bit half of the sign bits to both lanes, since _mm256_shuffle_epi8 shuffles within 128-bit lanes only.
+ const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits);
+ const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1);
+ const __m256i full_signs_1 = MM256_SET_M128I(full_signs_l, full_signs_l);
+ const __m256i full_signs_2 = MM256_SET_M128I(full_signs_h, full_signs_h);
+ // For each group of 32: spread the sign bytes, test one bit per position (0xFF where set), then negate the matching q8 bytes.
+ __m256i signs;
+ signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1);
+ signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
+ const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone));
+
+ signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_2);
+ signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
+ const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone));
+
+ signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_1);
+ signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
+ const __m256i q8s_3 = _mm256_sign_epi8(q8_3, _mm256_or_si256(signs, mone));
+
+ signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_2);
+ signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
+ const __m256i q8s_4 = _mm256_sign_epi8(q8_4, _mm256_or_si256(signs, mone));
+ // maddubs multiplies unsigned bytes (first operand, the grid values) by signed bytes (second operand) into 16-bit pair sums.
+ const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
+ const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
+ const __m256i dot3 = _mm256_maddubs_epi16(q2_3, q8s_3);
+ const __m256i dot4 = _mm256_maddubs_epi16(q2_4, q8s_4);
+ // NOTE(review): get_scale_shuffle is defined outside this chunk; presumably it selects the scales belonging to group ib32+k. Confirm.
+ const __m256i sc1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0)));
+ const __m256i sc2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1)));
+ const __m256i sc3 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2)));
+ const __m256i sc4 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3)));
+ // Scale and widen to 32 bits; groups 1 and 3 accumulate into sumi1, groups 2 and 4 into sumi2.
+ sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot1, sc1));
+ sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot2, sc2));
+ sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot3, sc3));
+ sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot4, sc4));
+ }
+ // Fold this super-block's integer sums into the float accumulator, scaled by d.
+ accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
+
+ }
+
+ *s = 0.125f * hsum_float_8(accumf); // NOTE(review): hsum_float_8 is defined outside this chunk; presumably it sums the 8 float lanes
+
+#elif defined(__AVX__)
+ const __m128i mone = _mm_set1_epi8(1); // AVX path: same computation as the AVX2 branch, carried out on 128-bit integer halves (_0 low, _1 high)
+ static const char block_sign_shuffle_mask_1[32] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+ 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+ };
+ static const char block_sign_shuffle_mask_2[32] = {
+ 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
+ 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
+ };
+ static const uint8_t bit_selector_mask_bytes[32] = {
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ };
+ // Each 32-byte table is loaded as two 16-byte halves; "+ 1" on the __m128i pointer addresses the second half.
+ const __m128i bit_selector_mask_0 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes);
+ const __m128i bit_selector_mask_1 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes + 1);
+ const __m128i block_sign_shuffle_1_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1);
+ const __m128i block_sign_shuffle_1_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1 + 1);
+ const __m128i block_sign_shuffle_2_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2);
+ const __m128i block_sign_shuffle_2_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2 + 1);
+ // 16-entry parity lookup (stored twice): 0x80 when the 4-bit index has an odd number of set bits, 0x00 otherwise.
+ static const uint8_t k_bit_helper[32] = {
+ 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+ 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+ };
+ const __m128i bit_helper_0 = _mm_loadu_si128((const __m128i*)k_bit_helper);
+ const __m128i bit_helper_1 = _mm_loadu_si128((const __m128i*)k_bit_helper + 1);
+ const __m128i m511 = _mm_set1_epi16(511); // keeps the low 9 bits of each 16-bit entry (the grid index)
+ const __m128i m4 = _mm_set1_epi8(0xf); // nibble mask
+ const __m128i m1 = _mm_set1_epi8(1);
+
+ uint64_t aux64;
+
+ // somewhat hacky, but gives a significant boost in performance
+ __m256i aux_gindex;
+ const uint16_t * gindex = (const uint16_t *)&aux_gindex;
+
+ __m256 accumf = _mm256_setzero_ps();
+ for (int i = 0; i < nb; ++i) {
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
+ // Unpack the 8 scale bytes into 16 nibbles (low and high nibble interleaved) and map each nibble v to 2*v+1.
+ memcpy(&aux64, x[i].scales, 8);
+ __m128i stmp = _mm_set1_epi64x(aux64);
+ stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4));
+ const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1);
+
+ __m128i sumi1_0 = _mm_setzero_si128();
+ __m128i sumi1_1 = _mm_setzero_si128();
+ __m128i sumi2_0 = _mm_setzero_si128();
+ __m128i sumi2_1 = _mm_setzero_si128();
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) {
+ // Load 16 uint16_t entries as two halves; their low 9 bits are stored in aux_gindex for the scalar grid lookups.
+ const __m128i q2_data_0 = _mm_loadu_si128((const __m128i*)q2);
+ const __m128i q2_data_1 = _mm_loadu_si128((const __m128i*)q2 + 1); q2 += 16;
+ aux_gindex = MM256_SET_M128I(_mm_and_si128(q2_data_1, m511), _mm_and_si128(q2_data_0, m511));
+ // Upper 7 bits are sign bits; the XOR folds their parity into the low nibble for the bit_helper lookup.
+ const __m128i partial_sign_bits_0 = _mm_srli_epi16(q2_data_0, 9);
+ const __m128i partial_sign_bits_1 = _mm_srli_epi16(q2_data_1, 9);
+ const __m128i partial_sign_bits_upper_0 = _mm_srli_epi16(q2_data_0, 13);
+ const __m128i partial_sign_bits_upper_1 = _mm_srli_epi16(q2_data_1, 13);
+ const __m128i partial_sign_bits_for_counting_0 = _mm_xor_si128(partial_sign_bits_0, partial_sign_bits_upper_0);
+ const __m128i partial_sign_bits_for_counting_1 = _mm_xor_si128(partial_sign_bits_1, partial_sign_bits_upper_1);
+ // The parity bit (bit 7) is OR-ed in as the eighth sign bit.
+ const __m128i odd_bits_0 = _mm_shuffle_epi8(bit_helper_0, partial_sign_bits_for_counting_0);
+ const __m128i odd_bits_1 = _mm_shuffle_epi8(bit_helper_1, partial_sign_bits_for_counting_1);
+ const __m128i full_sign_bits_0 = _mm_or_si128(partial_sign_bits_0, odd_bits_0);
+ const __m128i full_sign_bits_1 = _mm_or_si128(partial_sign_bits_1, odd_bits_1);
+
+ const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_3_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_3_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_4_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_4_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ // Two 64-bit grid entries per 16 values; _mm_set_epi64x lists the high element first.
+ const __m128i q2_1_0 = _mm_set_epi64x(iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]);
+ const __m128i q2_1_1 = _mm_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]]);
+ const __m128i q2_2_0 = _mm_set_epi64x(iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]);
+ const __m128i q2_2_1 = _mm_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]]);
+ const __m128i q2_3_0 = _mm_set_epi64x(iq2xs_grid[gindex[9]], iq2xs_grid[gindex[8]]);
+ const __m128i q2_3_1 = _mm_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]]);
+ const __m128i q2_4_0 = _mm_set_epi64x(iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
+ const __m128i q2_4_1 = _mm_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]]);
+
+ // AVX2 full_signs_1 is full_sign_bits_0 here
+ // AVX2 full_signs_2 is full_sign_bits_1 here
+ __m128i signs_0, signs_1;
+ signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_0);
+ signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_1);
+ signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
+ signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
+ const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, _mm_or_si128(signs_0, mone));
+ const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, _mm_or_si128(signs_1, mone));
+
+ signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_0);
+ signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_1);
+ signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
+ signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
+ const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, _mm_or_si128(signs_0, mone));
+ const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, _mm_or_si128(signs_1, mone));
+
+ signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_0);
+ signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_1);
+ signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
+ signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
+ const __m128i q8s_3_0 = _mm_sign_epi8(q8_3_0, _mm_or_si128(signs_0, mone));
+ const __m128i q8s_3_1 = _mm_sign_epi8(q8_3_1, _mm_or_si128(signs_1, mone));
+
+ signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_0);
+ signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_1);
+ signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
+ signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
+ const __m128i q8s_4_0 = _mm_sign_epi8(q8_4_0, _mm_or_si128(signs_0, mone));
+ const __m128i q8s_4_1 = _mm_sign_epi8(q8_4_1, _mm_or_si128(signs_1, mone));
+ // maddubs multiplies unsigned bytes (first operand, the grid values) by signed bytes (second operand) into 16-bit pair sums.
+ const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
+ const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
+ const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
+ const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
+ const __m128i dot3_0 = _mm_maddubs_epi16(q2_3_0, q8s_3_0);
+ const __m128i dot3_1 = _mm_maddubs_epi16(q2_3_1, q8s_3_1);
+ const __m128i dot4_0 = _mm_maddubs_epi16(q2_4_0, q8s_4_0);
+ const __m128i dot4_1 = _mm_maddubs_epi16(q2_4_1, q8s_4_1);
+ // Low 8 bytes of each shuffled scale vector feed the _0 half, the high 8 bytes (shifted down) feed the _1 half.
+ __m128i sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0));
+ const __m128i sc1_0 = _mm_cvtepi8_epi16(sc_tmp);
+ const __m128i sc1_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
+ sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1));
+ const __m128i sc2_0 = _mm_cvtepi8_epi16(sc_tmp);
+ const __m128i sc2_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
+ sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2));
+ const __m128i sc3_0 = _mm_cvtepi8_epi16(sc_tmp);
+ const __m128i sc3_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
+ sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3));
+ const __m128i sc4_0 = _mm_cvtepi8_epi16(sc_tmp);
+ const __m128i sc4_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
+
+ sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot1_0, sc1_0));
+ sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot1_1, sc1_1));
+ sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot2_0, sc2_0));
+ sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot2_1, sc2_1));
+ sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot3_0, sc3_0));
+ sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot3_1, sc3_1));
+ sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot4_0, sc4_0));
+ sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot4_1, sc4_1));
+ }
+ // Recombine the 128-bit halves and accumulate; a separate multiply and add is used here where the AVX2 branch uses fmadd.
+ accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
+
+ }
+
+ *s = 0.125f * hsum_float_8(accumf);
+
+#else
+ UNUSED(x); // neither AVX2 nor AVX: defer to the generic implementation
+ UNUSED(y);
+ UNUSED(nb);
+ ggml_vec_dot_iq2_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+
+void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+ assert(n % QK_K == 0); // n must be a whole number of QK_K-element super-blocks
+ assert(nrc == 1); // only nrc == 1 is handled here
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);
+ // Dot product over n elements: vx is read as block_iq2_s, vy as block_q8_K; the scalar result is written to *s.
+ const block_iq2_s * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
+
+ const int nb = n / QK_K; // number of super-blocks to process
+
+#if defined(__AVX2__)
+ // AVX2 path: each inner iteration consumes 64 q8 values, 8 bytes of qs, two bytes of qh and four 16-bit sign words.
+ static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+ }; // shuffle control: spreads bytes 0..3 of a 128-bit lane over 8 positions each
+
+ static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ }; // byte k of every 8-byte group selects bit k
+
+ const __m128i m4 = _mm_set1_epi8(0xf); // nibble mask
+ const __m128i m1 = _mm_set1_epi8(1);
+
+ const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
+ const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
+
+ uint64_t aux64; // receives the 8 packed scale bytes of a super-block
+
+ __m256 accumf = _mm256_setzero_ps();
+ for (int i = 0; i < nb; ++i) {
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; // product of the two per-block scales
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); // sign bits start QK_K/8 bytes into qs and are read as 16-bit words
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
+ // Low nibbles of the 8 scale bytes go to bytes 0..7, high nibbles to bytes 8..15; each nibble v becomes 2*v+1.
+ memcpy(&aux64, x[i].scales, 8);
+ const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
+ const __m256i scales16 = _mm256_cvtepi8_epi16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15
+
+ __m256i sumi1 = _mm256_setzero_si256();
+ __m256i sumi2 = _mm256_setzero_si256();
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+ const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+ const __m256i q2_1 = _mm256_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
+ iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)],
+ iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
+ iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
+ const __m256i q2_2 = _mm256_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
+ iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)],
+ iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
+ iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
+ qs += 8; // each grid index above is one qs byte plus a 2-bit field of qh placed in bits 8..9
+ // Broadcast 32 sign bits, route byte j to positions 8j..8j+7 (mask1), isolate bit k per position (mask2): 0xFF where the sign bit is set.
+ __m256i aux256 = _mm256_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16));
+ aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
+ const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
+ const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1); // (q8 ^ m) - m negates the bytes where m is 0xFF and leaves the others unchanged
+
+ aux256 = _mm256_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16));
+ aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
+ const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
+ const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
+
+ signs += 4;
+ // maddubs multiplies unsigned bytes (first operand, the grid values) by signed bytes (second operand) into 16-bit pair sums.
+ const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1
+ const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3
+ // NOTE(review): get_scale_shuffle_k4 is defined outside this chunk; presumably it selects the scale pair of the given group. Confirm.
+ const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+0)));
+ const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_shuffle_epi8(scales16, get_scale_shuffle_k4(ib32+1)));
+ sumi1 = _mm256_add_epi32(sumi1, p1);
+ sumi2 = _mm256_add_epi32(sumi2, p2);
+ }
+ // Fold this super-block's integer sums into the float accumulator, scaled by d.
+ accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
+
+ }
+
+ *s = 0.125f * hsum_float_8(accumf); // NOTE(review): hsum_float_8 is defined outside this chunk; presumably it sums the 8 float lanes
+
+#elif defined(__AVX__)
+ static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+ }; // AVX path: same computation as the AVX2 branch on 128-bit integer halves (_0 low, _1 high)
+
+ static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ };
+
+ const __m128i m4 = _mm_set1_epi8(0xf); // nibble mask
+ const __m128i m1 = _mm_set1_epi8(1);
+ // Each 32-byte mask is loaded as two 16-byte halves; "+ 1" on the __m128i pointer addresses the second half.
+ const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1);
+ const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1);
+ const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2);
+ const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1);
+
+ uint64_t aux64;
+
+ __m256 accumf = _mm256_setzero_ps();
+ for (int i = 0; i < nb; ++i) {
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); // sign bits start QK_K/8 bytes into qs and are read as 16-bit words
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
+ // Low nibbles of the 8 scale bytes go to bytes 0..7 (scales16_0), high nibbles to bytes 8..15 (scales16_1); each nibble v becomes 2*v+1.
+ memcpy(&aux64, x[i].scales, 8);
+ const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
+ const __m128i scales16_0 = _mm_cvtepi8_epi16(scales8);
+ const __m128i scales16_1 = _mm_cvtepi8_epi16(_mm_srli_si128(scales8, 8));
+
+ __m128i sumi1_0 = _mm_setzero_si128();
+ __m128i sumi1_1 = _mm_setzero_si128();
+ __m128i sumi2_0 = _mm_setzero_si128();
+ __m128i sumi2_1 = _mm_setzero_si128();
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+ const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q2_1_0 = _mm_set_epi64x(iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
+ iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
+ const __m128i q2_1_1 = _mm_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
+ iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)]);
+ const __m128i q2_2_0 = _mm_set_epi64x(iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
+ iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
+ const __m128i q2_2_1 = _mm_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
+ iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)]);
+ qs += 8; // each grid index above is one qs byte plus a 2-bit field of qh placed in bits 8..9
+ // Broadcast 32 sign bits; mask1_0 spreads bytes 0 and 1, mask1_1 bytes 2 and 3; mask2 isolates one bit per position (0xFF where set).
+ __m128i aux128_0 = _mm_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16));
+ __m128i aux128_1 = aux128_0;
+ aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
+ aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
+ const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
+ const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
+ const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0); // (q8 ^ m) - m negates the bytes where m is 0xFF
+ const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1);
+
+ aux128_0 = _mm_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16));
+ aux128_1 = aux128_0;
+ aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
+ aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
+ const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
+ const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
+ const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0);
+ const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1);
+
+ signs += 4;
+ // maddubs multiplies unsigned bytes (first operand, the grid values) by signed bytes (second operand) into 16-bit pair sums.
+ const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
+ const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
+ const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
+ const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
+ // The 256-bit shuffle control from get_scale_shuffle_k4 is split: half 0 drives scales16_0, half 1 drives scales16_1.
+ const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 0)));
+ const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 1)));
+ const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 0)));
+ const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 1)));
+ sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
+ sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
+ sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
+ sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
+ }
+ // Recombine the 128-bit halves and accumulate; a separate multiply and add is used here where the AVX2 branch uses fmadd.
+ accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
+
+ }
+
+ *s = 0.125f * hsum_float_8(accumf);
+
+#else
+ UNUSED(x); // neither AVX2 nor AVX: defer to the generic implementation
+ UNUSED(y);
+ UNUSED(nb);
+ ggml_vec_dot_iq2_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+
+void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+ assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);
+
+ const block_iq3_xxs * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
+
+ const int nb = n / QK_K;
+
+#if defined(__AVX2__)
+
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+ uint32_t aux32[2];
+
+ __m256 accumf = _mm256_setzero_ps();
+ for (int i = 0; i < nb; ++i) {
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
+ __m256i sumi1 = _mm256_setzero_si256();
+ __m256i sumi2 = _mm256_setzero_si256();
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+ const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+ const __m256i q2_1 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
+ iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
+ q3 += 8;
+ const __m256i q2_2 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
+ iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
+ q3 += 8;
+ memcpy(aux32, gas, 8); gas += 8;
+ const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127],
+ signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]);
+ const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
+ signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
+ const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
+ const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
+ const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
+ const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
+ const uint16_t ls1 = aux32[0] >> 28;
+ const uint16_t ls2 = aux32[1] >> 28;
+ const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
+ const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
+ sumi1 = _mm256_add_epi32(sumi1, p1);
+ sumi2 = _mm256_add_epi32(sumi2, p2);
+ }
+
+ accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
+
+ }
+
+ *s = 0.25f * hsum_float_8(accumf);
+
+#elif defined(__AVX__)
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+ uint32_t aux32[2];
+
+ __m256 accumf = _mm256_setzero_ps();
+ for (int i = 0; i < nb; ++i) {
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
+ __m128i sumi1_0 = _mm_setzero_si128();
+ __m128i sumi1_1 = _mm_setzero_si128();
+ __m128i sumi2_0 = _mm_setzero_si128();
+ __m128i sumi2_1 = _mm_setzero_si128();
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+ const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q2_1_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
+ const __m128i q2_1_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]);
+ q3 += 8;
+ const __m128i q2_2_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
+ const __m128i q2_2_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]);
+ q3 += 8;
+ memcpy(aux32, gas, 8); gas += 8;
+ const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]);
+ const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127]);
+ const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
+ const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]);
+ const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0);
+ const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1);
+ const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0);
+ const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1);
+ const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
+ const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
+ const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
+ const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
+ const uint16_t ls1 = aux32[0] >> 28;
+ const uint16_t ls2 = aux32[1] >> 28;
+ const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
+ const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
+ const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
+ const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
+ sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
+ sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
+ sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
+ sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
+ }
+
+ accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
+
+ }
+
+ *s = 0.25f * hsum_float_8(accumf);
+
+#else
+ UNUSED(x);
+ UNUSED(y);
+ UNUSED(nb);
+ ggml_vec_dot_iq3_xxs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+
+// Dot product of an IQ3_S-quantized row (vx) with a Q8_K-quantized row (vy); writes one float to *s.
+//
+// Parameters:
+//   n           number of values in each row; must be a multiple of QK_K (asserted)
+//   s           receives the result
+//   vx          array of n/QK_K block_iq3_s
+//   vy          array of n/QK_K block_q8_K
+//   bs, bx, by  unused by this implementation
+//   nrc         must be 1 (asserted)
+//
+// Per super-block, every 32-value sub-block is decoded as follows:
+//   - each qs byte plus one bit taken from qh forms a 9-bit index into iq3s_grid; each grid entry
+//     is loaded as a 32-bit lane and consumed as four unsigned bytes by maddubs,
+//   - each bit of x[i].signs marks one value whose q8 counterpart is negated (the sign is applied
+//     to q8 because the grid bytes are the unsigned operand of maddubs),
+//   - a 4-bit scale code ls from x[i].scales weights the sub-block by 2*ls+1.
+// The integer sum of a super-block is scaled by fp16(x[i].d) * y[i].d and accumulated in float.
+//
+// AVX2 and AVX variants are implemented here; other targets use ggml_vec_dot_iq3_s_q8_K_generic.
+void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+
+    const block_iq3_s * GGML_RESTRICT x = vx;
+    const block_q8_K * GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_K;
+
+#if defined(__AVX2__)
+
+    // Byte-shuffle control: output bytes 0-7 take byte 0 of the 32-bit sign word, bytes 8-15 take
+    // byte 1, and so on, so every output byte sees the sign byte that contains its own bit.
+    static const uint8_t k_mask1[32] = {
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+    };
+
+    // Bit selector: within each group of 8 bytes, byte k keeps only bit k of its sign byte.
+    static const uint8_t k_mask2[32] = {
+        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+    };
+
+    const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1);
+    const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2);
+
+    // Lane k is shifted left by (8 - k) so that bit k of the qh byte lands on bit 8 (value 256),
+    // which idx_mask then isolates as the 9th index bit.
+    const __m256i idx_shift = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+    const __m256i idx_mask  = _mm256_set1_epi32(256);
+
+    // Lets the vector-computed indices be read back as scalars for the table lookups.
+    typedef union {
+        __m256i  vec[2];
+        uint32_t index[16];
+    } index_t;
+
+    index_t idx;
+
+    __m256 accumf = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * GGML_RESTRICT qs = x[i].qs;
+        const uint8_t * GGML_RESTRICT qh = x[i].qh;
+        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
+        __m256i sumi1 = _mm256_setzero_si256();
+        __m256i sumi2 = _mm256_setzero_si256();
+        // Two 32-value sub-blocks per iteration.
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+            // 16 low index bytes (8 per sub-block), widened to 16 bits.
+            const __m256i idx_l = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)qs)); qs += 16;
+            idx.vec[0] = _mm256_set1_epi32(qh[ib32+0]);
+            idx.vec[1] = _mm256_set1_epi32(qh[ib32+1]);
+            idx.vec[0] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[0], idx_shift), idx_mask);
+            idx.vec[1] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[1], idx_shift), idx_mask);
+            idx.vec[0] = _mm256_or_si256(idx.vec[0], _mm256_cvtepi16_epi32(_mm256_castsi256_si128(idx_l)));
+            idx.vec[1] = _mm256_or_si256(idx.vec[1], _mm256_cvtepi16_epi32(_mm256_extractf128_si256(idx_l, 1)));
+
+            // At least on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange.
+            //const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4);
+            //const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4);
+            const __m256i q2_1 = _mm256_set_epi32(
+                    iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]],
+                    iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]
+            );
+            const __m256i q2_2 = _mm256_set_epi32(
+                    iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]],
+                    iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]]
+            );
+
+            // Build the 32-bit sign word for the first sub-block. The upper half is widened to
+            // uint32_t before shifting: signs[k] would otherwise promote to int, and shifting a
+            // value >= 0x8000 left by 16 overflows a signed int (undefined behaviour).
+            __m256i aux256 = _mm256_set1_epi32((int)(signs[0] | ((uint32_t)signs[1] << 16)));
+            aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
+            // 0xFF where the sign bit is set, 0x00 otherwise.
+            const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2);
+            // (v ^ m) - m negates v where m == 0xFF and leaves it unchanged where m == 0.
+            const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1);
+
+            // Same for the second sub-block.
+            aux256 = _mm256_set1_epi32((int)(signs[2] | ((uint32_t)signs[3] << 16)));
+            aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2);
+            const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2);
+            const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2);
+
+            signs += 4;
+
+            // Unsigned grid bytes times signed (sign-adjusted) q8 bytes, pairwise summed to 16 bits.
+            const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
+            const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
+            // One scales byte carries the codes of both sub-blocks: low nibble first, high nibble second.
+            const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
+            const uint16_t ls2 = x[i].scales[ib32/2] >>  4;
+            const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
+            const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
+            sumi1 = _mm256_add_epi32(sumi1, p1);
+            sumi2 = _mm256_add_epi32(sumi2, p2);
+        }
+
+        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
+
+    }
+
+    *s = hsum_float_8(accumf);
+
+#elif defined(__AVX__)
+    // Same algorithm as the AVX2 path, carried out on 128-bit integer halves.
+    static const uint8_t k_mask1[32] = {
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+    };
+
+    static const uint8_t k_mask2[32] = {
+        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+    };
+
+    const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1);
+    const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1);
+    const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2);
+    const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1);
+
+    // Multiplying by a power of two plays the role of the per-lane shifts used in the AVX2 path:
+    // lanes 0..3 are multiplied by 256..32 and lanes 4..7 by 16..2, moving bit k to bit 8.
+    const __m128i idx_mul_0 = _mm_set_epi32(32, 64, 128, 256);
+    const __m128i idx_mul_1 = _mm_set_epi32(2, 4, 8, 16);
+    const __m128i idx_mask  = _mm_set1_epi32(256);
+
+    // Lets the vector-computed indices be read back as scalars for the table lookups.
+    typedef union {
+        __m128i  vec[4];
+        uint32_t index[16];
+    } index_t;
+
+    index_t idx;
+
+    __m256 accumf = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * GGML_RESTRICT qs = x[i].qs;
+        const uint8_t * GGML_RESTRICT qh = x[i].qh;
+        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
+        const int8_t * GGML_RESTRICT q8 = y[i].qs;
+        __m128i sumi1_0 = _mm_setzero_si128();
+        __m128i sumi1_1 = _mm_setzero_si128();
+        __m128i sumi2_0 = _mm_setzero_si128();
+        __m128i sumi2_1 = _mm_setzero_si128();
+        // Two 32-value sub-blocks per iteration, each split into two 16-byte halves (_0 / _1).
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i qs_tmp = _mm_loadu_si128((const __m128i *)qs);
+            const __m128i idx_l_0 = _mm_cvtepu8_epi16(qs_tmp);
+            const __m128i idx_l_1 = _mm_cvtepu8_epi16(_mm_srli_si128(qs_tmp, 8)); qs += 16;
+            idx.vec[0] = _mm_set1_epi32(qh[ib32+0]);
+            idx.vec[1] = idx.vec[0];
+            idx.vec[2] = _mm_set1_epi32(qh[ib32+1]);
+            idx.vec[3] = idx.vec[2];
+
+            idx.vec[0] = _mm_and_si128(_mm_mullo_epi32(idx.vec[0], idx_mul_0), idx_mask);
+            idx.vec[1] = _mm_and_si128(_mm_mullo_epi32(idx.vec[1], idx_mul_1), idx_mask);
+            idx.vec[2] = _mm_and_si128(_mm_mullo_epi32(idx.vec[2], idx_mul_0), idx_mask);
+            idx.vec[3] = _mm_and_si128(_mm_mullo_epi32(idx.vec[3], idx_mul_1), idx_mask);
+
+            idx.vec[0] = _mm_or_si128(idx.vec[0], _mm_cvtepi16_epi32(idx_l_0));
+            idx.vec[1] = _mm_or_si128(idx.vec[1], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_0, 8)));
+            idx.vec[2] = _mm_or_si128(idx.vec[2], _mm_cvtepi16_epi32(idx_l_1));
+            idx.vec[3] = _mm_or_si128(idx.vec[3], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_1, 8)));
+
+            const __m128i q2_1_0 = _mm_set_epi32(iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]);
+            const __m128i q2_1_1 = _mm_set_epi32(iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]]);
+            const __m128i q2_2_0 = _mm_set_epi32(iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[9]], iq3s_grid[idx.index[8]]);
+            const __m128i q2_2_1 = _mm_set_epi32(iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]]);
+
+            // Upper half widened to uint32_t before shifting: signs[k] would otherwise promote to
+            // int and a value >= 0x8000 shifted left by 16 overflows it (undefined behaviour).
+            __m128i aux128_0 = _mm_set1_epi32((int)(signs[0] | ((uint32_t)signs[1] << 16)));
+            __m128i aux128_1 = aux128_0;
+            aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
+            aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
+            const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
+            const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
+            // (v ^ m) - m negates v where m == 0xFF and leaves it unchanged where m == 0.
+            const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0);
+            const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1);
+
+            aux128_0 = _mm_set1_epi32((int)(signs[2] | ((uint32_t)signs[3] << 16)));
+            aux128_1 = aux128_0;
+            aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
+            aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
+            const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
+            const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
+            const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0);
+            const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1);
+
+            signs += 4;
+
+            const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
+            const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
+            const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
+            const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
+            // One scales byte carries the codes of both sub-blocks: low nibble first, high nibble second.
+            const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
+            const uint16_t ls2 = x[i].scales[ib32/2] >>  4;
+            const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
+            const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
+            const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
+            const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
+            sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
+            sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
+            sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
+            sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
+        }
+
+        accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
+
+    }
+
+    *s = hsum_float_8(accumf);
+
+#else
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(nb);
+    ggml_vec_dot_iq3_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+
+// Dot product of an IQ1_S-quantized row (vx) with a Q8_K-quantized row (vy); writes one float to *s.
+//
+// n must be a multiple of QK_K and nrc must be 1 (both asserted); bs, bx and by are unused.
+// AVX2 and AVX variants are implemented here; other targets use ggml_vec_dot_iq1_s_q8_K_generic.
+//
+// As read by this function, each 16-bit qh word describes one 32-value sub-block (4 qs bytes):
+//   bits 0-11   three extra index bits for each of the four qs bytes; the 11-bit value
+//               (qs byte | 3 bits << 8) indexes iq1s_grid, whose entries are loaded as 64-bit lanes
+//   bits 12-14  scale code v, applied as the odd weight 2*v+1
+//   bit 15      sign of the delta correction term for that sub-block
+// Result: sum over blocks of d * (scaled grid/q8 dot products) + IQ1S_DELTA * d * sumi1,
+// where d = y[i].d * fp16(x[i].d) and sumi1 is built from y[i].bsums.
+void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+ assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);
+
+ const block_iq1_s * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
+
+ const int nb = n / QK_K;
+
+#if defined __AVX2__
+
+ // accum holds the vector part (grid values times q8); accum1 holds the scalar delta part.
+ __m256 accum = _mm256_setzero_ps();
+ float accum1 = 0;
+ for (int i = 0; i < nb; ++i) {
+
+ const int8_t * q8 = y[i].qs;
+ const uint8_t * qs = x[i].qs;
+ const uint16_t * qh = x[i].qh;
+
+ __m256i sumi = _mm256_setzero_si256();
+ int sumi1 = 0;
+ // Two 32-value sub-blocks (8 qs bytes, 2 qh words) per iteration.
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
+#ifdef __BMI2__
+ // pdep places the four qs bytes in the low byte of each 16-bit field and three qh bits
+ // in bits 8-10 of each field, yielding four 11-bit grid indices packed in 64 bits.
+ // NOTE(review): qs is a uint8_t pointer read through a uint32_t lvalue here (type-punned,
+ // possibly unaligned load); a memcpy would avoid relying on that — confirm whether intended.
+ const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib], 0x700070007000700ULL);
+ const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib + 1], 0x700070007000700ULL);
+ const uint16_t *idx1 = (const uint16_t *)(&packed_idx1);
+ const uint16_t *idx2 = (const uint16_t *)(&packed_idx2);
+ const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]);
+ const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]);
+#else
+ // Scalar index assembly: the shifts move qh bits 0-2, 3-5, 6-8 and 9-11 to bits 8-10.
+ const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)],
+ iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
+ const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)],
+ iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
+#endif
+ qs += 8;
+ const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+ const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+
+ // mul_add_epi8 is a helper defined elsewhere in this file; its result is fed to
+ // _mm256_madd_epi16 below, i.e. it is treated as 16-bit partial sums.
+ const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
+ const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
+ // Scale code from qh bits 12-14, mapped to the odd weight 2*v+1.
+ const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
+ const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
+ const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(ls1));
+ const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(ls2));
+
+ sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p1, p2));
+ // Delta term: two bsums entries per sub-block, signed by qh bit 15 and weighted by the scale.
+ // NOTE(review): bsums presumably holds sums of 16 consecutive q8 values, so two entries
+ // cover one 32-value sub-block — confirm against block_q8_K.
+ sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
+ + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
+ }
+
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+ accum = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi), accum);
+ accum1 += d * sumi1;
+
+ }
+
+ *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
+
+#elif defined __AVX__
+ // Same computation as the AVX2 path with the integer work done on 128-bit halves (_0 / _1).
+ __m256 accum = _mm256_setzero_ps();
+ float accum1 = 0;
+ for (int i = 0; i < nb; ++i) {
+
+ const int8_t * q8 = y[i].qs;
+ const uint8_t * qs = x[i].qs;
+ const uint16_t * qh = x[i].qh;
+
+ __m128i sumi1_0 = _mm_setzero_si128();
+ __m128i sumi1_1 = _mm_setzero_si128();
+ int sumi1 = 0;
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
+ // Each 128-bit vector holds two grid entries (16 values); the shifts move the matching
+ // 3-bit field of qh to bits 8-10 of the index.
+ const __m128i q1b_1_0 = _mm_set_epi64x(iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
+ const __m128i q1b_1_1 = _mm_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)]);
+ const __m128i q1b_2_0 = _mm_set_epi64x(iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
+ const __m128i q1b_2_1 = _mm_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)]);
+ qs += 8;
+ const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+
+ // mul_add_epi8_sse is a helper defined elsewhere in this file (128-bit counterpart of
+ // the helper used in the AVX2 path).
+ const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0);
+ const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1);
+ const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0);
+ const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1);
+ // Scale code from qh bits 12-14, mapped to the odd weight 2*v+1.
+ const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
+ const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
+ const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(ls1));
+ const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(ls1));
+ const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(ls2));
+ const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(ls2));
+
+ sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0));
+ sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1));
+ // Delta term: two bsums entries per sub-block, signed by qh bit 15 and weighted by the scale.
+ sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
+ + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
+ }
+
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+ // Recombine the two 128-bit integer halves into one 256-bit vector before converting to float.
+ accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum);
+ accum1 += d * sumi1;
+
+ }
+
+ *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
+
+#else
+ // No SIMD path compiled in: silence unused-variable warnings and use the generic kernel.
+ UNUSED(x);
+ UNUSED(y);
+ UNUSED(nb);
+ ggml_vec_dot_iq1_s_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+
+// Dot product of an IQ1_M-quantized row (vx) with a Q8_K-quantized row (vy); writes one float to *s.
+//
+// n must be a multiple of QK_K and nrc must be 1 (both asserted); bs, bx and by are unused.
+// AVX2 and AVX variants are implemented here; other targets use ggml_vec_dot_iq1_m_q8_K_generic.
+//
+// As read by this function, per super-block:
+//   - x[i].scales is viewed as four 16-bit words. The top 4 bits of each word are concatenated
+//     into a 16-bit pattern that is read back as the fp16 super-block scale (scale.f16); the low
+//     12 bits of each word hold four 3-bit scale codes v (shifts 0, 3, 6, 9), applied as 2*v+1.
+//   - each qh nibble belongs to one group of 8 values: its low 3 bits extend the qs byte to an
+//     11-bit iq1s_grid index, its bit 3 selects the sign of the delta correction for that group.
+// Two sums are kept: grid values times q8 (accum1) and signed q8 sums (accum2); the result is
+// hsum(accum1) + IQ1M_DELTA * hsum(accum2).
+void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+ assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);
+
+ const block_iq1_m * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
+
+ const int nb = n / QK_K;
+
+ // Written through .u16 and read through .f16 to reinterpret the assembled bits as fp16.
+ iq1m_scale_t scale;
+
+#if defined __AVX2__
+
+ const __m256i mask = _mm256_set1_epi16(0x7);
+ const __m256i mone = _mm256_set1_epi16(1);
+ const __m256i mone8 = _mm256_set1_epi8(1);
+ const __m256i mtwo8 = _mm256_set1_epi8(2);
+ // VPSHUFB cannot cross 128-bit lanes so odd shifts go to upper half.
+ // 64-bit lanes 0..3 are shifted by 0, 6, 3, 9: the low 128-bit half holds the codes at
+ // shifts 0 and 6, the high half those at shifts 3 and 9.
+ const __m256i scales_shift = _mm256_set_epi64x(9, 3, 6, 0);
+
+ __m256 accum1 = _mm256_setzero_ps();
+ __m256 accum2 = _mm256_setzero_ps();
+ for (int i = 0; i < nb; ++i) {
+
+ const int8_t * q8 = y[i].qs;
+ const uint8_t * qs = x[i].qs;
+ const uint8_t * qh = x[i].qh;
+ const uint16_t * sc = (const uint16_t *)x[i].scales;
+
+ // Gather the top nibble of each of the four scale words into one 16-bit pattern.
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+ // Extract 3-bit scales (16 values)
+ // All four scale words are loaded as one 64-bit value and broadcast; after the per-lane
+ // shift and the 0x7 mask each 16-bit element holds one code, then mapped to 2*v+1.
+ __m256i scales = _mm256_set1_epi64x(*(const uint64_t*)sc);
+ scales = _mm256_srlv_epi64(scales, scales_shift);
+ scales = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scales, mask), 1), mone);
+
+ // Indices to repeat each scale 8 times.
+ // idx1 starts at 16-bit element 0 of each 128-bit half, idx2 at element 4 (byte offset 8).
+ __m256i scales_idx1 = _mm256_set1_epi16(0x0100);
+ __m256i scales_idx2 = _mm256_add_epi8(scales_idx1, _mm256_set1_epi8(8));
+
+ __m256i sumi1 = _mm256_setzero_si256();
+ __m256i sumi2 = _mm256_setzero_si256();
+ // Two 32-value sub-blocks (8 qs bytes, 4 qh bytes) per iteration.
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
+#ifdef __BMI2__
+ // pdep places the four qs bytes in the low byte of each 16-bit field and the low 3 bits
+ // of each qh nibble (mask 0x7777) in bits 8-10, giving four 11-bit grid indices.
+ // NOTE(review): qs/qh are uint8_t pointers read through wider lvalues here (type-punned,
+ // possibly unaligned loads) — confirm this is acceptable for all supported compilers.
+ const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL)
+ | _pdep_u64(*(const uint16_t*)(qh) & 0x7777, 0xf000f000f000f00ULL);
+ const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL)
+ | _pdep_u64(*(const uint16_t*)(qh + 2) & 0x7777, 0xf000f000f000f00ULL);
+ const uint16_t *idx1 = (const uint16_t *)(&packed_idx1);
+ const uint16_t *idx2 = (const uint16_t *)(&packed_idx2);
+ const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]);
+ const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]);
+
+ // Expand each nibble's sign bit (bit 3) to one byte (0x80 or 0x00), sign-extend every
+ // byte to a 64-bit group and OR in 0x01 per byte: a negative group has all bytes negative
+ // (0x81 in the low byte, 0xff above), a positive group is all 0x01. Only the byte signs
+ // matter, since the result is used solely as the control of _mm256_sign_epi8 below.
+ const uint64_t delta_sign = _pdep_u64(*(const uint32_t*)(qh) & 0x88888888, 0xf0f0f0f0f0f0f0f0ULL);
+ const __m256i delta1 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign)));
+ const __m256i delta2 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign >> 32)));
+#else
+ // Scalar index assembly: low nibble of a qh byte via << 8, high nibble via << 4, both
+ // masked to bits 8-10.
+ const __m256i q1b_1 = _mm256_set_epi64x(
+ iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)],
+ iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]
+ );
+ const __m256i q1b_2 = _mm256_set_epi64x(
+ iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)],
+ iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]
+ );
+
+ // Per group of 8 bytes: all 0xff (negative) when the nibble's sign bit is set, else all 0x01.
+ const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+ qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
+ qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+ qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+ const __m256i delta2 = _mm256_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+ qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
+ qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+ qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+#endif
+ const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+ const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+
+ // dot1/dot2: grid values times q8 (mul_add_epi8 is a helper defined elsewhere in this file).
+ // dot3/dot4: q8 with the delta sign applied, multiplied by 1 and pairwise summed to 16 bits.
+ const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
+ const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
+ const __m256i dot3 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_1, delta1));
+ const __m256i dot4 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_2, delta2));
+
+ // Broadcast the current 16-bit scale element within each 128-bit half.
+ __m256i scale1 = _mm256_shuffle_epi8(scales, scales_idx1);
+ __m256i scale2 = _mm256_shuffle_epi8(scales, scales_idx2);
+
+ // Advance both shuffle indices by one 16-bit element (2 bytes) for the next iteration.
+ scales_idx1 = _mm256_add_epi8(scales_idx1, mtwo8);
+ scales_idx2 = _mm256_add_epi8(scales_idx2, mtwo8);
+
+ const __m256i p1 = _mm256_madd_epi16(dot1, scale1);
+ const __m256i p2 = _mm256_madd_epi16(dot2, scale2);
+ const __m256i p3 = _mm256_madd_epi16(dot3, scale1);
+ const __m256i p4 = _mm256_madd_epi16(dot4, scale2);
+
+ sumi1 = _mm256_add_epi32(sumi1, _mm256_add_epi32(p1, p2));
+ sumi2 = _mm256_add_epi32(sumi2, _mm256_add_epi32(p3, p4));
+
+ qs += 8; qh += 4;
+ }
+
+ const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16));
+
+ accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1);
+ accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2);
+ }
+
+ *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
+
+#elif defined __AVX__
+ // Same computation as the AVX2 path with the integer work done on 128-bit halves (_0 / _1).
+ const __m128i mask = _mm_set1_epi16(0x7);
+ const __m128i mone = _mm_set1_epi16(1);
+
+ __m256 accum1 = _mm256_setzero_ps();
+ __m256 accum2 = _mm256_setzero_ps();
+ for (int i = 0; i < nb; ++i) {
+
+ const int8_t * q8 = y[i].qs;
+ const uint8_t * qs = x[i].qs;
+ const uint8_t * qh = x[i].qh;
+ const uint16_t * sc = (const uint16_t *)x[i].scales;
+
+ // Gather the top nibble of each of the four scale words into one 16-bit pattern.
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+
+ __m128i sumi1_0 = _mm_setzero_si128();
+ __m128i sumi1_1 = _mm_setzero_si128();
+ __m128i sumi2_0 = _mm_setzero_si128();
+ __m128i sumi2_1 = _mm_setzero_si128();
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
+ // Each 128-bit vector holds two grid entries (16 values); index = qs byte | 3 qh bits << 8.
+ const __m128i q1b_1_0 = _mm_set_epi64x(
+ iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]);
+ const __m128i q1b_1_1 = _mm_set_epi64x(
+ iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)]);
+ const __m128i q1b_2_0 = _mm_set_epi64x(
+ iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]);
+ const __m128i q1b_2_1 = _mm_set_epi64x(
+ iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)]);
+ const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+
+ // mul_add_epi8_sse is a helper defined elsewhere in this file.
+ const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0);
+ const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1);
+ const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0);
+ const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1);
+
+ // Per group of 8 bytes: all 0xff when the nibble's sign bit is set, else all 0x01.
+ // Unlike the AVX2 path, these vectors go through mul_add_epi8_sse as an operand.
+ const __m128i delta1_0 = _mm_set_epi64x(qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+ qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+ const __m128i delta1_1 = _mm_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+ qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+ const __m128i delta2_0 = _mm_set_epi64x(qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+ qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+ const __m128i delta2_1 = _mm_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+ qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+
+ const __m128i dot3_0 = mul_add_epi8_sse(delta1_0, q8b_1_0);
+ const __m128i dot3_1 = mul_add_epi8_sse(delta1_1, q8b_1_1);
+ const __m128i dot4_0 = mul_add_epi8_sse(delta2_0, q8b_2_0);
+ const __m128i dot4_1 = mul_add_epi8_sse(delta2_1, q8b_2_1);
+
+ // One scale word per iteration: four 3-bit codes at shifts 0, 3, 6, 9, one per 16 values.
+ __m128i scale1_0 = _mm_set1_epi16(sc[ib/2] >> 0);
+ __m128i scale1_1 = _mm_set1_epi16(sc[ib/2] >> 3);
+ __m128i scale2_0 = _mm_set1_epi16(sc[ib/2] >> 6);
+ __m128i scale2_1 = _mm_set1_epi16(sc[ib/2] >> 9);
+
+ // Mask to 3 bits and map code v to the odd weight 2*v+1.
+ scale1_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_0, mask), 1), mone);
+ scale1_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_1, mask), 1), mone);
+ scale2_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_0, mask), 1), mone);
+ scale2_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_1, mask), 1), mone);
+ const __m128i p1_0 = _mm_madd_epi16(dot1_0, scale1_0);
+ const __m128i p1_1 = _mm_madd_epi16(dot1_1, scale1_1);
+ const __m128i p2_0 = _mm_madd_epi16(dot2_0, scale2_0);
+ const __m128i p2_1 = _mm_madd_epi16(dot2_1, scale2_1);
+ const __m128i p3_0 = _mm_madd_epi16(dot3_0, scale1_0);
+ const __m128i p3_1 = _mm_madd_epi16(dot3_1, scale1_1);
+ const __m128i p4_0 = _mm_madd_epi16(dot4_0, scale2_0);
+ const __m128i p4_1 = _mm_madd_epi16(dot4_1, scale2_1);
+
+ sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0));
+ sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1));
+ sumi2_0 = _mm_add_epi32(sumi2_0, _mm_add_epi32(p3_0, p4_0));
+ sumi2_1 = _mm_add_epi32(sumi2_1, _mm_add_epi32(p3_1, p4_1));
+
+ qs += 8; qh += 4;
+ }
+
+ const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16));
+
+ // Recombine the 128-bit integer halves into 256-bit vectors before converting to float.
+ accum1 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum1);
+ accum2 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi2_1, sumi2_0))), accum2);
+ }
+
+ *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
+
+#else
+ // No SIMD path compiled in: silence unused-variable warnings and use the generic kernel.
+ UNUSED(x);
+ UNUSED(y);
+ UNUSED(nb);
+ UNUSED(scale);
+ ggml_vec_dot_iq1_m_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+
+// Dot product of an IQ4_NL-quantized row (vx) with a Q8_0-quantized row (vy); writes one float to *s.
+//
+// n must be a multiple of QK4_NL and nrc must be 1 (both asserted); bs, bx and by are unused.
+// Each packed byte of x carries two 4-bit codes that are mapped through the kvalues_iq4nl table:
+// the low nibbles pair with the first QK4_NL/2 values of the matching y block, the high nibbles
+// with the second half.
+//
+// With AVX2 or AVX, blocks are consumed two at a time; a scalar loop handles whatever is left
+// (a trailing odd block, or every block when no SIMD path is compiled in).
+void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    assert(n % QK4_NL == 0);
+    static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");
+
+    const block_iq4_nl * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
+
+    const int nb = n / QK4_NL;
+
+    // ib is shared between the SIMD loop and the scalar tail so the tail resumes where SIMD stopped.
+    int ib = 0;
+    float sumf = 0;
+
+#if defined __AVX2__
+
+    // The 16-entry table fits in one register, so pshufb doubles as the nibble -> value lookup.
+    const __m128i codebook   = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
+    const __m128i nibble_msk = _mm_set1_epi8(0x0f);
+    const __m256i ones16     = _mm256_set1_epi16(1);
+
+    // One float accumulator per block of the pair.
+    __m256 acc_a = _mm256_setzero_ps();
+    __m256 acc_b = _mm256_setzero_ps();
+    while (ib + 1 < nb) {
+        const block_iq4_nl * xa = &x[ib + 0];
+        const block_iq4_nl * xb = &x[ib + 1];
+        const block_q8_0   * ya = &y[ib + 0];
+        const block_q8_0   * yb = &y[ib + 1];
+
+        const __m128i packed_a = _mm_loadu_si128((const __m128i*)xa->qs);
+        const __m128i packed_b = _mm_loadu_si128((const __m128i*)xb->qs);
+        const __m256i act_a    = _mm256_loadu_si256((const __m256i *)ya->qs);
+        const __m256i act_b    = _mm256_loadu_si256((const __m256i *)yb->qs);
+
+        // Decode: low nibbles fill the lower 128 bits, high nibbles the upper 128 bits.
+        const __m128i lo_a = _mm_shuffle_epi8(codebook, _mm_and_si128(packed_a, nibble_msk));
+        const __m128i hi_a = _mm_shuffle_epi8(codebook, _mm_and_si128(_mm_srli_epi16(packed_a, 4), nibble_msk));
+        const __m128i lo_b = _mm_shuffle_epi8(codebook, _mm_and_si128(packed_b, nibble_msk));
+        const __m128i hi_b = _mm_shuffle_epi8(codebook, _mm_and_si128(_mm_srli_epi16(packed_b, 4), nibble_msk));
+        const __m256i w_a  = MM256_SET_M128I(hi_a, lo_a);
+        const __m256i w_b  = MM256_SET_M128I(hi_b, lo_b);
+
+        // 8-bit products -> 16-bit pair sums -> 32-bit sums.
+        const __m256i sum_a = _mm256_madd_epi16(mul_add_epi8(w_a, act_a), ones16);
+        const __m256i sum_b = _mm256_madd_epi16(mul_add_epi8(w_b, act_b), ones16);
+
+        const float d_a = GGML_CPU_FP16_TO_FP32(ya->d)*GGML_CPU_FP16_TO_FP32(xa->d);
+        const float d_b = GGML_CPU_FP16_TO_FP32(yb->d)*GGML_CPU_FP16_TO_FP32(xb->d);
+        acc_a = _mm256_fmadd_ps(_mm256_set1_ps(d_a), _mm256_cvtepi32_ps(sum_a), acc_a);
+        acc_b = _mm256_fmadd_ps(_mm256_set1_ps(d_b), _mm256_cvtepi32_ps(sum_b), acc_b);
+
+        ib += 2;
+    }
+
+    sumf = hsum_float_8(_mm256_add_ps(acc_a, acc_b));
+
+#elif defined __AVX__
+    const __m128i codebook   = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
+    const __m128i nibble_msk = _mm_set1_epi8(0x0f);
+
+    __m256 acc = _mm256_setzero_ps();
+    while (ib + 1 < nb) {
+        const block_iq4_nl * xa = &x[ib + 0];
+        const block_iq4_nl * xb = &x[ib + 1];
+        const block_q8_0   * ya = &y[ib + 0];
+        const block_q8_0   * yb = &y[ib + 1];
+
+        const __m128i packed_a = _mm_loadu_si128((const __m128i *)xa->qs);
+        const __m128i packed_b = _mm_loadu_si128((const __m128i *)xb->qs);
+
+        // Each 32-byte activation block is handled as two 16-byte halves.
+        const __m128i act_a_lo = _mm_loadu_si128((const __m128i *)ya->qs);
+        const __m128i act_a_hi = _mm_loadu_si128((const __m128i *)ya->qs + 1);
+        const __m128i act_b_lo = _mm_loadu_si128((const __m128i *)yb->qs);
+        const __m128i act_b_hi = _mm_loadu_si128((const __m128i *)yb->qs + 1);
+
+        const __m128i w_a_lo = _mm_shuffle_epi8(codebook, _mm_and_si128(packed_a, nibble_msk));
+        const __m128i w_a_hi = _mm_shuffle_epi8(codebook, _mm_and_si128(_mm_srli_epi16(packed_a, 4), nibble_msk));
+        const __m128i w_b_lo = _mm_shuffle_epi8(codebook, _mm_and_si128(packed_b, nibble_msk));
+        const __m128i w_b_hi = _mm_shuffle_epi8(codebook, _mm_and_si128(_mm_srli_epi16(packed_b, 4), nibble_msk));
+
+        // Helpers defined elsewhere in this file produce the per-lane float products and the
+        // matching per-lane scale factors for the two blocks.
+        const __m256 prod   = mul_sum_i8_quad_float(w_a_lo, w_a_hi, w_b_lo, w_b_hi, act_a_lo, act_a_hi, act_b_lo, act_b_hi);
+        const __m256 scales = quad_fp16_delta_float(xa->d, ya->d, xb->d, yb->d);
+        acc = _mm256_add_ps(_mm256_mul_ps(scales, prod), acc);
+
+        ib += 2;
+    }
+
+    sumf = hsum_float_8(acc);
+
+#endif
+    // Scalar tail: finishes the remaining blocks one at a time.
+    while (ib < nb) {
+        const block_iq4_nl * xb = &x[ib];
+        const block_q8_0   * yb = &y[ib];
+
+        int lo_sum = 0;
+        int hi_sum = 0;
+        for (int j = 0; j < QK4_NL/2; ++j) {
+            lo_sum += yb->qs[j+       0] * kvalues_iq4nl[xb->qs[j] & 0xf];
+            hi_sum += yb->qs[j+QK4_NL/2] * kvalues_iq4nl[xb->qs[j] >>  4];
+        }
+
+        const float d = GGML_CPU_FP16_TO_FP32(yb->d)*GGML_CPU_FP16_TO_FP32(xb->d);
+        sumf += d * (lo_sum + hi_sum);
+        ++ib;
+    }
+    *s = sumf;
+}
+
+void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { // dot product over n values: vx is read as block_iq4_xs, vy as block_q8_K, and the scalar result is stored in *s
+ assert(nrc == 1); // this implementation requires nrc == 1
+ UNUSED(nrc);
+ UNUSED(bx); // bs/bx/by are not read by the SIMD paths; they are only forwarded to the generic fallback
+ UNUSED(by);
+ UNUSED(bs);
+ assert(n % QK_K == 0); // n must be a whole number of QK_K-sized super-blocks
+ // reinterpret the untyped inputs as arrays of quantized super-blocks
+ const block_iq4_xs * GGML_RESTRICT x = vx;
+ const block_q8_K * GGML_RESTRICT y = vy;
+
+ const int nb = n / QK_K; // number of super-blocks to process
+
+#if defined __AVX2__
+ // AVX2 path: 256-bit integer math, two 32-value sub-blocks per inner iteration
+ const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl); // 16 table bytes, used below as a byte-shuffle lookup indexed by each 4-bit code
+ const __m128i m4b = _mm_set1_epi8(0x0f); // mask that keeps the low nibble of every byte
+
+ __m256 accum = _mm256_setzero_ps(); // float accumulator across all super-blocks
+ for (int ibl = 0; ibl < nb; ++ibl) {
+ const uint8_t * qs = x[ibl].qs; // packed 4-bit codes, advanced 16 bytes per sub-block
+ const int8_t * q8 = y[ibl].qs; // int8 activations, advanced 32 values per sub-block
+ uint16_t sh = x[ibl].scales_h; // high scale bits, consumed 4 bits (two sub-blocks) per iteration
+ __m256i sumi1 = _mm256_setzero_si256();
+ __m256i sumi2 = _mm256_setzero_si256();
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
+ const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)qs); qs += 16; // 32 codes of the first sub-block
+ const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs); qs += 16; // 32 codes of the second sub-block
+ const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+ const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
+ const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
+ _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b))); // decoded low nibbles land in the lower 128 bits, high nibbles in the upper 128 bits, lining up with q8 values 0..15 and 16..31
+ const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
+ _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
+ const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1); // NOTE(review): mul_add_epi8 is defined elsewhere; presumably int8 products summed pairwise into int16 lanes - confirm
+ const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
+ const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32; // 6-bit scale of the first sub-block: low nibble of scales_l[ib/2] plus bits 0-1 of sh as bits 4-5, then offset by -32
+ const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32; // same for the second sub-block: high nibble of scales_l[ib/2] plus bits 2-3 of sh
+ sh >>= 4; // drop the four high-scale bits just consumed
+ const __m256i p_1 = _mm256_madd_epi16(p16_1, _mm256_set1_epi16(ls1)); // apply the sub-block scale while widening to 32-bit sums
+ const __m256i p_2 = _mm256_madd_epi16(p16_2, _mm256_set1_epi16(ls2));
+ sumi1 = _mm256_add_epi32(p_1, sumi1);
+ sumi2 = _mm256_add_epi32(p_2, sumi2);
+ }
+ accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
+ _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accum); // weight the integer sums by the product of both super-block scales (x's d is converted from FP16) and accumulate
+ }
+ // NOTE(review): hsum_float_8 is defined elsewhere; presumably a horizontal sum of the 8 float lanes - confirm
+ *s = hsum_float_8(accum);
+ // AVX (without AVX2) path below: same algorithm, with the integer work done in 128-bit registers
+#elif defined __AVX__
+ const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl); // 16 table bytes, used below as a byte-shuffle lookup indexed by each 4-bit code
+ const __m128i m4b = _mm_set1_epi8(0x0f); // mask that keeps the low nibble of every byte
+
+ __m256 accum = _mm256_setzero_ps(); // float accumulator across all super-blocks
+ for (int ibl = 0; ibl < nb; ++ibl) {
+ const uint8_t * qs = x[ibl].qs; // packed 4-bit codes, advanced 16 bytes per sub-block
+ const int8_t * q8 = y[ibl].qs; // int8 activations, advanced 16 values per load
+ uint16_t sh = x[ibl].scales_h; // high scale bits, consumed 4 bits (two sub-blocks) per iteration
+ __m128i sumi1_0 = _mm_setzero_si128(); // sumiN_0 / sumiN_1: sums for the low-nibble / high-nibble half of sub-block N
+ __m128i sumi1_1 = _mm_setzero_si128();
+ __m128i sumi2_0 = _mm_setzero_si128();
+ __m128i sumi2_1 = _mm_setzero_si128();
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
+ const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)qs); qs += 16; // 32 codes of the first sub-block
+ const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)qs); qs += 16; // 32 codes of the second sub-block
+ const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16; // four consecutive 16-value loads: two halves for each of the two sub-blocks
+ const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)); // low nibbles decoded through the table; paired with the first q8 half
+ const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)); // high nibbles decoded through the table; paired with the second q8 half
+ const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
+ const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
+ const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0); // NOTE(review): mul_add_epi8_sse is defined elsewhere; presumably int8 products summed pairwise into int16 lanes - confirm
+ const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
+ const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
+ const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
+ const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32; // 6-bit scale of the first sub-block: low nibble of scales_l[ib/2] plus bits 0-1 of sh as bits 4-5, then offset by -32
+ const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32; // same for the second sub-block: high nibble of scales_l[ib/2] plus bits 2-3 of sh
+ sh >>= 4; // drop the four high-scale bits just consumed
+ const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, _mm_set1_epi16(ls1)); // apply the sub-block scale while widening to 32-bit sums
+ const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, _mm_set1_epi16(ls1));
+ const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, _mm_set1_epi16(ls2));
+ const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, _mm_set1_epi16(ls2));
+ sumi1_0 = _mm_add_epi32(p_1_0, sumi1_0);
+ sumi1_1 = _mm_add_epi32(p_1_1, sumi1_1);
+ sumi2_0 = _mm_add_epi32(p_2_0, sumi2_0);
+ sumi2_1 = _mm_add_epi32(p_2_1, sumi2_1);
+ }
+ __m128i sumi12_0 = _mm_add_epi32(sumi1_0, sumi2_0); // merge both sub-block streams, keeping the two halves separate
+ __m128i sumi12_1 = _mm_add_epi32(sumi1_1, sumi2_1);
+ accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
+ _mm256_cvtepi32_ps(MM256_SET_M128I(sumi12_1, sumi12_0))), accum); // join the halves into one 256-bit vector, convert to float, weight by both super-block scales; separate multiply and add, no fused multiply-add in this branch
+ }
+ // NOTE(review): hsum_float_8 is defined elsewhere; presumably a horizontal sum of the 8 float lanes - confirm
+ *s = hsum_float_8(accum);
+ // no AVX available: hand the whole computation to the generic implementation
+#else
+ UNUSED(x); // the locals set up above are not used in this branch
+ UNUSED(y);
+ UNUSED(nb);
+ ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}