Diffstat (limited to 'llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h')
 -rw-r--r--  llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h  176
 1 file changed, 176 insertions(+), 0 deletions(-)
diff --git a/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h b/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h
new file mode 100644
index 0000000..49f3efa
--- /dev/null
+++ b/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h
@@ -0,0 +1,176 @@
#ifndef HVX_INVERSE_H
#define HVX_INVERSE_H

#include <HAP_farf.h>

#include <math.h>
#include <string.h>
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include "hvx-base.h"

// ====================================================
// FUNCTION: 1/(x+1)  y(0) = 1, y(0.5) = 0.6667, y(1) = 0.5
// Order: 3; continuity: True; Ends forced: True
// Mode: unsigned; Result fractional bits: 14
// Peak Error: 1.1295e-04  Rms Error: 2.8410e-05  Mean Error: 1.1370e-05
//   32769  -32706  31252  -10589
//   32590  -30635  22793   -4493
//   32066  -27505  16481   -2348
//   31205  -24054  11849   -1306

static inline HVX_Vector hvx_vec_recip_xp1_O3_unsigned(HVX_Vector vx) {
    // input is 0..0xffff representing 0.0 .. 1.0
    HVX_Vector p;
    p = Q6_Vh_vlut4_VuhPh(vx, 0xFAE6F6D4EE73D6A3ull);
    p = Q6_Vh_vmpa_VhVhVuhPuh_sat(p, vx, 0x2E49406159097A14ull);
    p = Q6_Vh_vmps_VhVhVuhPuh_sat(p, vx, 0x5DF66B7177AB7FC2ull);
    p = Q6_Vh_vmpa_VhVhVuhPuh_sat(p, vx, 0x79E57D427F4E8001ull);
    return p;  // signed result, 14 fractional bits
}
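
// Illustrative note on the chain above (a sketch of the mechanism, with the
// exact fixed-point scaling of the v68 ops elided): Q6_Vh_vlut4_VuhPh picks a
// per-lane 16-bit coefficient from the 64-bit immediate using the top bits of
// vx, so each lane gets the coefficients of its own segment; each vmpa/vmps
// step then multiplies by vx and adds (or subtracts, for the negative column)
// the next packed coefficient -- a Horner evaluation of a piecewise cubic.
// The four immediates pack the four columns of the coefficient table above,
// one row per segment:
//
//     y(x) ~= ((c3*x + c2)*x + c1)*x + c0 ~= 1/(1 + x),  x in [0,1)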

// Find the reciprocal of fp16.
// (1) first, convert to fp32, multiplying by 1.0; this is done to
//     handle denormals. Ignoring sign and zero, the result should be at
//     least 5.9604645e-08 (32-bit code 0x33800000) and at most 131008 (0x47ffe000)
//     (exponent in range [103,143])
// (2) extract the mantissa into 16-bit unsigned; find the reciprocal using a fitted poly
// (3) put this, along with '253-exp' (exp from (1)), together to make a qf32
// (4) convert that to fp16
// (5) put the sign back in. Also, if the original value (w/o sign) was < 0x81, replace
//     the result with the max value.
static inline HVX_Vector hvx_vec_inverse_f16(HVX_Vector vals) {
    HVX_Vector em_mask = Q6_Vh_vsplat_R(0x7FFF);
    HVX_Vector avals   = Q6_V_vand_VV(vals, em_mask);
    HVX_VectorPred is_neg = Q6_Q_vcmp_gt_VhVh(avals, vals);
    // too small to take 1/x of? for 'standard' fp16 the threshold would be 0x101
    HVX_VectorPred is_small = Q6_Q_vcmp_gt_VhVh(Q6_Vh_vsplat_R(0x101), avals);

    HVX_VectorPair to_qf32 = Q6_Wqf32_vmpy_VhfVhf(avals, Q6_Vh_vsplat_R(0x3C00));  // * 1.0
    HVX_Vector to_f32_0 = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(to_qf32));
    HVX_Vector to_f32_1 = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(to_qf32));

    // bits 22..13 contain the mantissa now (w/o hidden bit); move it to bits 15..6 of a 16-bit vector
    HVX_Vector mant_u16 = Q6_Vh_vshuffo_VhVh(Q6_Vw_vasl_VwR(to_f32_1, 9), Q6_Vw_vasl_VwR(to_f32_0, 9));
    // likewise extract the upper 16 bits of each word, containing the exponents in range 103..142
    HVX_Vector exp_u16 = Q6_Vh_vshuffo_VhVh(to_f32_1, to_f32_0);
    // get the exponent of the IEEE 32-bit representation
    exp_u16 = Q6_Vuh_vlsr_VuhR(exp_u16, 7);

    // so, mant_u16 contains an unbiased mantissa in the upper 10 bits of each u16 lane.
    // We can consider it to be x-1.0, with 16 fractional bits, where 'x' is in range [1.0,2.0).
    // Use the poly to transform it to 1/x, with 14 fractional bits.
    HVX_Vector rm = hvx_vec_recip_xp1_O3_unsigned(mant_u16);

    HVX_Vector vcl0 = Q6_Vuh_vcl0_Vuh(rm);  // count leading zeros

    // get the mantissa for the 16-bit representation
    HVX_Vector mant_recip = Q6_V_vand_VV(Q6_Vh_vasr_VhR(Q6_Vh_vasl_VhVh(rm, vcl0), 5), Q6_Vh_vsplat_R(0x03FF));

    // compute the reciprocal's exponent
    HVX_Vector exp_recip =
        Q6_Vh_vsub_VhVh(Q6_Vh_vsub_VhVh(Q6_Vh_vsplat_R(254), exp_u16), Q6_Vh_vsub_VhVh(vcl0, Q6_Vh_vsplat_R(1)));
    // convert it to the 16-bit representation (rebias from 127 to 15)
    exp_recip = Q6_Vh_vadd_VhVh_sat(Q6_Vh_vsub_VhVh(exp_recip, Q6_Vh_vsplat_R(127)), Q6_Vh_vsplat_R(15));
    exp_recip = Q6_Vh_vasl_VhR(exp_recip, 10);

    // merge the exponent and mantissa of the reciprocal
    HVX_Vector recip = Q6_V_vor_VV(exp_recip, mant_recip);
    // map 'small' inputs to the standard largest value 0x7bff
    recip = Q6_V_vmux_QVV(is_small, Q6_Vh_vsplat_R(0x7bff), recip);
    // add the sign back
    recip = Q6_V_vandor_VQR(recip, is_neg, 0x80008000);
    return recip;
}
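
// Worked example (single lane, approximate): vals = 1.5 (fp16 0x3E00).
// As f32 this is 0x3FC00000, so mant_u16 = 0x8000 (x-1.0 = 0.5) and
// exp_u16 = 127. The poly then gives rm ~= 1/1.5 in Q14 (~0x2AAB), so
// vcl0 = 2 and mant_recip = 0x155, and exp_recip = (254-127) - (2-1) = 126,
// which rebiases to 126-127+15 = 14. Result: 0x3800 | 0x155 = 0x3955
// ~= 0.6665 ~= 1/1.5.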

static inline HVX_Vector hvx_vec_inverse_f32(HVX_Vector v_sf) {
    HVX_Vector inv_aprox_sf = Q6_V_vsplat_R(0x7EEEEBB3);
    HVX_Vector two_sf       = hvx_vec_splat_f32(2.0);

    // first approximation: subtract the bit pattern from a magic constant
    HVX_Vector i_sf = Q6_Vw_vsub_VwVw(inv_aprox_sf, v_sf);

    HVX_Vector r_qf;

    // refine with three Newton-Raphson steps: r = r * (2 - r * x)
    r_qf = Q6_Vqf32_vmpy_VsfVsf(
        i_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(i_sf, v_sf)))));
    r_qf = Q6_Vqf32_vmpy_Vqf32Vqf32(
        r_qf, Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(r_qf), v_sf))));
    r_qf = Q6_Vqf32_vmpy_Vqf32Vqf32(
        r_qf, Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(r_qf), v_sf))));

    return Q6_Vsf_equals_Vqf32(r_qf);
}
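
// Reference: a scalar model of the same scheme -- an illustrative sketch,
// not used by the HVX path above. Subtracting the input's bit pattern from
// the magic constant 0x7EEEEBB3 approximately negates the exponent, yielding
// a crude seed for 1/x; each Newton-Raphson step r = r*(2 - r*x) roughly
// doubles the number of correct bits, so three steps reach near-full
// single precision.
static inline float hvx_inverse_f32_scalar_ref(float x) {
    union { float f; uint32_t u; } v = { .f = x };
    v.u = 0x7EEEEBB3u - v.u;   // crude seed for 1/x
    float r = v.f;
    r = r * (2.0f - r * x);    // Newton-Raphson step 1
    r = r * (2.0f - r * x);    // step 2
    r = r * (2.0f - r * x);    // step 3
    return r;
}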

static inline HVX_Vector hvx_vec_inverse_f32_guard(HVX_Vector v_sf, HVX_Vector nan_inf_mask) {
    HVX_Vector out = hvx_vec_inverse_f32(v_sf);

    // if all exponent bits of the result are set, it is Inf/NaN; squash it to zero
    HVX_Vector masked_out = Q6_V_vand_VV(out, nan_inf_mask);
    const HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(nan_inf_mask, masked_out);

    return Q6_V_vmux_QVV(pred, Q6_V_vzero(), out);
}

#define hvx_inverse_f32_loop_body(dst_type, src_type, vec_store)              \
    do {                                                                      \
        dst_type * restrict vdst = (dst_type *) dst;                          \
        src_type * restrict vsrc = (src_type *) src;                          \
                                                                              \
        const HVX_Vector nan_inf_mask = Q6_V_vsplat_R(0x7f800000);            \
                                                                              \
        const uint32_t nvec = n / VLEN_FP32; /* full vectors */               \
        const uint32_t nloe = n % VLEN_FP32; /* leftover elements */          \
                                                                              \
        uint32_t i = 0;                                                       \
                                                                              \
        _Pragma("unroll(4)")                                                  \
        for (; i < nvec; i++) {                                               \
            vdst[i] = hvx_vec_inverse_f32_guard(vsrc[i], nan_inf_mask);       \
        }                                                                     \
        if (nloe) {                                                           \
            HVX_Vector v = hvx_vec_inverse_f32_guard(vsrc[i], nan_inf_mask);  \
            vec_store((void *) &vdst[i], nloe * SIZEOF_FP32, v);              \
        }                                                                     \
    } while (0)

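// Alignment variants: 'a' = 128-byte aligned, 'u' = potentially unaligned,
// named for dst and src in that order (e.g. _au = aligned dst, unaligned src).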
static inline void hvx_inverse_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    assert((unsigned long) dst % 128 == 0);
    assert((unsigned long) src % 128 == 0);
    hvx_inverse_f32_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a);
}

static inline void hvx_inverse_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    assert((unsigned long) dst % 128 == 0);
    hvx_inverse_f32_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a);
}

static inline void hvx_inverse_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    assert((unsigned long) src % 128 == 0);
    hvx_inverse_f32_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u);
}

static inline void hvx_inverse_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    hvx_inverse_f32_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u);
}

static inline void hvx_inverse_f32(uint8_t * restrict dst, uint8_t * restrict src, const int num_elems) {
    if ((unsigned long) dst % 128 == 0) {
        if ((unsigned long) src % 128 == 0) {
            hvx_inverse_f32_aa(dst, src, num_elems);
        } else {
            hvx_inverse_f32_au(dst, src, num_elems);
        }
    } else {
        if ((unsigned long) src % 128 == 0) {
            hvx_inverse_f32_ua(dst, src, num_elems);
        } else {
            hvx_inverse_f32_uu(dst, src, num_elems);
        }
    }
}
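
// Usage sketch (hypothetical buffers, shown for illustration only):
//
//   float src[256] __attribute__((aligned(128)));
//   float dst[256] __attribute__((aligned(128)));
//   hvx_inverse_f32((uint8_t *) dst, (uint8_t *) src, 256);
//
// The dispatcher checks both pointers and picks the matching
// aligned/unaligned variant, so unaligned buffers also work (via _uu et al.).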

#endif // HVX_INVERSE_H
