Diffstat (limited to 'llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h')
-rw-r--r-- llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h | 176
1 file changed, 176 insertions(+), 0 deletions(-)
diff --git a/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h b/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h
new file mode 100644
index 0000000..49f3efa
--- /dev/null
+++ b/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h
@@ -0,0 +1,176 @@
#ifndef HVX_INVERSE_H
#define HVX_INVERSE_H

#include <HAP_farf.h>

#include <math.h>
#include <string.h>
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include "hvx-base.h"

// ====================================================
// FUNCTION: 1/(x+1); y(0) = 1, y(0.5) = 0.6667, y(1) = 0.5
// Order: 3; continuity: True; Ends forced: True
// Mode: unsigned; Result fractional bits: 14
// Peak Error: 1.1295e-04 Rms Error: 2.8410e-05 Mean Error: 1.1370e-05
// Coefficients per segment:
// 32769 -32706 31252 -10589
// 32590 -30635 22793 -4493
// 32066 -27505 16481 -2348
// 31205 -24054 11849 -1306

static inline HVX_Vector hvx_vec_recip_xp1_O3_unsigned(HVX_Vector vx) {
    // input is 0..0xffff representing 0.0 .. 1.0
    // vlut4 selects the per-segment cubic coefficient from the top two bits
    // of each lane; the vmpa/vmps chain then evaluates the remaining terms
    // Horner-style against the packed per-segment constants.
    HVX_Vector p;
    p = Q6_Vh_vlut4_VuhPh(vx, 0xFAE6F6D4EE73D6A3ull);
    p = Q6_Vh_vmpa_VhVhVuhPuh_sat(p, vx, 0x2E49406159097A14ull);
    p = Q6_Vh_vmps_VhVhVuhPuh_sat(p, vx, 0x5DF66B7177AB7FC2ull);
    p = Q6_Vh_vmpa_VhVhVuhPuh_sat(p, vx, 0x79E57D427F4E8001ull);
    return p;  // signed result, 14 fractional bits
}
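
// Hedged scalar reference for the polynomial above (hvx_vec_recip_xp1_ref is
// an assumed name for illustration, not used by the kernels below): the ideal
// function being approximated, handy for reproducing the error figures quoted
// in the table.
static inline float hvx_vec_recip_xp1_ref(uint16_t x) {
    float u = (float) x / 65536.0f;  // map 0..0xffff onto [0.0, 1.0)
    return 1.0f / (1.0f + u);        // the HVX poly returns ~this in Q14
}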

// Find the reciprocal of fp16.
// (1) First, convert to fp32 by multiplying by 1.0; this is done to
//     handle denormals. Ignoring sign and zero, the result should be at
//     least 5.9604645e-08 (32-bit code 0x33800000) and at most 131008 (0x47ffe000)
//     (exponent in range [103,143]).
// (2) Extract the mantissa into a 16-bit unsigned value; find its reciprocal
//     using a fitted polynomial.
// (3) Put this together with '253-exp' (exp from step (1)) to make a qf32.
// (4) Convert that to fp16.
// (5) Put the sign back in. Also, if the original value (w/o sign) was < 0x81,
//     replace the result with the max value.
static inline HVX_Vector hvx_vec_inverse_f16(HVX_Vector vals) {
    HVX_Vector em_mask = Q6_Vh_vsplat_R(0x7FFF);
    HVX_Vector avals = Q6_V_vand_VV(vals, em_mask);
    HVX_VectorPred is_neg = Q6_Q_vcmp_gt_VhVh(avals, vals);
    // too small to take 1/x of? for 'standard' fp16 the threshold is 0x101
    HVX_VectorPred is_small = Q6_Q_vcmp_gt_VhVh(Q6_Vh_vsplat_R(0x101), avals);

    HVX_VectorPair to_qf32 = Q6_Wqf32_vmpy_VhfVhf(avals, Q6_Vh_vsplat_R(0x3C00)); // *1.0
    HVX_Vector to_f32_0 = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(to_qf32));
    HVX_Vector to_f32_1 = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(to_qf32));

    // bits 22..13 contain the mantissa now (w/o hidden bit); move to bits 14..5 of a 16-bit vector
    HVX_Vector mant_u16 = Q6_Vh_vshuffo_VhVh(Q6_Vw_vasl_VwR(to_f32_1, 9), Q6_Vw_vasl_VwR(to_f32_0, 9));
    // likewise extract the upper 16 bits from each, containing the exponents in range 103..142
    HVX_Vector exp_u16 = Q6_Vh_vshuffo_VhVh(to_f32_1, to_f32_0);
    // get the exponent of the IEEE 32-bit representation
    exp_u16 = Q6_Vuh_vlsr_VuhR(exp_u16, 7);

    // so, mant_u16 contains an unbiased mantissa in the upper 10 bits of each u16 lane.
    // We can consider it to be x-1.0, with 16 fractional bits, where 'x' is in range [1.0,2.0).
    // Use the poly to transform to 1/x, with 14 fractional bits.
    //
    HVX_Vector rm = hvx_vec_recip_xp1_O3_unsigned(mant_u16);

    HVX_Vector vcl0 = Q6_Vuh_vcl0_Vuh(rm);  // count leading zeros

    // get the mantissa for the 16-bit representation
    HVX_Vector mant_recip = Q6_V_vand_VV(Q6_Vh_vasr_VhR(Q6_Vh_vasl_VhVh(rm, vcl0), 5), Q6_Vh_vsplat_R(0x03FF));

    // compute the reciprocal's exponent
    HVX_Vector exp_recip =
        Q6_Vh_vsub_VhVh(Q6_Vh_vsub_VhVh(Q6_Vh_vsplat_R(254), exp_u16), Q6_Vh_vsub_VhVh(vcl0, Q6_Vh_vsplat_R(1)));
    // rebias it for the 16-bit representation
    exp_recip = Q6_Vh_vadd_VhVh_sat(Q6_Vh_vsub_VhVh(exp_recip, Q6_Vh_vsplat_R(127)), Q6_Vh_vsplat_R(15));
    exp_recip = Q6_Vh_vasl_VhR(exp_recip, 10);

    // merge exponent and mantissa for the reciprocal
    HVX_Vector recip = Q6_V_vor_VV(exp_recip, mant_recip);
    // map 'small' inputs to the standard largest value 0x7bff
    recip = Q6_V_vmux_QVV(is_small, Q6_Vh_vsplat_R(0x7bff), recip);
    // add the sign back
    recip = Q6_V_vandor_VQR(recip, is_neg, 0x80008000);
    return recip;
}
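
// Hedged scalar reference for the routine above (illustrative sketch only;
// hvx_inverse_f16_ref is an assumed name, not used by the kernels below).
// Denormal results are flushed to zero for brevity, and Inf/NaN inputs are
// not handled specially.
static inline uint16_t hvx_inverse_f16_ref(uint16_t h) {
    uint16_t sign = h & 0x8000;
    uint16_t mag  = h & 0x7FFF;
    if (mag < 0x101) {
        return sign | 0x7bff;  // too small to invert: clamp to the max value
    }
    int   e = mag >> 10;
    int   m = mag & 0x3FF;
    float x = e ? ldexpf((float) (0x400 | m), e - 25)  // normal: 1.m * 2^(e-15)
                : ldexpf((float) m, -24);              // denormal input
    int   re;
    float rm = frexpf(1.0f / x, &re);  // 1/x = rm * 2^re, rm in [0.5, 1)
    int   exp16 = re - 1 + 15;         // rebias for fp16
    if (exp16 <= 0) {
        return sign;  // result would be fp16-denormal: flush to zero
    }
    int man16 = (int) ((rm * 2.0f - 1.0f) * 1024.0f);  // 10-bit mantissa, truncated
    return sign | (uint16_t) ((exp16 << 10) | man16);
}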

static inline HVX_Vector hvx_vec_inverse_f32(HVX_Vector v_sf) {
    HVX_Vector inv_approx_sf = Q6_V_vsplat_R(0x7EEEEBB3);
    HVX_Vector two_sf = hvx_vec_splat_f32(2.0);

    // first approximation: subtract the bit pattern from a magic constant
    HVX_Vector i_sf = Q6_Vw_vsub_VwVw(inv_approx_sf, v_sf);

    HVX_Vector r_qf;

    // refine with three Newton-Raphson steps: r' = r * (2 - r * v)
    r_qf = Q6_Vqf32_vmpy_VsfVsf(
        i_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(i_sf, v_sf)))));
    r_qf = Q6_Vqf32_vmpy_Vqf32Vqf32(
        r_qf, Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(r_qf), v_sf))));
    r_qf = Q6_Vqf32_vmpy_Vqf32Vqf32(
        r_qf, Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(r_qf), v_sf))));

    return Q6_Vsf_equals_Vqf32(r_qf);
}
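
// Hedged scalar model of the scheme above (hvx_inverse_f32_ref is an assumed
// name for illustration): a magic-constant seed i = 0x7EEEEBB3 - bits(v),
// refined with three Newton-Raphson steps r' = r * (2 - r * v), each of
// which roughly doubles the number of correct bits.
static inline float hvx_inverse_f32_ref(float v) {
    union { float f; uint32_t u; } r = { .f = v };
    r.u = 0x7EEEEBB3u - r.u;  // initial approximation of 1/v
    for (int it = 0; it < 3; it++) {
        r.f = r.f * (2.0f - r.f * v);
    }
    return r.f;
}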

// Guard against Inf/NaN results: if the computed reciprocal has an all-ones
// exponent (its bits match nan_inf_mask), replace it with zero.
static inline HVX_Vector hvx_vec_inverse_f32_guard(HVX_Vector v_sf, HVX_Vector nan_inf_mask) {
    HVX_Vector out = hvx_vec_inverse_f32(v_sf);

    HVX_Vector masked_out = Q6_V_vand_VV(out, nan_inf_mask);
    const HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(nan_inf_mask, masked_out);

    return Q6_V_vmux_QVV(pred, Q6_V_vzero(), out);
}
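
// Scalar equivalent of the guard (illustrative only): 1/v overflows to Inf or
// yields NaN exactly when the result's exponent bits are all ones, e.g. for
// v == 0.0f or v == INFINITY; such lanes are replaced with zero.
static inline float hvx_inverse_f32_guard_ref(float v) {
    union { float f; uint32_t u; } r = { .f = hvx_inverse_f32_ref(v) };
    return ((r.u & 0x7F800000u) == 0x7F800000u) ? 0.0f : r.f;
}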

// Shared loop body: process n fp32 elements as full HVX vectors, then store
// any leftover elements with a partial store.
#define hvx_inverse_f32_loop_body(dst_type, src_type, vec_store)             \
    do {                                                                     \
        dst_type * restrict vdst       = (dst_type *) dst;                   \
        const src_type * restrict vsrc = (const src_type *) src;             \
                                                                             \
        const HVX_Vector nan_inf_mask = Q6_V_vsplat_R(0x7f800000);           \
                                                                             \
        const uint32_t nvec = n / VLEN_FP32; /* full vectors */              \
        const uint32_t nloe = n % VLEN_FP32; /* leftover elements */         \
                                                                             \
        uint32_t i = 0;                                                      \
                                                                             \
        _Pragma("unroll(4)")                                                 \
        for (; i < nvec; i++) {                                              \
            vdst[i] = hvx_vec_inverse_f32_guard(vsrc[i], nan_inf_mask);      \
        }                                                                    \
        if (nloe) {                                                          \
            HVX_Vector v = hvx_vec_inverse_f32_guard(vsrc[i], nan_inf_mask); \
            vec_store((void *) &vdst[i], nloe * SIZEOF_FP32, v);             \
        }                                                                    \
    } while (0)

// Variants by pointer alignment: 'a' = 128-byte aligned, 'u' = unaligned;
// the first letter is for dst, the second for src.
static inline void hvx_inverse_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    assert((unsigned long) dst % 128 == 0);
    assert((unsigned long) src % 128 == 0);
    hvx_inverse_f32_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a);
}

static inline void hvx_inverse_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    assert((unsigned long) dst % 128 == 0);
    hvx_inverse_f32_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a);
}

static inline void hvx_inverse_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    assert((unsigned long) src % 128 == 0);
    hvx_inverse_f32_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u);
}

static inline void hvx_inverse_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    hvx_inverse_f32_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u);
}

// Dispatch on dst/src alignment.
static inline void hvx_inverse_f32(uint8_t * restrict dst, const uint8_t * restrict src, const int num_elems) {
    if ((unsigned long) dst % 128 == 0) {
        if ((unsigned long) src % 128 == 0) {
            hvx_inverse_f32_aa(dst, src, num_elems);
        } else {
            hvx_inverse_f32_au(dst, src, num_elems);
        }
    } else {
        if ((unsigned long) src % 128 == 0) {
            hvx_inverse_f32_ua(dst, src, num_elems);
        } else {
            hvx_inverse_f32_uu(dst, src, num_elems);
        }
    }
}
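
// Example (hedged sketch; buffer names are illustrative): element-wise 1/x
// over an fp32 buffer. The dispatcher picks the aligned/unaligned variant per
// pointer, so callers need no particular alignment:
//
//   float src[256] __attribute__((aligned(128)));
//   float dst[256];
//   // ... fill src ...
//   hvx_inverse_f32((uint8_t *) dst, (const uint8_t *) src, 256);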

#endif // HVX_INVERSE_H