Diffstat (limited to 'llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h')
 -rw-r--r--  llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h  176
 1 file changed, 176 insertions(+), 0 deletions(-)
diff --git a/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h b/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h
new file mode 100644
index 0000000..49f3efa
--- /dev/null
+++ b/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h
@@ -0,0 +1,176 @@
#ifndef HVX_INVERSE_H
#define HVX_INVERSE_H

#include <HAP_farf.h>

#include <math.h>
#include <string.h>
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include "hvx-base.h"

// ====================================================
// FUNCTION: 1/(x+1)  y(0) = 1, y(0.5) = 0.6667, y(1) = 0.5
// Order: 3; continuity: True; Ends forced: True
// Mode: unsigned; Result fractional bits: 14
// Peak Error: 1.1295e-04  Rms Error: 2.8410e-05  Mean Error: 1.1370e-05
//   32769  -32706  31252  -10589
//   32590  -30635  22793   -4493
//   32066  -27505  16481   -2348
//   31205  -24054  11849   -1306

static inline HVX_Vector hvx_vec_recip_xp1_O3_unsigned(HVX_Vector vx) {
    // input is 0..0xffff representing 0.0 .. 1.0
    HVX_Vector p;
    p = Q6_Vh_vlut4_VuhPh(vx, 0xFAE6F6D4EE73D6A3ull);
    p = Q6_Vh_vmpa_VhVhVuhPuh_sat(p, vx, 0x2E49406159097A14ull);
    p = Q6_Vh_vmps_VhVhVuhPuh_sat(p, vx, 0x5DF66B7177AB7FC2ull);
    p = Q6_Vh_vmpa_VhVhVuhPuh_sat(p, vx, 0x79E57D427F4E8001ull);
    return p;  // signed result, 14 fractional bits
}
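
// Illustrative note on the chain above (a sketch of the mechanism, with the
// exact fixed-point scaling of the v68 ops elided): Q6_Vh_vlut4_VuhPh picks a
// per-lane 16-bit coefficient from the 64-bit immediate using the top bits of
// vx, so each lane gets the coefficients of its own segment; each vmpa/vmps
// step then multiplies by vx and adds (or subtracts, for the negative column)
// the next packed coefficient -- a Horner evaluation of a piecewise cubic.
// The four immediates pack the four columns of the coefficient table above,
// one row per segment:
//
//     y(x) ~= ((c3*x + c2)*x + c1)*x + c0 ~= 1/(1 + x),  x in [0,1)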

// Find the reciprocal of fp16.
// (1) first, convert to fp32, multiplying by 1.0; this is done to
//     handle denormals. Ignoring sign and zero, the result should be at
//     least 5.9604645e-08 (32-bit code 0x33800000) and at most 131008 (0x47ffe000)
//     (exponent in range [103,143])
// (2) extract the mantissa into 16-bit unsigned; find the reciprocal using a fitted poly
// (3) put this, along with '253-exp' (exp from (1)), together to make a qf32
// (4) convert that to fp16
// (5) put the sign back in. Also, if the original value (w/o sign) was < 0x81, replace
//     the result with the max value.
static inline HVX_Vector hvx_vec_inverse_f16(HVX_Vector vals) {
    HVX_Vector em_mask = Q6_Vh_vsplat_R(0x7FFF);
    HVX_Vector avals   = Q6_V_vand_VV(vals, em_mask);
    HVX_VectorPred is_neg = Q6_Q_vcmp_gt_VhVh(avals, vals);
    // too small to take 1/x of? for 'standard' fp16 the threshold would be 0x101
    HVX_VectorPred is_small = Q6_Q_vcmp_gt_VhVh(Q6_Vh_vsplat_R(0x101), avals);

    HVX_VectorPair to_qf32 = Q6_Wqf32_vmpy_VhfVhf(avals, Q6_Vh_vsplat_R(0x3C00));  // * 1.0
    HVX_Vector to_f32_0 = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(to_qf32));
    HVX_Vector to_f32_1 = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(to_qf32));

    // bits 22..13 contain the mantissa now (w/o hidden bit); move it to bits 15..6 of a 16-bit vector
    HVX_Vector mant_u16 = Q6_Vh_vshuffo_VhVh(Q6_Vw_vasl_VwR(to_f32_1, 9), Q6_Vw_vasl_VwR(to_f32_0, 9));
    // likewise extract the upper 16 bits of each word, containing the exponents in range 103..142
    HVX_Vector exp_u16 = Q6_Vh_vshuffo_VhVh(to_f32_1, to_f32_0);
    // get the exponent of the IEEE 32-bit representation
    exp_u16 = Q6_Vuh_vlsr_VuhR(exp_u16, 7);

    // so, mant_u16 contains an unbiased mantissa in the upper 10 bits of each u16 lane.
    // We can consider it to be x-1.0, with 16 fractional bits, where 'x' is in range [1.0,2.0).
    // Use the poly to transform it to 1/x, with 14 fractional bits.
    HVX_Vector rm = hvx_vec_recip_xp1_O3_unsigned(mant_u16);

    HVX_Vector vcl0 = Q6_Vuh_vcl0_Vuh(rm);  // count leading zeros

    // get the mantissa for the 16-bit representation
    HVX_Vector mant_recip = Q6_V_vand_VV(Q6_Vh_vasr_VhR(Q6_Vh_vasl_VhVh(rm, vcl0), 5), Q6_Vh_vsplat_R(0x03FF));

    // compute the reciprocal's exponent
    HVX_Vector exp_recip =
        Q6_Vh_vsub_VhVh(Q6_Vh_vsub_VhVh(Q6_Vh_vsplat_R(254), exp_u16), Q6_Vh_vsub_VhVh(vcl0, Q6_Vh_vsplat_R(1)));
    // convert it to the 16-bit representation (rebias from 127 to 15)
    exp_recip = Q6_Vh_vadd_VhVh_sat(Q6_Vh_vsub_VhVh(exp_recip, Q6_Vh_vsplat_R(127)), Q6_Vh_vsplat_R(15));
    exp_recip = Q6_Vh_vasl_VhR(exp_recip, 10);

    // merge the exponent and mantissa of the reciprocal
    HVX_Vector recip = Q6_V_vor_VV(exp_recip, mant_recip);
    // map 'small' inputs to the standard largest value 0x7bff
    recip = Q6_V_vmux_QVV(is_small, Q6_Vh_vsplat_R(0x7bff), recip);
    // add the sign back
    recip = Q6_V_vandor_VQR(recip, is_neg, 0x80008000);
    return recip;
}
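
// Worked example (single lane, approximate): vals = 1.5 (fp16 0x3E00).
// As f32 this is 0x3FC00000, so mant_u16 = 0x8000 (x-1.0 = 0.5) and
// exp_u16 = 127. The poly then gives rm ~= 1/1.5 in Q14 (~0x2AAB), so
// vcl0 = 2 and mant_recip = 0x155, and exp_recip = (254-127) - (2-1) = 126,
// which rebiases to 126-127+15 = 14. Result: 0x3800 | 0x155 = 0x3955
// ~= 0.6665 ~= 1/1.5.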

static inline HVX_Vector hvx_vec_inverse_f32(HVX_Vector v_sf) {
    HVX_Vector inv_aprox_sf = Q6_V_vsplat_R(0x7EEEEBB3);
    HVX_Vector two_sf       = hvx_vec_splat_f32(2.0);

    // first approximation: subtract the bit pattern from a magic constant
    HVX_Vector i_sf = Q6_Vw_vsub_VwVw(inv_aprox_sf, v_sf);

    HVX_Vector r_qf;

    // refine with three Newton-Raphson steps: r = r * (2 - r * x)
    r_qf = Q6_Vqf32_vmpy_VsfVsf(
        i_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(i_sf, v_sf)))));
    r_qf = Q6_Vqf32_vmpy_Vqf32Vqf32(
        r_qf, Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(r_qf), v_sf))));
    r_qf = Q6_Vqf32_vmpy_Vqf32Vqf32(
        r_qf, Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(r_qf), v_sf))));

    return Q6_Vsf_equals_Vqf32(r_qf);
}
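
// Reference: a scalar model of the same scheme -- an illustrative sketch,
// not used by the HVX path above. Subtracting the input's bit pattern from
// the magic constant 0x7EEEEBB3 approximately negates the exponent, yielding
// a crude seed for 1/x; each Newton-Raphson step r = r*(2 - r*x) roughly
// doubles the number of correct bits, so three steps reach near-full
// single precision.
static inline float hvx_inverse_f32_scalar_ref(float x) {
    union { float f; uint32_t u; } v = { .f = x };
    v.u = 0x7EEEEBB3u - v.u;   // crude seed for 1/x
    float r = v.f;
    r = r * (2.0f - r * x);    // Newton-Raphson step 1
    r = r * (2.0f - r * x);    // step 2
    r = r * (2.0f - r * x);    // step 3
    return r;
}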

static inline HVX_Vector hvx_vec_inverse_f32_guard(HVX_Vector v_sf, HVX_Vector nan_inf_mask) {
    HVX_Vector out = hvx_vec_inverse_f32(v_sf);

    // if all exponent bits of the result are set, it is Inf/NaN; squash it to zero
    HVX_Vector masked_out = Q6_V_vand_VV(out, nan_inf_mask);
    const HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(nan_inf_mask, masked_out);

    return Q6_V_vmux_QVV(pred, Q6_V_vzero(), out);
}

#define hvx_inverse_f32_loop_body(dst_type, src_type, vec_store)              \
    do {                                                                      \
        dst_type * restrict vdst = (dst_type *) dst;                          \
        src_type * restrict vsrc = (src_type *) src;                          \
                                                                              \
        const HVX_Vector nan_inf_mask = Q6_V_vsplat_R(0x7f800000);            \
                                                                              \
        const uint32_t nvec = n / VLEN_FP32; /* full vectors */               \
        const uint32_t nloe = n % VLEN_FP32; /* leftover elements */          \
                                                                              \
        uint32_t i = 0;                                                       \
                                                                              \
        _Pragma("unroll(4)")                                                  \
        for (; i < nvec; i++) {                                               \
            vdst[i] = hvx_vec_inverse_f32_guard(vsrc[i], nan_inf_mask);       \
        }                                                                     \
        if (nloe) {                                                           \
            HVX_Vector v = hvx_vec_inverse_f32_guard(vsrc[i], nan_inf_mask);  \
            vec_store((void *) &vdst[i], nloe * SIZEOF_FP32, v);              \
        }                                                                     \
    } while (0)

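// Alignment variants: 'a' = 128-byte aligned, 'u' = potentially unaligned,
// named for dst and src in that order (e.g. _au = aligned dst, unaligned src).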
static inline void hvx_inverse_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    assert((unsigned long) dst % 128 == 0);
    assert((unsigned long) src % 128 == 0);
    hvx_inverse_f32_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a);
}

static inline void hvx_inverse_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    assert((unsigned long) dst % 128 == 0);
    hvx_inverse_f32_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a);
}

static inline void hvx_inverse_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    assert((unsigned long) src % 128 == 0);
    hvx_inverse_f32_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u);
}

static inline void hvx_inverse_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    hvx_inverse_f32_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u);
}

static inline void hvx_inverse_f32(uint8_t * restrict dst, uint8_t * restrict src, const int num_elems) {
    if ((unsigned long) dst % 128 == 0) {
        if ((unsigned long) src % 128 == 0) {
            hvx_inverse_f32_aa(dst, src, num_elems);
        } else {
            hvx_inverse_f32_au(dst, src, num_elems);
        }
    } else {
        if ((unsigned long) src % 128 == 0) {
            hvx_inverse_f32_ua(dst, src, num_elems);
        } else {
            hvx_inverse_f32_uu(dst, src, num_elems);
        }
    }
}
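
// Usage sketch (hypothetical buffers, shown for illustration only):
//
//   float src[256] __attribute__((aligned(128)));
//   float dst[256] __attribute__((aligned(128)));
//   hvx_inverse_f32((uint8_t *) dst, (uint8_t *) src, 256);
//
// The dispatcher checks both pointers and picks the matching
// aligned/unaligned variant, so unaligned buffers also work (via _uu et al.).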

#endif // HVX_INVERSE_H
