1#ifndef HVX_DIV_H
  2#define HVX_DIV_H
  3
  4#include <HAP_farf.h>
  5
  6#include <math.h>
  7#include <string.h>
  8#include <assert.h>
  9#include <stddef.h>
 10#include <stdint.h>
 11
 12#include "hvx-base.h"
 13#include "hex-utils.h"
 14#include "hvx-inverse.h"
 15#include "hvx-arith.h"
 16
// fp32 vector multiply helper. On pre-v79 HVX the product is formed in the
// qf32 format and converted back to IEEE sf (presumably because the direct
// sf*sf multiply is unavailable there — TODO confirm against the HVX arch
// manual); v79+ uses the native sf multiply. #undef'd at the end of this header.
#if __HVX_ARCH__ < 79
#define HVX_OP_MUL(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a, b))
#else
#define HVX_OP_MUL(a, b) Q6_Vsf_vmpy_VsfVsf(a, b)
#endif
 22
 23#define hvx_div_f32_loop_body(dst_type, src0_type, src1_type, vec_store)             \
 24    do {                                                                             \
 25        dst_type * restrict vdst = (dst_type *) dst;                                 \
 26        src0_type * restrict vsrc0 = (src0_type *) src0;                             \
 27        src1_type * restrict vsrc1 = (src1_type *) src1;                             \
 28                                                                                     \
 29        const HVX_Vector nan_inf_mask = Q6_V_vsplat_R(0x7f800000);                   \
 30                                                                                     \
 31        const uint32_t nvec = n / VLEN_FP32;                                         \
 32        const uint32_t nloe = n % VLEN_FP32;                                         \
 33                                                                                     \
 34        uint32_t i = 0;                                                              \
 35                                                                                     \
 36        _Pragma("unroll(4)")                                                         \
 37        for (; i < nvec; i++) {                                                      \
 38            HVX_Vector inv_src1 = hvx_vec_inverse_f32_guard(vsrc1[i], nan_inf_mask); \
 39            HVX_Vector res = HVX_OP_MUL(vsrc0[i], inv_src1);                         \
 40            vdst[i] = res;                                                           \
 41        }                                                                            \
 42        if (nloe) {                                                                  \
 43            HVX_Vector inv_src1 = hvx_vec_inverse_f32_guard(vsrc1[i], nan_inf_mask); \
 44            HVX_Vector res = HVX_OP_MUL(vsrc0[i], inv_src1);                         \
 45            vec_store((void *) &vdst[i], nloe * SIZEOF_FP32, res);                   \
 46        }                                                                            \
 47    } while(0)
 48
// Alignment-suffix variants: each letter is 'a' (128-byte aligned) or 'u'
// (unaligned) for dst, src0 and src1, in that order.
// Divide n fp32 elements: dst = src0 / src1.
// "aaa": dst, src0 and src1 are all 128-byte (vector) aligned.
static inline void hvx_div_f32_aaa(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) {
    assert((uintptr_t) dst % 128 == 0);
    assert((uintptr_t) src0 % 128 == 0);
    assert((uintptr_t) src1 % 128 == 0);
    hvx_div_f32_loop_body(HVX_Vector, HVX_Vector, HVX_Vector, hvx_vec_store_a);
}
 56
// Divide n fp32 elements: dst = src0 / src1.
// "aau": dst and src0 are 128-byte aligned; src1 may be unaligned.
static inline void hvx_div_f32_aau(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) {
    assert((uintptr_t) dst % 128 == 0);
    assert((uintptr_t) src0 % 128 == 0);
    hvx_div_f32_loop_body(HVX_Vector, HVX_Vector, HVX_UVector, hvx_vec_store_a);
}
 62
// Divide n fp32 elements: dst = src0 / src1.
// "aua": dst and src1 are 128-byte aligned; src0 may be unaligned.
static inline void hvx_div_f32_aua(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) {
    assert((uintptr_t) dst % 128 == 0);
    assert((uintptr_t) src1 % 128 == 0);
    hvx_div_f32_loop_body(HVX_Vector, HVX_UVector, HVX_Vector, hvx_vec_store_a);
}
 68
// Divide n fp32 elements: dst = src0 / src1.
// "auu": only dst is 128-byte aligned; src0 and src1 may be unaligned.
static inline void hvx_div_f32_auu(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) {
    assert((uintptr_t) dst % 128 == 0);
    hvx_div_f32_loop_body(HVX_Vector, HVX_UVector, HVX_UVector, hvx_vec_store_a);
}
 73
// Divide n fp32 elements: dst = src0 / src1.
// "uaa": src0 and src1 are 128-byte aligned; dst may be unaligned
// (leftover elements go through the unaligned partial store).
static inline void hvx_div_f32_uaa(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) {
    assert((uintptr_t) src0 % 128 == 0);
    assert((uintptr_t) src1 % 128 == 0);
    hvx_div_f32_loop_body(HVX_UVector, HVX_Vector, HVX_Vector, hvx_vec_store_u);
}
 79
// Divide n fp32 elements: dst = src0 / src1.
// "uau": only src0 is 128-byte aligned; dst and src1 may be unaligned.
static inline void hvx_div_f32_uau(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) {
    assert((uintptr_t) src0 % 128 == 0);
    hvx_div_f32_loop_body(HVX_UVector, HVX_Vector, HVX_UVector, hvx_vec_store_u);
}
 84
// Divide n fp32 elements: dst = src0 / src1.
// "uua": only src1 is 128-byte aligned; dst and src0 may be unaligned.
static inline void hvx_div_f32_uua(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) {
    assert((uintptr_t) src1 % 128 == 0);
    hvx_div_f32_loop_body(HVX_UVector, HVX_UVector, HVX_Vector, hvx_vec_store_u);
}
 89
// Divide n fp32 elements: dst = src0 / src1.
// "uuu": fully unaligned fallback — no alignment assumed for any pointer.
static inline void hvx_div_f32_uuu(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) {
    hvx_div_f32_loop_body(HVX_UVector, HVX_UVector, HVX_UVector, hvx_vec_store_u);
}
 93
 94static inline void hvx_div_f32(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, const uint32_t num_elems) {
 95    if (hex_is_aligned((void *) dst, 128)) {
 96        if (hex_is_aligned((void *) src0, 128)) {
 97            if (hex_is_aligned((void *) src1, 128)) hvx_div_f32_aaa(dst, src0, src1, num_elems);
 98            else                                    hvx_div_f32_aau(dst, src0, src1, num_elems);
 99        } else {
100            if (hex_is_aligned((void *) src1, 128)) hvx_div_f32_aua(dst, src0, src1, num_elems);
101            else                                    hvx_div_f32_auu(dst, src0, src1, num_elems);
102        }
103    } else {
104        if (hex_is_aligned((void *) src0, 128)) {
105            if (hex_is_aligned((void *) src1, 128)) hvx_div_f32_uaa(dst, src0, src1, num_elems);
106            else                                    hvx_div_f32_uau(dst, src0, src1, num_elems);
107        } else {
108            if (hex_is_aligned((void *) src1, 128)) hvx_div_f32_uua(dst, src0, src1, num_elems);
109            else                                    hvx_div_f32_uuu(dst, src0, src1, num_elems);
110        }
111    }
112}
113
114#undef HVX_OP_MUL
115
116#endif // HVX_DIV_H