1#ifndef HVX_DIV_H
2#define HVX_DIV_H
3
4#include <HAP_farf.h>
5
6#include <math.h>
7#include <string.h>
8#include <assert.h>
9#include <stddef.h>
10#include <stdint.h>
11
12#include "hvx-base.h"
13#include "hex-utils.h"
14#include "hvx-inverse.h"
15#include "hvx-arith.h"
16
// HVX_OP_MUL(a, b): elementwise f32 multiply of two HVX vectors, yielding sf.
// NOTE(review): on targets below v79 there is no direct sf*sf->sf multiply
// visible here, so the product is formed in the qf32 intermediate format and
// converted back with Q6_Vsf_equals_Vqf32; v79+ presumably provides the native
// Q6_Vsf_vmpy_VsfVsf -- confirm against the HVX architecture manual.
#if __HVX_ARCH__ < 79
#define HVX_OP_MUL(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a, b))
#else
#define HVX_OP_MUL(a, b) Q6_Vsf_vmpy_VsfVsf(a, b)
#endif
22
/*
 * hvx_div_f32_loop_body(dst_type, src0_type, src1_type, vec_store)
 *
 * Shared elementwise f32 division loop: for each lane, dst[i] = src0[i] *
 * inverse(src1[i]) -- division is implemented as a multiply by a guarded
 * reciprocal (hvx_vec_inverse_f32_guard, see hvx-inverse.h) rather than a
 * hardware divide.
 *
 * Expects `dst`, `src0`, `src1` (byte pointers) and `n` (element count) to be
 * in scope at the expansion site.  dst_type/src0_type/src1_type select
 * HVX_Vector (aligned) vs HVX_UVector (unaligned) access for each stream;
 * vec_store is the partial-store helper used for the tail elements.
 *
 * NOTE(review): the tail path still performs full-vector loads of
 * vsrc0[i]/vsrc1[i], i.e. it may read up to one vector past the last live
 * element.  This assumes the source buffers are readable there (common HVX
 * padding convention) -- confirm with the allocation sites.
 */
#define hvx_div_f32_loop_body(dst_type, src0_type, src1_type, vec_store) \
    do { \
        dst_type * restrict vdst = (dst_type *) dst; \
        src0_type * restrict vsrc0 = (src0_type *) src0; \
        src1_type * restrict vsrc1 = (src1_type *) src1; \
        \
        /* 0x7f800000 is the f32 exponent field; presumably lets the guard detect Inf/NaN divisors -- see hvx-inverse.h */ \
        const HVX_Vector nan_inf_mask = Q6_V_vsplat_R(0x7f800000); \
        \
        /* nvec full vectors, then nloe (< VLEN_FP32) leftover elements */ \
        const uint32_t nvec = n / VLEN_FP32; \
        const uint32_t nloe = n % VLEN_FP32; \
        \
        uint32_t i = 0; \
        \
        _Pragma("unroll(4)") \
        for (; i < nvec; i++) { \
            HVX_Vector inv_src1 = hvx_vec_inverse_f32_guard(vsrc1[i], nan_inf_mask); \
            HVX_Vector res = HVX_OP_MUL(vsrc0[i], inv_src1); \
            vdst[i] = res; \
        } \
        /* tail: compute a full vector but store only nloe * 4 bytes */ \
        if (nloe) { \
            HVX_Vector inv_src1 = hvx_vec_inverse_f32_guard(vsrc1[i], nan_inf_mask); \
            HVX_Vector res = HVX_OP_MUL(vsrc0[i], inv_src1); \
            vec_store((void *) &vdst[i], nloe * SIZEOF_FP32, res); \
        } \
    } while(0)
48
// 3-letter suffix encodes the alignment of (dst, src0, src1), in that order:
// 'a' = 128-byte (vector) aligned, 'u' = possibly unaligned.
50static inline void hvx_div_f32_aaa(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) {
51 assert((uintptr_t) dst % 128 == 0);
52 assert((uintptr_t) src0 % 128 == 0);
53 assert((uintptr_t) src1 % 128 == 0);
54 hvx_div_f32_loop_body(HVX_Vector, HVX_Vector, HVX_Vector, hvx_vec_store_a);
55}
56
57static inline void hvx_div_f32_aau(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) {
58 assert((uintptr_t) dst % 128 == 0);
59 assert((uintptr_t) src0 % 128 == 0);
60 hvx_div_f32_loop_body(HVX_Vector, HVX_Vector, HVX_UVector, hvx_vec_store_a);
61}
62
63static inline void hvx_div_f32_aua(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) {
64 assert((uintptr_t) dst % 128 == 0);
65 assert((uintptr_t) src1 % 128 == 0);
66 hvx_div_f32_loop_body(HVX_Vector, HVX_UVector, HVX_Vector, hvx_vec_store_a);
67}
68
69static inline void hvx_div_f32_auu(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) {
70 assert((uintptr_t) dst % 128 == 0);
71 hvx_div_f32_loop_body(HVX_Vector, HVX_UVector, HVX_UVector, hvx_vec_store_a);
72}
73
74static inline void hvx_div_f32_uaa(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) {
75 assert((uintptr_t) src0 % 128 == 0);
76 assert((uintptr_t) src1 % 128 == 0);
77 hvx_div_f32_loop_body(HVX_UVector, HVX_Vector, HVX_Vector, hvx_vec_store_u);
78}
79
80static inline void hvx_div_f32_uau(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) {
81 assert((uintptr_t) src0 % 128 == 0);
82 hvx_div_f32_loop_body(HVX_UVector, HVX_Vector, HVX_UVector, hvx_vec_store_u);
83}
84
85static inline void hvx_div_f32_uua(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) {
86 assert((uintptr_t) src1 % 128 == 0);
87 hvx_div_f32_loop_body(HVX_UVector, HVX_UVector, HVX_Vector, hvx_vec_store_u);
88}
89
// dst[i] = src0[i] / src1[i] over n f32 elements; no alignment requirements --
// every access goes through unaligned vector types and an unaligned store.
static inline void hvx_div_f32_uuu(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, uint32_t n) {
    hvx_div_f32_loop_body(HVX_UVector, HVX_UVector, HVX_UVector, hvx_vec_store_u);
}
93
// Elementwise f32 division dst = src0 / src1 over num_elems elements.
// Dispatches to the wrapper matching the 128-byte alignment of each pointer,
// so aligned buffers get aligned vector accesses.
static inline void hvx_div_f32(uint8_t * restrict dst, const uint8_t * restrict src0, const uint8_t * restrict src1, const uint32_t num_elems) {
    // Pack alignment into a 3-bit selector: bit2 = dst, bit1 = src0,
    // bit0 = src1; a set bit means that pointer is NOT 128-byte aligned.
    const unsigned sel = (hex_is_aligned((void *) dst, 128) ? 0u : 4u)
                       | (hex_is_aligned((void *) src0, 128) ? 0u : 2u)
                       | (hex_is_aligned((void *) src1, 128) ? 0u : 1u);

    switch (sel) {
        case 0u: hvx_div_f32_aaa(dst, src0, src1, num_elems); break;
        case 1u: hvx_div_f32_aau(dst, src0, src1, num_elems); break;
        case 2u: hvx_div_f32_aua(dst, src0, src1, num_elems); break;
        case 3u: hvx_div_f32_auu(dst, src0, src1, num_elems); break;
        case 4u: hvx_div_f32_uaa(dst, src0, src1, num_elems); break;
        case 5u: hvx_div_f32_uau(dst, src0, src1, num_elems); break;
        case 6u: hvx_div_f32_uua(dst, src0, src1, num_elems); break;
        default: hvx_div_f32_uuu(dst, src0, src1, num_elems); break;
    }
}
113
114#undef HVX_OP_MUL
115
116#endif // HVX_DIV_H