#ifndef HVX_COPY_H
#define HVX_COPY_H

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include "hvx-base.h"

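// hvx_splat_loop_body: shared loop for the splat routines below. It expects
// `dst`, `src`, `n` (element count) and `elem_size` (bytes per element) in
// scope, stores `src` to full 128-byte vectors, and writes the remaining
// tail bytes through `vec_store`.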
#define hvx_splat_loop_body(dst_type, vec_store) \
    do { \
        dst_type * restrict vdst = (dst_type *) dst; \
        \
        uint32_t nvec = n / (128 / elem_size); \
        uint32_t nloe = n % (128 / elem_size); \
        \
        uint32_t i = 0; \
        \
        _Pragma("unroll(4)") \
        for (; i < nvec; i++) { \
            vdst[i] = src; \
        } \
        if (nloe) { \
            vec_store((void *) &vdst[i], nloe * elem_size, src); \
        } \
    } while(0)

static inline void hvx_splat_a(uint8_t * restrict dst, HVX_Vector src, uint32_t n, uint32_t elem_size) {
    assert((unsigned long) dst % 128 == 0);
    hvx_splat_loop_body(HVX_Vector, hvx_vec_store_a);
}

static inline void hvx_splat_u(uint8_t * restrict dst, HVX_Vector src, uint32_t n, uint32_t elem_size) {
    hvx_splat_loop_body(HVX_UVector, hvx_vec_store_u);
}

static inline void hvx_splat_f32_a(uint8_t * restrict dst, float v, uint32_t n) {
    hvx_splat_a(dst, hvx_vec_splat_f32(v), n, sizeof(float));
}

static inline void hvx_splat_f32_u(uint8_t * restrict dst, float v, uint32_t n) {
    hvx_splat_u(dst, hvx_vec_splat_f32(v), n, sizeof(float));
}

static inline void hvx_splat_f16_a(uint8_t * restrict dst, float v, uint32_t n) {
    hvx_splat_a(dst, hvx_vec_splat_f16(v), n, sizeof(__fp16));
}

static inline void hvx_splat_f16_u(uint8_t * restrict dst, float v, uint32_t n) {
    hvx_splat_u(dst, hvx_vec_splat_f16(v), n, sizeof(__fp16));
}

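// Usage sketch (hypothetical buffer): fill a 128-byte-aligned buffer of
// 256 floats with 1.0f. The aligned variant asserts the alignment.
//
//   float buf[256] __attribute__((aligned(128)));
//   hvx_splat_f32_a((uint8_t *) buf, 1.0f, 256);

// hvx_copy_loop_body: shared loop for the copy routines below. As with the
// splat body, `n` is an element count and `elem_size` is bytes per element.
// Note that the tail path still loads a full 128-byte vector from `src`
// before storing only the remaining `nloe * elem_size` bytes.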
#define hvx_copy_loop_body(dst_type, src_type, vec_store) \
    do { \
        dst_type * restrict vdst = (dst_type *) dst; \
        src_type * restrict vsrc = (src_type *) src; \
        \
        const uint32_t epv = 128 / elem_size; \
        const uint32_t nvec = n / epv; \
        const uint32_t nloe = n % epv; \
        \
        uint32_t i = 0; \
        \
        _Pragma("unroll(4)") \
        for (; i < nvec; i++) { vdst[i] = vsrc[i]; } \
        if (nloe) { \
            vec_store((void *) &vdst[i], nloe * elem_size, vsrc[i]); \
        } \
    } while(0)

// Generic copy routines
static inline void hvx_copy_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n, uint32_t elem_size) {
    assert((unsigned long) dst % 128 == 0);
    assert((unsigned long) src % 128 == 0);
    hvx_copy_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a);
}

static inline void hvx_copy_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n, uint32_t elem_size) {
    assert((unsigned long) dst % 128 == 0);
    hvx_copy_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a);
}

static inline void hvx_copy_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n, uint32_t elem_size) {
    assert((unsigned long) src % 128 == 0);
    hvx_copy_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u);
}

static inline void hvx_copy_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n, uint32_t elem_size) {
    hvx_copy_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u);
}

// copy n fp16 elements : source and destination are aligned to HVX Vector (128)
static inline void hvx_copy_f16_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    hvx_copy_aa(dst, src, n, sizeof(__fp16));
}

// copy n fp16 elements : source is potentially unaligned, destination is aligned
static inline void hvx_copy_f16_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    hvx_copy_au(dst, src, n, sizeof(__fp16));
}

// copy n fp16 elements : source is aligned, destination is potentially unaligned
static inline void hvx_copy_f16_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    hvx_copy_ua(dst, src, n, sizeof(__fp16));
}

// copy n fp16 elements : source and destination are potentially unaligned
static inline void hvx_copy_f16_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    hvx_copy_uu(dst, src, n, sizeof(__fp16));
}

// copy n fp32 elements : source and destination are aligned to HVX Vector (128)
static inline void hvx_copy_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    hvx_copy_aa(dst, src, n, sizeof(float));
}

// copy n fp32 elements : source is aligned, destination is unaligned
static inline void hvx_copy_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    hvx_copy_ua(dst, src, n, sizeof(float));
}

// copy n fp32 elements : source is unaligned, destination is aligned
static inline void hvx_copy_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    hvx_copy_au(dst, src, n, sizeof(float));
}

// copy n fp32 elements : source is unaligned, destination is unaligned
static inline void hvx_copy_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    hvx_copy_uu(dst, src, n, sizeof(float));
}
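
// Usage sketch (hypothetical buffers): copy 300 floats from a potentially
// unaligned source into a 128-byte-aligned destination.
//
//   hvx_copy_f32_au(dst_aligned, src_unaligned, 300);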

//// fp32 -> fp16

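// Each fp16 output vector (64 halves) is built from two consecutive fp32
// source vectors (32 floats each) via hvx_vec_f32_to_f16() from hvx-base.h.
// The tail path still loads two full source vectors even when fewer than
// 64 elements remain.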
#define hvx_copy_f16_f32_loop_body(dst_type, src_type, vec_store) \
    do { \
        dst_type * restrict vdst = (dst_type *) dst; \
        src_type * restrict vsrc = (src_type *) src; \
        \
        const uint32_t elem_size = sizeof(__fp16); \
        const uint32_t epv = 128 / elem_size; \
        const uint32_t nvec = n / epv; \
        const uint32_t nloe = n % epv; \
        \
        uint32_t i = 0; \
        \
        _Pragma("unroll(4)") \
        for (; i < nvec; i++) { \
            vdst[i] = hvx_vec_f32_to_f16(vsrc[i*2+0], vsrc[i*2+1]); \
        } \
        if (nloe) { \
            HVX_Vector v = hvx_vec_f32_to_f16(vsrc[i*2+0], vsrc[i*2+1]); \
            vec_store((void *) &vdst[i], nloe * elem_size, v); \
        } \
    } while(0)

// copy/convert n fp32 elements into n fp16 elements : source is aligned, destination is aligned
static inline void hvx_copy_f16_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    assert((unsigned long) dst % 128 == 0);
    assert((unsigned long) src % 128 == 0);
    hvx_copy_f16_f32_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a);
}

// copy/convert n fp32 elements into n fp16 elements : source is unaligned, destination is aligned
static inline void hvx_copy_f16_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    assert((unsigned long) dst % 128 == 0);
    hvx_copy_f16_f32_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a);
}

// copy/convert n fp32 elements into n fp16 elements : source is aligned, destination is unaligned
static inline void hvx_copy_f16_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    assert((unsigned long) src % 128 == 0);
    hvx_copy_f16_f32_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u);
}

// copy/convert n fp32 elements into n fp16 elements : source is unaligned, destination is unaligned
static inline void hvx_copy_f16_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    hvx_copy_f16_f32_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u);
}
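
// Usage sketch (hypothetical buffers): convert 1000 floats to __fp16 when
// neither pointer is known to be 128-byte aligned.
//
//   hvx_copy_f16_f32_uu(dst_f16, src_f32, 1000);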

//// fp16 -> fp32

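// Widening trick: multiplying the fp16 input by 1.0 (hf) with
// Q6_Wqf32_vmpy_VhfVhf yields a qf32 vector pair; Q6_Vsf_equals_Vqf32 then
// converts qf32 to IEEE fp32. The Q6_Vh_vshuff_Vh pre-shuffle compensates
// for the even/odd split of the widening multiply, so the low and high
// result vectors hold elements 0..31 and 32..63 in order. The partial tail
// store always goes through hvx_vec_store_u.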
#define hvx_copy_f32_f16_loop_body(dst_type, src_type, vec_store) \
    do { \
        dst_type * restrict vdst = (dst_type *) dst; \
        src_type * restrict vsrc = (src_type *) src; \
        \
        const HVX_Vector one = hvx_vec_splat_f16(1.0); \
        \
        const uint32_t elem_size = sizeof(__fp16); \
        const uint32_t epv = 128 / elem_size; \
        const uint32_t nvec = n / epv; \
        uint32_t nloe = n % epv; \
        \
        uint32_t i = 0; \
        \
        _Pragma("unroll(4)") \
        for (i = 0; i < nvec; ++i) { \
            HVX_VectorPair p = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vsrc[i]), one); \
            vdst[i*2] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(p)); \
            vdst[i*2+1] = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(p)); \
        } \
        \
        if (nloe) { \
            HVX_VectorPair p = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vsrc[i]), one); \
            \
            HVX_Vector vd = Q6_V_lo_W(p); \
            i = 2 * i; \
            \
            if (nloe >= 32) { \
                vdst[i] = Q6_Vsf_equals_Vqf32(vd); \
                nloe -= 32; ++i; vd = Q6_V_hi_W(p); \
            } \
            \
            if (nloe) { \
                vd = Q6_Vsf_equals_Vqf32(vd); \
                hvx_vec_store_u((void *) &vdst[i], nloe * sizeof(float), vd); \
            } \
        } \
    } while(0)

// copy/convert n fp16 elements into n fp32 elements : source is aligned, destination is aligned
static inline void hvx_copy_f32_f16_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    assert((unsigned long) dst % 128 == 0);
    assert((unsigned long) src % 128 == 0);
    hvx_copy_f32_f16_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a);
}

// copy/convert n fp16 elements into n fp32 elements : source is unaligned, destination is aligned
static inline void hvx_copy_f32_f16_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    assert((unsigned long) dst % 128 == 0);
    hvx_copy_f32_f16_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a);
}

// copy/convert n fp16 elements into n fp32 elements : source is aligned, destination is unaligned
static inline void hvx_copy_f32_f16_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    assert((unsigned long) src % 128 == 0);
    hvx_copy_f32_f16_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u);
}

// copy/convert n fp16 elements into n fp32 elements : source is unaligned, destination is unaligned
static inline void hvx_copy_f32_f16_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    hvx_copy_f32_f16_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u);
}

#endif // HVX_COPY_H