1#ifndef HVX_COPY_H
  2#define HVX_COPY_H
  3
  4#include <assert.h>
  5#include <stddef.h>
  6#include <stdint.h>
  7
  8#include "hvx-base.h"
  9
// Store the splatted vector `src` into `n` elements of `elem_size` bytes at
// `dst`: full 128-byte vectors are written by plain vector assignment, then
// the `nloe` leftover elements are written with a partial store via
// `vec_store`.
//
// Expanded inside a function that provides in scope:
//   dst (uint8_t *), src (HVX_Vector), n (element count), elem_size (bytes).
// `dst_type` selects HVX_Vector (aligned) or HVX_UVector (unaligned) stores;
// `vec_store` must match (hvx_vec_store_a / hvx_vec_store_u).
#define hvx_splat_loop_body(dst_type, vec_store)                 \
    do {                                                         \
        dst_type * restrict vdst = (dst_type *) dst;             \
                                                                 \
        uint32_t nvec = n / (128 / elem_size);                   \
        uint32_t nloe = n % (128 / elem_size);                   \
                                                                 \
        uint32_t i = 0;                                          \
                                                                 \
        _Pragma("unroll(4)")                                     \
        for (; i < nvec; i++) {                                  \
            vdst[i] = src;                                       \
        }                                                        \
        if (nloe) {                                              \
            vec_store((void *) &vdst[i], nloe * elem_size, src); \
        }                                                        \
    } while(0)
 27
// Splat vector `src` into n elements of elem_size bytes at dst.
// dst must be 128-byte (HVX vector) aligned — asserted below.
static inline void hvx_splat_a(uint8_t * restrict dst, HVX_Vector src, uint32_t n, uint32_t elem_size) {
    assert((unsigned long) dst % 128 == 0);
    hvx_splat_loop_body(HVX_Vector, hvx_vec_store_a);
}
 32
// Splat vector `src` into n elements of elem_size bytes at dst.
// dst may be unaligned (HVX_UVector stores are used throughout).
static inline void hvx_splat_u(uint8_t * restrict dst, HVX_Vector src, uint32_t n, uint32_t elem_size) {
    hvx_splat_loop_body(HVX_UVector, hvx_vec_store_u);
}
 36
 37static inline void hvx_splat_f32_a(uint8_t * restrict dst, float v, uint32_t n) {
 38    hvx_splat_a(dst,  hvx_vec_splat_f32(v), n, sizeof(float));
 39}
 40
 41static inline void hvx_splat_f32_u(uint8_t * restrict dst, float v, uint32_t n) {
 42    hvx_splat_u(dst,  hvx_vec_splat_f32(v), n, sizeof(float));
 43}
 44
 45static inline void hvx_splat_f16_a(uint8_t * restrict dst, float v, uint32_t n) {
 46    hvx_splat_u(dst,  hvx_vec_splat_f16(v), n, sizeof(__fp16));
 47}
 48
 49static inline void hvx_splat_f16_u(uint8_t * restrict dst, float v, uint32_t n) {
 50    hvx_splat_u(dst,  hvx_vec_splat_f16(v), n, sizeof(__fp16));
 51}
 52
// Copy `n` elements of `elem_size` bytes from `src` to `dst`: full 128-byte
// vectors move by assignment, then a partial store writes the `nloe`
// leftover elements.
//
// Expanded inside a function providing in scope:
//   dst, src (uint8_t *), n (element count), elem_size (bytes).
// dst_type/src_type select aligned (HVX_Vector) vs unaligned (HVX_UVector)
// access; `vec_store` must match the destination alignment.
//
// NOTE(review): the tail loads a full vector `vsrc[i]` even though only
// `nloe` elements of it are valid — assumes src is readable through the
// last full vector (common HVX convention); confirm against callers'
// buffer sizing.
#define hvx_copy_loop_body(dst_type, src_type, vec_store)            \
    do {                                                             \
        dst_type * restrict vdst = (dst_type *) dst;                 \
        src_type * restrict vsrc = (src_type *) src;                 \
                                                                     \
        const uint32_t epv  = 128 / elem_size;                       \
        const uint32_t nvec = n / epv;                               \
        const uint32_t nloe = n % epv;                               \
                                                                     \
        uint32_t i = 0;                                              \
                                                                     \
        _Pragma("unroll(4)")                                         \
        for (; i < nvec; i++) { vdst[i] = vsrc[i]; }                 \
        if (nloe) {                                                  \
            vec_store((void *) &vdst[i], nloe * elem_size, vsrc[i]); \
        }                                                            \
    } while(0)
 70
 71// Generic copy routines
// Copy n elements of elem_size bytes; both dst and src must be
// 128-byte aligned (asserted).
static inline void hvx_copy_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n, uint32_t elem_size) {
    assert((unsigned long) dst % 128 == 0);
    assert((unsigned long) src % 128 == 0);
    hvx_copy_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a);
}
 77
// Copy n elements of elem_size bytes; dst must be 128-byte aligned
// (asserted), src may be unaligned.
static inline void hvx_copy_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n, uint32_t elem_size) {
    assert((unsigned long) dst % 128 == 0);
    hvx_copy_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a);
}
 82
// Copy n elements of elem_size bytes; src must be 128-byte aligned
// (asserted), dst may be unaligned.
static inline void hvx_copy_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n, uint32_t elem_size) {
    assert((unsigned long) src % 128 == 0);
    hvx_copy_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u);
}
 87
// Copy n elements of elem_size bytes; both dst and src may be unaligned.
static inline void hvx_copy_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n, uint32_t elem_size) {
    hvx_copy_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u);
}
 91
// copy n fp16 elements : source and destination are aligned to HVX Vector (128)
// (n is an element count, not bytes)
static inline void hvx_copy_f16_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    hvx_copy_aa(dst, src, n, sizeof(__fp16));
}
 96
// copy n fp16 elements : source is potentially unaligned, destination is aligned
// (previous comment was a copy-paste error; "au" = aligned dst, unaligned src,
// matching the asserts in hvx_copy_au and the fp32 twin hvx_copy_f32_au)
static inline void hvx_copy_f16_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    hvx_copy_au(dst, src, n, sizeof(__fp16));
}
101
// copy n fp16 elements : source is aligned, destination is potentially unaligned
// ("ua" = unaligned dst, aligned src — see the assert in hvx_copy_ua)
static inline void hvx_copy_f16_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    hvx_copy_ua(dst, src, n, sizeof(__fp16));
}
106
// copy n fp16 elements : source and destination are potentially unaligned
// (previous comment was a copy-paste error; "uu" = both unaligned)
static inline void hvx_copy_f16_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    hvx_copy_uu(dst, src, n, sizeof(__fp16));
}
111
// copy n fp32 elements : source and destination are aligned to HVX Vector (128)
// (n is an element count, not bytes)
static inline void hvx_copy_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    hvx_copy_aa(dst, src, n, sizeof(float));
}
116
// copy n fp32 elements : source is aligned, destination is unaligned
static inline void hvx_copy_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    hvx_copy_ua(dst, src, n, sizeof(float));
}
121
// copy n fp32 elements : source is unaligned, destination is aligned
static inline void hvx_copy_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    hvx_copy_au(dst, src, n, sizeof(float));
}
126
// copy n fp32 elements : source is unaligned, destination unaligned
static inline void hvx_copy_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    hvx_copy_uu(dst, src, n, sizeof(float));
}
131
132//// fp32 -> fp16
133
// Convert n fp32 elements at `src` into n fp16 elements at `dst`.
// Each output vector (64 fp16) consumes two input vectors (2 x 32 fp32),
// hence the vsrc[i*2+0] / vsrc[i*2+1] pairing. elem_size here is the
// *destination* element size (fp16), so epv/nvec/nloe count fp16 elements.
//
// NOTE(review): the tail converts both vsrc[i*2+0] and vsrc[i*2+1] even
// when nloe <= 32 (only the low half is stored) — this may read one source
// vector past the valid data; confirm callers over-allocate src accordingly.
#define hvx_copy_f16_f32_loop_body(dst_type, src_type, vec_store)                   \
    do {                                                                            \
        dst_type * restrict vdst = (dst_type *) dst;                                \
        src_type * restrict vsrc = (src_type *) src;                                \
                                                                                    \
        const uint32_t elem_size = sizeof(__fp16);                                  \
        const uint32_t epv  = 128 / elem_size;                                      \
        const uint32_t nvec = n / epv;                                              \
        const uint32_t nloe = n % epv;                                              \
                                                                                    \
        uint32_t i = 0;                                                             \
                                                                                    \
        _Pragma("unroll(4)")                                                        \
        for (; i < nvec; i++) {                                                     \
            vdst[i] = hvx_vec_f32_to_f16(vsrc[i*2+0], vsrc[i*2+1]);                 \
        }                                                                           \
        if (nloe) {                                                                 \
            HVX_Vector v = hvx_vec_f32_to_f16(vsrc[i*2+0], vsrc[i*2+1]);            \
            vec_store((void *) &vdst[i], nloe * elem_size, v);                      \
        }                                                                           \
    } while(0)
155
156// copy/convert n fp32 elements into n fp16 elements : source is aligned, destination is aligned
// Convert/copy n fp32 elements into n fp16 elements.
// Both dst and src must be 128-byte aligned (asserted).
static inline void hvx_copy_f16_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    assert((unsigned long) dst % 128 == 0);
    assert((unsigned long) src % 128 == 0);
    hvx_copy_f16_f32_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a);
}
162
163// copy/convert n fp32 elements into n fp16 elements : source is unaligned, destination is aligned
// Convert/copy n fp32 elements into n fp16 elements.
// dst must be 128-byte aligned (asserted); src may be unaligned.
static inline void hvx_copy_f16_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    assert((unsigned long) dst % 128 == 0);
    hvx_copy_f16_f32_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a);
}
168
169// copy/convert n fp32 elements into n fp16 elements : source is aligned, destination is unaligned
// Convert/copy n fp32 elements into n fp16 elements.
// src must be 128-byte aligned (asserted); dst may be unaligned.
static inline void hvx_copy_f16_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    assert((unsigned long) src % 128 == 0);
    hvx_copy_f16_f32_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u);
}
174
175// copy/convert n fp32 elements into n fp16 elements : source is unaligned, destination is unaligned
// Convert/copy n fp32 elements into n fp16 elements.
// Both dst and src may be unaligned.
static inline void hvx_copy_f16_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    hvx_copy_f16_f32_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u);
}
179
180//// fp16 -> fp32
181
// Convert n fp16 elements at `src` into n fp32 elements at `dst`.
// Widening is done by multiplying each fp16 vector by a splatted 1.0
// (Q6_Wqf32_vmpy_VhfVhf yields a qf32 vector pair), then converting each
// half to IEEE fp32. The Q6_Vh_vshuff_Vh presumably compensates for the
// even/odd element split of the vmpy pair output — TODO confirm ordering.
//
// epv/nvec/nloe count *source* (fp16) elements: one input vector (64 fp16)
// produces two output vectors (2 x 32 fp32). In the tail, `i` is rescaled
// (i = 2*i) from input-vector index to output-vector index, and `nloe` is
// deliberately non-const because it is decremented as halves are stored.
//
// NOTE(review): the `vec_store` macro parameter is unused here — the tail
// hardcodes hvx_vec_store_u (safe for aligned dst too, but inconsistent
// with the other loop-body macros in this file).
#define hvx_copy_f32_f16_loop_body(dst_type, src_type, vec_store)                   \
    do {                                                                            \
        dst_type * restrict vdst = (dst_type *) dst;                                \
        src_type * restrict vsrc = (src_type *) src;                                \
                                                                                    \
        const HVX_Vector one = hvx_vec_splat_f16(1.0);                              \
                                                                                    \
        const uint32_t elem_size = sizeof(__fp16);                                  \
        const uint32_t epv  = 128 / elem_size;                                      \
        const uint32_t nvec = n / epv;                                              \
              uint32_t nloe = n % epv;                                              \
                                                                                    \
        uint32_t i = 0;                                                             \
                                                                                    \
        _Pragma("unroll(4)")                                                        \
        for (i = 0; i < nvec; ++i) {                                                \
            HVX_VectorPair p = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vsrc[i]), one); \
            vdst[i*2]   = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(p));                        \
            vdst[i*2+1] = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(p));                        \
        }                                                                           \
                                                                                    \
        if (nloe) {                                                                 \
            HVX_VectorPair p = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vsrc[i]), one); \
                                                                                    \
            HVX_Vector vd = Q6_V_lo_W(p);                                           \
            i = 2 * i;                                                              \
                                                                                    \
            if (nloe >= 32) {                                                       \
                vdst[i] = Q6_Vsf_equals_Vqf32(vd);                                  \
                nloe -= 32; ++i; vd = Q6_V_hi_W(p);                                 \
            }                                                                       \
                                                                                    \
            if (nloe) {                                                             \
                vd = Q6_Vsf_equals_Vqf32(vd);                                       \
                hvx_vec_store_u(&vdst[i], nloe * sizeof(float), vd);                \
            }                                                                       \
        }                                                                           \
    } while(0)
220
221// copy/convert n fp16 elements into n fp32 elements : source is aligned, destination is aligned
// Convert/copy n fp16 elements into n fp32 elements.
// Both dst and src must be 128-byte aligned (asserted).
static inline void hvx_copy_f32_f16_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    assert((unsigned long) dst % 128 == 0);
    assert((unsigned long) src % 128 == 0);
    hvx_copy_f32_f16_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a);
}
227
228// copy/convert n fp16 elements into n fp32 elements : source is unaligned, destination is aligned
// Convert/copy n fp16 elements into n fp32 elements.
// dst must be 128-byte aligned (asserted); src may be unaligned.
static inline void hvx_copy_f32_f16_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    assert((unsigned long) dst % 128 == 0);
    hvx_copy_f32_f16_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a);
}
233
234// copy/convert n fp16 elements into n fp32 elements : source is aligned, destination is unaligned
// Convert/copy n fp16 elements into n fp32 elements.
// src must be 128-byte aligned (asserted); dst may be unaligned.
static inline void hvx_copy_f32_f16_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    assert((unsigned long) src % 128 == 0);
    hvx_copy_f32_f16_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u);
}
239
240// copy/convert n fp16 elements into n fp32 elements : source is unaligned, destination is unaligned
// Convert/copy n fp16 elements into n fp32 elements.
// Both dst and src may be unaligned.
static inline void hvx_copy_f32_f16_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) {
    hvx_copy_f32_f16_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u);
}
244
245#endif // HVX_COPY_H