1#ifndef HVX_SCALE_H
  2#define HVX_SCALE_H
  3
  4#include <assert.h>
  5#include <stddef.h>
  6#include <stdint.h>
  7
  8#include "hvx-base.h"
  9
 10#define hvx_scale_f32_loop_body(dst_type, src_type, vec_store)                       \
 11    do {                                                                             \
 12        dst_type * restrict vdst = (dst_type *) dst;                                 \
 13        src_type * restrict vsrc = (src_type *) src;                                 \
 14                                                                                     \
 15        HVX_Vector vs = hvx_vec_splat_f32(scale);                                    \
 16                                                                                     \
 17        const uint32_t elem_size = sizeof(float);                                    \
 18        const uint32_t epv = 128 / elem_size;                                        \
 19        const uint32_t nvec = n / epv;                                               \
 20        const uint32_t nloe = n % epv;                                               \
 21                                                                                     \
 22        uint32_t i = 0;                                                              \
 23                                                                                     \
 24        _Pragma("unroll(4)")                                                         \
 25        for (; i < nvec; ++i) {                                                      \
 26            HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs);                        \
 27            vdst[i]      = Q6_Vsf_equals_Vqf32(v);                                   \
 28        }                                                                            \
 29        if (nloe) {                                                                  \
 30            HVX_Vector v = Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs);                        \
 31            vec_store((void *) &vdst[i], nloe * elem_size, Q6_Vsf_equals_Vqf32(v));  \
 32        }                                                                            \
 33    } while(0)
 34
 35static inline void hvx_scale_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) {
 36    assert((size_t) dst % 128 == 0);
 37    assert((size_t) src % 128 == 0);
 38    hvx_scale_f32_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a);
 39}
 40
 41static inline void hvx_scale_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) {
 42    assert((size_t) dst % 128 == 0);
 43    hvx_scale_f32_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a);
 44}
 45
 46static inline void hvx_scale_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) {
 47    assert((size_t) src % 128 == 0);
 48    hvx_scale_f32_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u);
 49}
 50
 51static inline void hvx_scale_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) {
 52    hvx_scale_f32_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u);
 53}
 54
 55static inline void hvx_scale_f32(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale) {
 56    if (((size_t) dst & 127) == 0) {
 57        if (((size_t) src & 127) == 0) {
 58            hvx_scale_f32_aa(dst, src, n, scale);
 59        } else {
 60            hvx_scale_f32_au(dst, src, n, scale);
 61        }
 62    } else {
 63        if (((size_t) src & 127) == 0) {
 64            hvx_scale_f32_ua(dst, src, n, scale);
 65        } else {
 66            hvx_scale_f32_uu(dst, src, n, scale);
 67        }
 68    }
 69}
 70
 71#define hvx_scale_offset_f32_loop_body(dst_type, src_type, vec_store)                \
 72    do {                                                                             \
 73        dst_type * restrict vdst = (dst_type *) dst;                                 \
 74        src_type * restrict vsrc = (src_type *) src;                                 \
 75                                                                                     \
 76        HVX_Vector vs = hvx_vec_splat_f32(scale);                                    \
 77        HVX_Vector vo = hvx_vec_splat_f32(offset);                                   \
 78                                                                                     \
 79        const uint32_t elem_size = sizeof(float);                                    \
 80        const uint32_t epv = 128 / elem_size;                                        \
 81        const uint32_t nvec = n / epv;                                               \
 82        const uint32_t nloe = n % epv;                                               \
 83                                                                                     \
 84        uint32_t i = 0;                                                              \
 85                                                                                     \
 86        _Pragma("unroll(4)")                                                         \
 87        for (; i < nvec; ++i) {                                                      \
 88            HVX_Vector v = Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs), vo); \
 89            vdst[i] = Q6_Vsf_equals_Vqf32(v);                                        \
 90        }                                                                            \
 91        if (nloe) {                                                                  \
 92            HVX_Vector v = Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(vsrc[i], vs), vo); \
 93            vec_store((void *) &vdst[i], nloe * elem_size, Q6_Vsf_equals_Vqf32(v));  \
 94        }                                                                            \
 95    } while(0)
 96
 97static inline void hvx_scale_offset_f32_aa(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) {
 98    assert((size_t) dst % 128 == 0);
 99    assert((size_t) src % 128 == 0);
100    hvx_scale_offset_f32_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a);
101}
102
103static inline void hvx_scale_offset_f32_au(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) {
104    assert((size_t) dst % 128 == 0);
105    hvx_scale_offset_f32_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a);
106}
107
108static inline void hvx_scale_offset_f32_ua(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) {
109    assert((size_t) src % 128 == 0);
110    hvx_scale_offset_f32_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u);
111}
112
113static inline void hvx_scale_offset_f32_uu(uint8_t * restrict dst, const uint8_t * restrict src, const int n, const float scale, const float offset) {
114    hvx_scale_offset_f32_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u);
115}
116
// dst[k] = src[k] * scale + offset for n floats.
// Entry point that inspects the low 7 address bits of dst and src and forwards
// to the matching variant: first suffix letter describes dst, second describes
// src ('a' = multiple of 128, 'u' = anything else).
static inline void hvx_scale_offset_f32(uint8_t * restrict       dst,
                                        const uint8_t * restrict src,
                                        const int                n,
                                        const float              scale,
                                        const float              offset) {
    const int dst_is_aligned = ((size_t) dst & 127) == 0;
    const int src_is_aligned = ((size_t) src & 127) == 0;

    if (dst_is_aligned && src_is_aligned) {
        hvx_scale_offset_f32_aa(dst, src, n, scale, offset);
        return;
    }
    if (dst_is_aligned) {
        hvx_scale_offset_f32_au(dst, src, n, scale, offset);
        return;
    }
    if (src_is_aligned) {
        hvx_scale_offset_f32_ua(dst, src, n, scale, offset);
        return;
    }
    hvx_scale_offset_f32_uu(dst, src, n, scale, offset);
}
132
133#endif // HVX_SCALE_H