1#include "set_rows.hpp"
2#include "cpy.hpp"
3
4namespace utils {
5template<typename T>
6static constexpr bool is_arithmetic_v() {
7 return std::is_arithmetic_v<T> || std::is_same_v<T, sycl::half> || std::is_same_v<T, sycl::ext::oneapi::bfloat16>;
8}
9}
10
11template<typename TIn, typename TOut>
12static inline std::enable_if_t<utils::is_arithmetic_v<TIn>() && utils::is_arithmetic_v<TOut>(), void>
13convert (const char* src, char* dst) {
14 auto src_val = *reinterpret_cast<const TIn*>(src);
15 auto dst_val = sycl::vec<TIn, 1>(src_val).template convert<TOut, sycl::rounding_mode::automatic>()[0];
16 *reinterpret_cast<TOut*>(dst) = dst_val;
17}
18
19template <typename TIdx, typename blockType, int qk, cpy_kernel_t cpyblck>
20static void set_rows_sycl_q(const char * __restrict__ src0_d,
21 const TIdx * __restrict__ src1_d,
22 blockType * __restrict__ dst_d,
23 // tensor dimensions src0 and src1
24 const int64_t ne00,
25 const int64_t ne01,
26 const int64_t ne02,
27 const int64_t ne03,
28 const int64_t ne10,
29 const int64_t ne11,
30 const int64_t ne12,
31 const int64_t ne13,
32 // strides for src0
33 const size_t nb00,
34 const size_t nb01,
35 const size_t nb02,
36 const size_t nb03,
37 // strides for src1
38 const size_t nb10,
39 const size_t nb11,
40 const size_t nb12,
41 const size_t nb13,
42 // strides for dst
43 const size_t nb1,
44 const size_t nb2,
45 const size_t nb3,
46 queue_ptr stream) {
47 const int64_t total_blocks = (ne00 * ne01 * ne02 * ne03) / qk;
48 constexpr int block_size = 256;
49 const int64_t grid_size = ceil_div(total_blocks, block_size);
50
51 stream->parallel_for(sycl::nd_range<1>(grid_size * block_size, block_size), [=](sycl::nd_item<1> item_ct1) {
52 const int64_t i = item_ct1.get_global_linear_id();
53 if (i >= total_blocks) {
54 return;
55 }
56 const int64_t i_base = i * qk;
57 const int64_t i03 = i_base / (ne00 * ne01 * ne02);
58 const int64_t rem1 = i_base - i03 * (ne00 * ne01 * ne02);
59 const int64_t i02 = rem1 / (ne00 * ne01);
60 const int64_t rem2 = rem1 - i02 * ne00 * ne01;
61 const int64_t i01 = rem2 / ne00;
62 const int64_t i00 = rem2 - i01 * ne00;
63 const int64_t i12 = i03 % ne12;
64 const int64_t i11 = i02 % ne11;
65 const int64_t i10 = i01;
66 const size_t src_offset = calculate_offset<3>({ nb01, nb02, nb03 }, { i01, i02, i03 });
67 const char * src_block = src0_d + src_offset + i00 * sizeof(float);
68 const size_t src1_offset = calculate_offset<3>({ nb10, nb11, nb12 }, { i10, i11, i12 });
69 const int64_t dst_row = src1_d[src1_offset / sizeof(TIdx)];
70 const size_t dst_offset =
71 calculate_offset<3>({ nb1, nb2, nb3 }, { dst_row, i02, i03 }) + (i00 / qk) * sizeof(blockType);
72 char * dst_block = reinterpret_cast<char *>(reinterpret_cast<char *>(dst_d) + dst_offset);
73 cpyblck(src_block, dst_block);
74 });
75 GGML_UNUSED(ne10);
76 GGML_UNUSED(ne13);
77 GGML_UNUSED(nb00);
78 GGML_UNUSED(nb13);
79}
80
81template<typename TIn, typename TIdx, typename TOut>
82static void k_set_rows(
83 const char * __restrict__ src0, const TIdx * __restrict__ src1, char * __restrict__ dst,
84 const int64_t ne00, const int64_t ne01, const int64_t ne02,
85 const int64_t ne11, const int64_t ne12,
86 const size_t nb01, const size_t nb02, const size_t nb03,
87 const size_t nb10, const size_t nb11, const size_t nb12,
88 const size_t nb1, const size_t nb2, const size_t nb3,
89 const size_t src_type_size, const size_t dst_type_size,
90 const int64_t total_elements,
91 const sycl::nd_item<1> & item_ct1) {
92
93 const int64_t i = item_ct1.get_global_linear_id();
94 if (i >= total_elements) {
95 return;
96 }
97
98 const int64_t i03 = i / (ne00 * ne01 * ne02);
99 const int64_t i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
100 const int64_t i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01) / ne00;
101 const int64_t i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01 - i01 * ne00;
102
103 const int64_t i12 = i03 % ne12;
104 const int64_t i11 = i02 % ne11;
105 const int64_t i10 = i01;
106
107 const int64_t dst_row = *(const TIdx *)((const char *)src1 + calculate_offset<3>({nb10, nb11, nb12}, {i10, i11, i12}));
108
109 const char * src0_row = src0 + calculate_offset<3>({nb01, nb02, nb03}, {i01, i02, i03});
110 const char * src_elem = src0_row + i00 * src_type_size;
111 char * dst_row_ptr = dst + dst_row*nb1 + i02*nb2 + i03*nb3;
112 char * dst_elem = dst_row_ptr + i00 * dst_type_size;
113
114 convert<TIn, TOut>(src_elem, dst_elem);
115}
116
117template<typename TIn, typename TIdx, typename TOut>
118static void set_rows_sycl(
119 const char * src0_d, const TIdx * src1_d, char * dst_d,
120 const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
121 const int64_t ne11, const int64_t ne12, const size_t nb01, const size_t nb02, const size_t nb03,
122 const size_t nb10, const size_t nb11, const size_t nb12,
123 const size_t nb1, const size_t nb2, const size_t nb3,
124 const size_t src_type_size, const size_t dst_type_size,
125 queue_ptr stream) {
126
127 const int64_t total_elements = ne00 * ne01 * ne02 * ne03;
128
129 constexpr int block_size = 64;
130 const int64_t grid_size = ceil_div(total_elements, block_size);
131
132 stream->parallel_for(
133 sycl::nd_range<1>(grid_size * block_size, block_size),
134 [=](sycl::nd_item<1> item_ct1) {
135 k_set_rows<TIn, TIdx, TOut>(
136 src0_d, src1_d, dst_d,
137 ne00, ne01, ne02,
138 ne11, ne12,
139 nb01, nb02, nb03,
140 nb10, nb11, nb12,
141 nb1, nb2, nb3,
142 src_type_size, dst_type_size,
143 total_elements,
144 item_ct1
145 );
146 }
147 );
148}
149
150template<typename TIn, typename TIdx>
151static void set_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
152 const char * src0_d = (const char *)src0->data;
153 const TIdx * src1_d = (const TIdx *)src1->data;
154
155 GGML_TENSOR_BINARY_OP_LOCALS
156
157 dpct::queue_ptr stream = ctx.stream();
158 switch (dst->type) {
159 case GGML_TYPE_F32:
160 set_rows_sycl<TIn, TIdx, float>(
161 src0_d, src1_d, (char *)dst->data,
162 ne00, ne01, ne02, ne03,
163 ne11, ne12,
164 nb01, nb02, nb03,
165 nb10, nb11, nb12,
166 nb1, nb2, nb3,
167 sizeof(TIn), sizeof(float),
168 stream
169 );
170 break;
171 case GGML_TYPE_F16:
172 dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
173 set_rows_sycl<TIn, TIdx, sycl::half>(
174 src0_d, src1_d, (char *)dst->data,
175 ne00, ne01, ne02, ne03,
176 ne11, ne12,
177 nb01, nb02, nb03,
178 nb10, nb11, nb12,
179 nb1, nb2, nb3,
180 sizeof(TIn), sizeof(sycl::half),
181 stream
182 );
183 break;
184 case GGML_TYPE_BF16:
185 set_rows_sycl<TIn, TIdx, sycl::ext::oneapi::bfloat16>(
186 src0_d, src1_d, (char *)dst->data,
187 ne00, ne01, ne02, ne03,
188 ne11, ne12,
189 nb01, nb02, nb03,
190 nb10, nb11, nb12,
191 nb1, nb2, nb3,
192 sizeof(TIn), sizeof(sycl::ext::oneapi::bfloat16),
193 stream
194 );
195 break;
196 case GGML_TYPE_Q8_0:
197 set_rows_sycl_q<TIdx, block_q8_0, QK8_0, cpy_blck_f32_q8_0>(src0_d, src1_d, (block_q8_0 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
198 break;
199 case GGML_TYPE_Q5_1:
200 set_rows_sycl_q<TIdx, block_q5_1, QK5_1, cpy_blck_f32_q5_1>(src0_d, src1_d, (block_q5_1 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
201 break;
202 case GGML_TYPE_Q5_0:
203 set_rows_sycl_q<TIdx, block_q5_0, QK5_0, cpy_blck_f32_q5_0>(src0_d, src1_d, (block_q5_0 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
204 break;
205 case GGML_TYPE_Q4_1:
206 set_rows_sycl_q<TIdx, block_q4_1, QK4_1, cpy_blck_f32_q4_1>(src0_d, src1_d, (block_q4_1 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
207 break;
208 case GGML_TYPE_Q4_0:
209 set_rows_sycl_q<TIdx, block_q4_0, QK4_0, cpy_blck_f32_q4_0>(src0_d, src1_d, (block_q4_0 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
210 break;
211 case GGML_TYPE_IQ4_NL:
212 set_rows_sycl_q<TIdx, block_iq4_nl, QK4_NL, cpy_blck_f32_iq4_nl>(src0_d, src1_d, (block_iq4_nl *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream);
213 break;
214
215 default:
216 GGML_ABORT("Unsupported tensor type!");
217 break;
218 }
219}
220
221void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
222 scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
223 const ggml_tensor * src0 = dst->src[0];
224 const ggml_tensor * src1 = dst->src[1];
225
226 GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
227 GGML_ASSERT(dst->src[1]->type == GGML_TYPE_I64 || dst->src[1]->type == GGML_TYPE_I32);
228
229 if (src1->type == GGML_TYPE_I64) {
230 set_rows_sycl<float, int64_t>(ctx, src0, src1, dst);
231 } else {
232 set_rows_sycl<float, int32_t>(ctx, src0, src1, dst);
233 }
234}