//
// MIT license
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: MIT
//

//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//

#ifndef GGML_SYCL_COMMON_HPP
#define GGML_SYCL_COMMON_HPP

#include <array>
#include <cstddef>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <string_view>
#include <unordered_map>
#include <vector>

#include "dpct/helper.hpp"
#include "ggml-sycl.h"
#include "presets.hpp"
#include "sycl_hw.hpp"


#if GGML_SYCL_DNNL
#include "dnnl.hpp"
#include "dnnl_sycl.hpp"
#endif

#define GGML_COMMON_DECL_SYCL
#define GGML_COMMON_IMPL_SYCL
/* suppress warning spam */
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wnested-anon-types"
#include "ggml-common.h"
#pragma clang diagnostic pop
#include "ggml-impl.h"

void* ggml_sycl_host_malloc(size_t size);
void ggml_sycl_host_free(void* ptr);


extern int g_ggml_sycl_debug;
extern int g_ggml_sycl_disable_optimize;
extern int g_ggml_sycl_prioritize_dmmv;

#if defined(__clang__) && __has_builtin(__builtin_expect)
// Hint to the optimizer which way a branch is expected to go, so it can lay
// out the likely path as the fall-through.
# define LIKELY(expr) __builtin_expect(expr, true)
# define UNLIKELY(expr) __builtin_expect(expr, false)
#else
# define LIKELY(expr) (expr)
# define UNLIKELY(expr) (expr)
#endif

#define GGML_SYCL_DEBUG(...) \
    do { \
        if (UNLIKELY(g_ggml_sycl_debug)) \
            fprintf(stderr, __VA_ARGS__); \
    } while (0)

#define CHECK_TRY_ERROR(expr) \
    [&]() { \
        try { \
            expr; \
            return dpct::success; \
        } catch (std::exception const& e) { \
            std::cerr << e.what() << "\nException caught at file:" << __FILE__ \
                      << ", line:" << __LINE__ << ", func:" << __func__ \
                      << std::endl; \
            return dpct::default_error; \
        } \
    }()


#define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP
#define VER_4VEC 610      // TODO: for hardware-specific optimization.
#define VER_GEN9 700      // TODO: for hardware-specific optimization.
#define VER_GEN12 1000000 // TODO: for hardware-specific optimization.
#define VER_GEN13 (VER_GEN12 + 1030) // TODO: for hardware-specific optimization.

#define GGML_SYCL_MAX_NODES 8192 // TODO: adapt to hardware

// define for XMX in Intel GPU
// TODO: XMX is not actually used by this path yet.
#if !defined(GGML_SYCL_FORCE_MMQ)
    #define SYCL_USE_XMX
#endif

// max batch size to use MMQ kernels when tensor cores are available
#define MMQ_MAX_BATCH_SIZE 32

// dmmv = dequantize_mul_mat_vec
#ifndef GGML_SYCL_DMMV_X
#define GGML_SYCL_DMMV_X 32
#endif
#ifndef GGML_SYCL_MMV_Y
#define GGML_SYCL_MMV_Y 1
#endif

typedef sycl::queue *queue_ptr;

enum ggml_sycl_backend_gpu_mode {
    SYCL_UNSET_GPU_MODE = -1,
    SYCL_SINGLE_GPU_MODE = 0,
    SYCL_MUL_GPU_MODE
};

static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");

// Deliberately dereference a null pointer so a debugger or crash handler can
// capture a backtrace at the exact point of failure.
static void crash() {
    int* ptr = NULL;
    *ptr = 0;
}

[[noreturn]] static void ggml_sycl_error(
    const char* stmt,
    const char* func,
    const char* file,
    const int line,
    const char* msg) {
    fprintf(stderr, "SYCL error: %s: %s\n", stmt, msg);
    fprintf(stderr, "  in function %s at %s:%d\n", func, file, line);
    GGML_ABORT("SYCL error");
}

#define SYCL_CHECK(err) \
    do { \
        auto err_ = (err); \
        if (err_ != 0) \
            ggml_sycl_error(#err, __func__, __FILE__, __LINE__, "Exception caught in this line of code."); \
    } while (0)
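
// Typical usage (illustrative sketch, not a specific call site): wrap a
// throwing SYCL/dpct call in CHECK_TRY_ERROR to turn exceptions into an error
// code, then let SYCL_CHECK abort with file/line context on failure, e.g.
//   SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst, src, size).wait()));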

#if DPCT_COMPAT_RT_VERSION >= 11100
#define GGML_SYCL_ASSUME(x) __builtin_assume(x)
#else
#define GGML_SYCL_ASSUME(x)
#endif // DPCT_COMPAT_RT_VERSION >= 11100

#ifdef GGML_SYCL_F16
typedef sycl::half dfloat; // dequantize float
typedef sycl::half2 dfloat2;
#else
typedef float dfloat; // dequantize float
typedef sycl::float2 dfloat2;
#endif // GGML_SYCL_F16

#define MMVQ_MAX_BATCH_SIZE 8

static int g_all_sycl_device_count = -1;
static bool g_ggml_backend_sycl_buffer_type_initialized = false;

static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode =
    SYCL_UNSET_GPU_MODE;

static void* g_scratch_buffer = nullptr;
static size_t g_scratch_size = 0; // disabled by default
static size_t g_scratch_offset = 0;

[[noreturn]] static inline void bad_arch(const sycl::stream& stream_ct1) {
    stream_ct1 << "ERROR: ggml-sycl was compiled without support for the "
                  "current GPU architecture.\n";
    // __trap();
    std::exit(1);

    (void)bad_arch; // suppress unused function warning
}

int get_current_device_id();

inline dpct::err0 ggml_sycl_set_device(const int device) try {
    int current_device_id;
    SYCL_CHECK(CHECK_TRY_ERROR(current_device_id = get_current_device_id()));

    // GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d,
    // current_device_id=%d\n", device, current_device_id);
    if (device == current_device_id) {
        return 0;
    }

    return CHECK_TRY_ERROR(dpct::select_device(device));
} catch (sycl::exception const& exc) {
    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
              << ", line:" << __LINE__ << std::endl;
    crash();
    std::exit(1);
}

//////////////////////
struct optimize_feature {
    bool reorder=false;
};

struct sycl_device_info {
    int cc;  // compute capability
    int nsm; // number of streaming multiprocessors (CUDA term); maps to the
             // maximum number of compute units on a SYCL device.
    // size_t smpb; // max. shared memory per block
    size_t smpbo; // max. shared memory per block (with opt-in)
    bool vmm;     // virtual memory support
    size_t total_vram;
    // sycl_hw_info hw_info; // device id and arch, currently not used
    optimize_feature opt_feature;
};


struct ggml_sycl_device_info {
    int device_count;

    sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};

    std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};

    int max_work_group_sizes[GGML_SYCL_MAX_DEVICES] = {0};
};

const ggml_sycl_device_info & ggml_sycl_info();

struct ggml_sycl_pool {
    virtual ~ggml_sycl_pool() = default;

    virtual void * alloc(size_t size, size_t * actual_size) = 0;
    virtual void free(void * ptr, size_t size) = 0;
};

template<typename T>
struct ggml_sycl_pool_alloc {
    ggml_sycl_pool * pool = nullptr;
    T * ptr = nullptr;
    size_t actual_size = 0;

    explicit ggml_sycl_pool_alloc(ggml_sycl_pool & pool) : pool(&pool) {
    }

    ggml_sycl_pool_alloc(ggml_sycl_pool & pool, size_t size) : pool(&pool) {
        alloc(size);
    }

    ~ggml_sycl_pool_alloc() {
        if (ptr != nullptr) {
            pool->free(ptr, actual_size);
        }
    }

    // size is in number of elements; frees any existing allocation first
    T * realloc(size_t size) {
        GGML_ASSERT(pool != nullptr);
        if (ptr)
            pool->free(ptr, actual_size);
        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
        return ptr;
    }

    // size is in number of elements
    T * alloc(size_t size) {
        GGML_ASSERT(pool != nullptr);
        GGML_ASSERT(ptr == nullptr);
        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
        return ptr;
    }

    T * alloc(ggml_sycl_pool & pool, size_t size) {
        this->pool = &pool;
        return alloc(size);
    }

    T * get() {
        return ptr;
    }

    ggml_sycl_pool_alloc() = default;
    ggml_sycl_pool_alloc(const ggml_sycl_pool_alloc &) = delete;
    ggml_sycl_pool_alloc(ggml_sycl_pool_alloc &&) = delete;
    ggml_sycl_pool_alloc& operator=(const ggml_sycl_pool_alloc &) = delete;
    ggml_sycl_pool_alloc& operator=(ggml_sycl_pool_alloc &&) = delete;
};
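
// Illustrative usage (ctx being a ggml_backend_sycl_context, defined below):
// RAII allocation of 256 floats from the context's device pool; the memory is
// returned to the pool when `tmp` goes out of scope.
//   ggml_sycl_pool_alloc<float> tmp(ctx.pool(), 256);
//   float * buf = tmp.get();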

// backend interface

struct ggml_tensor_extra_gpu {
    void* data_device[GGML_SYCL_MAX_DEVICES]; // 1 pointer for each device for split
                                              // tensors
    dpct::event_ptr events[GGML_SYCL_MAX_DEVICES]
                          [GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs
    optimize_feature optimized_feature;
};

void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams={});

namespace sycl_ex = sycl::ext::oneapi::experimental;
struct ggml_backend_sycl_context {
    int device;
    std::string name;
    optimize_feature opt_feature;

    queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };

    explicit ggml_backend_sycl_context(int device) :
        device(device),
        name(GGML_SYCL_NAME + std::to_string(device)) {
        opt_feature = ggml_sycl_info().devices[device].opt_feature;
    }

    // Lazily fetch and cache a queue pointer for (device, stream); currently
    // every slot is backed by the device's default queue.
    queue_ptr stream(int device, int stream) {
        if (qptrs[device][stream] == nullptr) {
            qptrs[device][stream] = &(dpct::get_device(device).default_queue());
        }
        return qptrs[device][stream];
    }

    queue_ptr stream() {
        return stream(device, 0);
    }

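    // oneDNN interop (when GGML_SYCL_DNNL is enabled): a dnnl::engine and a
    // dnnl::stream are created lazily per sycl::queue and cached in
    // engine_map / stream_map below, so repeated lookups on the same queue
    // reuse the same objects.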
#if GGML_SYCL_DNNL
    dnnl::engine make_engine(sycl::queue* q) {
        // Get the device associated with the queue
        sycl::device dev = q->get_device();
        // Get the context associated with the queue
        sycl::context ctx = q->get_context();
        const dnnl::engine eng = dnnl::sycl_interop::make_engine(dev, ctx);
        return eng;
    }

    std::unordered_map<sycl::queue*, dnnl::stream> stream_map;
    std::unordered_map<sycl::queue*, dnnl::engine> engine_map;
    dnnl::stream stream_dnnl(int device, int _stream) {
        auto q = stream(device, _stream);
        return stream_dnnl(q);
    }
    dnnl::engine engine_dnnl(sycl::queue* qptr) {
        auto it = engine_map.find(qptr);
        if (it == engine_map.end()) {
            auto eng = make_engine(qptr);
            engine_map[qptr] = eng;
            return eng;
        } else {
            return it->second;
        }
    }
    dnnl::stream stream_dnnl(sycl::queue* qptr) {
        auto it = stream_map.find(qptr);
        if (it == stream_map.end()) {
            auto eng = engine_dnnl(qptr);
            auto stream = dnnl::sycl_interop::make_stream(eng, *qptr);
            stream_map[qptr] = stream;
            return stream;
        } else {
            return it->second;
        }
    }
    dnnl::stream stream_dnnl() {
        return stream_dnnl(device, 0);
    }
    dnnl::memory get_scratchpad_mem(const dnnl::memory::desc & scratchpad_md,
                                    const dnnl::engine & eng, const queue_ptr q) {
        ggml_sycl_pool_alloc<uint8_t> * pool;
        auto it = scratchpad_map.find(q);
        if (it == scratchpad_map.end()) {
            scratchpad_map[q] = std::make_unique<ggml_sycl_pool_alloc<uint8_t>>(this->pool());
            pool = scratchpad_map[q].get();
        } else {
            pool = it->second.get();
        }

        size_t scratchpad_size = scratchpad_md.get_size();
        if (scratchpad_size > pool->actual_size) {
            pool->realloc(scratchpad_size);
        }
        void * mem_ptr = pool->get();
        return dnnl::memory(scratchpad_md, eng, mem_ptr);
    }
#endif

    // pool
    std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];
    std::unordered_map<sycl::queue *, std::unique_ptr<ggml_sycl_pool_alloc<uint8_t>>> scratchpad_map;

    std::unique_ptr<ggml_sycl_pool> host_pools[GGML_SYCL_MAX_DEVICES];

    static std::unique_ptr<ggml_sycl_pool> new_pool_for_device(queue_ptr qptr, int device);

    static std::unique_ptr<ggml_sycl_pool> new_pool_for_host(queue_ptr qptr, int device);

    ggml_sycl_pool & pool(int device) {
        if (pools[device] == nullptr) {
            pools[device] = new_pool_for_device(stream(device,0), device);
        }
        return *pools[device];
    }

    ggml_sycl_pool & pool() {
        return pool(device);
    }

#ifdef GGML_SYCL_GRAPH
    std::unique_ptr<sycl_ex::command_graph<sycl_ex::graph_state::executable>> exec_graph = nullptr;
#endif

    ggml_sycl_pool & host_pool(int device) {
        if (host_pools[device] == nullptr) {
            host_pools[device] = new_pool_for_host(stream(device, 0), device);
        }
        return *host_pools[device];
    }

    ggml_sycl_pool & host_pool() { return host_pool(device); }
};

// common device functions

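// The warp_reduce_* helpers below perform a butterfly reduction over a
// sub-group: each step XOR-shuffles values between lanes whose ids differ in
// one bit, halving the exchange distance (16, 8, 4, 2, 1 for width 32); after
// log2(width) steps every lane holds the result for the whole sub-group.
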
static __dpct_inline__ float warp_reduce_sum(float x,
                                             const sycl::nd_item<3>& item_ct1) {
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        x += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask);
    }
    return x;
}

static __dpct_inline__ sycl::float2
warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3>& item_ct1) {
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        a.x() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(),
                                                mask);
        a.y() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(),
                                                mask);
    }
    return a;
}

template <int width = WARP_SIZE>
static __dpct_inline__ int warp_reduce_sum(int x) {
    // Note: reduce_over_group spans the whole sub-group; the width template
    // parameter is ignored here.
    return sycl::reduce_over_group(
        sycl::ext::oneapi::this_work_item::get_sub_group(), x, sycl::plus<>());
}

template <int width = WARP_SIZE>
static __dpct_inline__ float warp_reduce_sum(float x) {
#pragma unroll
    for (int offset = width / 2; offset > 0; offset >>= 1) {
        x += dpct::permute_sub_group_by_xor(
            sycl::ext::oneapi::this_work_item::get_sub_group(), x, offset, width);
    }
    return x;
}

template <int width = WARP_SIZE>
static __dpct_inline__ sycl::float2 warp_reduce_sum(sycl::float2 a) {
#pragma unroll
    for (int offset = width / 2; offset > 0; offset >>= 1) {
        a.x() += dpct::permute_sub_group_by_xor(
            sycl::ext::oneapi::this_work_item::get_sub_group(), a.x(), offset,
            width);
        a.y() += dpct::permute_sub_group_by_xor(
            sycl::ext::oneapi::this_work_item::get_sub_group(), a.y(), offset,
            width);
    }
    return a;
}

template <int width = WARP_SIZE>
static __dpct_inline__ sycl::half2 warp_reduce_sum(sycl::half2 a) {
#pragma unroll
    for (int offset = width / 2; offset > 0; offset >>= 1) {
        a = a + dpct::permute_sub_group_by_xor(
                    sycl::ext::oneapi::this_work_item::get_sub_group(), a, offset,
                    width);
    }
    return a;
}

static constexpr int ggml_sycl_get_physical_warp_size() {
    // TODO: needs to change for the old iGPU + dGPU case.
    return WARP_SIZE;
}

template <int width = WARP_SIZE>
static __dpct_inline__ float warp_reduce_max(float x) {
#pragma unroll
    for (int offset = width / 2; offset > 0; offset >>= 1) {
        x = sycl::fmax(x, dpct::permute_sub_group_by_xor(
                              sycl::ext::oneapi::this_work_item::get_sub_group(), x,
                              offset, width));
    }
    return x;
}

static __dpct_inline__ float warp_reduce_max(float x,
                                             const sycl::nd_item<3>& item_ct1) {
#pragma unroll
    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
        x = sycl::fmax(x, dpct::permute_sub_group_by_xor(
                              item_ct1.get_sub_group(), x, mask));
    }
    return x;
}

/* Helper for computing the linear offset of a ggml_tensor given
   per-dimension strides and indices */
template<int N>
__dpct_inline__ size_t calculate_offset(const std::array<int, N> & strides, const std::array<int, N> & indices) {
    size_t offset = 0;
#pragma unroll
    for (int i = 0; i < N; i++) {
        auto index_i = indices[i];
        offset += strides[i] * index_i;
    }
    return offset;
}

// Helper for vector loads of aligned data; aligned_ptr must meet the
// alignment requirement of sycl::vec<Tp, n>, otherwise the cast is UB.
template <typename Tp, int n>
inline sycl::vec<Tp, n> vec_aligned_load(const Tp* aligned_ptr) {
    return *reinterpret_cast<const sycl::vec<Tp, n>*>(aligned_ptr);
}

// Helper for accessing pointers with no warnings
template <typename Tp, int dim>
static __dpct_inline__ Tp* get_pointer(sycl::local_accessor<Tp, dim> acc) {
    return acc.template get_multi_ptr<sycl::access::decorated::no>().get();
}

int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size);

constexpr size_t ceil_div(const size_t m, const size_t n) {
    return (m + n - 1) / n;
}

bool gpu_has_xmx(sycl::device &dev);

template <int N, class T> std::string debug_get_array_str(const std::string & prefix, const T array[N]) {
    if (LIKELY(!g_ggml_sycl_debug)) {
        return "";
    }
    std::stringstream ss;
    ss << prefix << "=[";
    // i + 1 < N avoids the unsigned wrap-around of N - 1 when N == 0
    for (int i = 0; i + 1 < N; ++i) {
        ss << array[i] << ", ";
    }
    if constexpr (N > 0) {
        ss << array[N - 1];
    }
    ss << "]";
    return ss.str();
}

inline std::string debug_get_tensor_str(const std::string &prefix,
        const ggml_tensor *tensor, const std::string &suffix = "") {
    std::stringstream ss;
    if (LIKELY(!g_ggml_sycl_debug)) { return ss.str(); }
    ss << prefix << "=";
    if (tensor) {
        ss << "'" << tensor->name << "':type=" << ggml_type_name(tensor->type);
        ss << debug_get_array_str<GGML_MAX_DIMS>(";ne", tensor->ne);
        ss << debug_get_array_str<GGML_MAX_DIMS>(";nb", tensor->nb);

        if (!ggml_is_contiguous(tensor)) { ss << ";strided"; }
        if (ggml_is_permuted(tensor)) { ss << ";permuted"; }
    } else {
        ss << "nullptr";
    }
    ss << suffix;
    return ss.str();
}

// Use scope_op_debug_print to log operations coming from running a model
struct scope_op_debug_print {
    // string_views are used to avoid the cost of creating and concatenating
    // strings. They must stay alive for as long as this object is alive; in
    // practice scope_op_debug_print is constructed from string literals,
    // which live in constant storage and are therefore always accessible.
    scope_op_debug_print(const std::string_view & func, const std::string_view & func_suffix, const ggml_tensor * dst,
                         std::size_t num_src, const std::string_view & suffix = "") :
        func(func),
        func_suffix(func_suffix) {
        if (LIKELY(!g_ggml_sycl_debug)) {
            return;
        }
        GGML_SYCL_DEBUG("[SYCL][OP] call %s%s:", func.data(), func_suffix.data());
        GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" dst", dst).c_str());
        if (dst) {
            for (std::size_t i = 0; i < num_src; ++i) {
                GGML_SYCL_DEBUG("%s", debug_get_tensor_str("\tsrc" + std::to_string(i), dst->src[i]).c_str());
            }
        }
        GGML_SYCL_DEBUG("%s\n", suffix.data());
    }

    scope_op_debug_print(const std::string_view & func, const ggml_tensor * dst, std::size_t num_src,
                         const std::string_view & suffix = "") :
        scope_op_debug_print(func, "", dst, num_src, suffix) {}

    ~scope_op_debug_print() { GGML_SYCL_DEBUG("[SYCL][OP] call %s%s done\n", func.data(), func_suffix.data()); }

  private:
    std::string_view func;
    std::string_view func_suffix;
};
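
// Illustrative usage at the top of an op implementation (hypothetical site):
//   scope_op_debug_print scope(__func__, dst, /*num_src=*/2);
// This logs "[SYCL][OP] call <func>: ..." on entry and "... done" when the
// scope exits.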

// ALiBi slope for attention head h: m0^(h + 1) for the first n_head_log2
// heads and m1^(2*(h - n_head_log2) + 1) for the remaining ones;
// max_bias <= 0 disables ALiBi (slope 1.0).
static __dpct_inline__ float get_alibi_slope(const float max_bias,
                                             const uint32_t h,
                                             const uint32_t n_head_log2,
                                             const float m0,
                                             const float m1) {
    if (max_bias <= 0.0f) {
        return 1.0f;
    }
    const float base = h < n_head_log2 ? m0 : m1;
    const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;

    return dpct::pow(base, exph);
}

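// Precompute magic numbers for fast division by a fixed (invariant) divisor d,
// in the style of Granlund & Montgomery, "Division by Invariant Integers using
// Multiplication": with L = ceil(log2(d)) and
// mp = floor(2^32 * (2^L - d) / d) + 1, the quotient n / d is recovered as
// (mul_hi(n, mp) + n) >> L. Worked example for d = 3: L = 2,
// mp = floor(2^32 / 3) + 1 = 1431655766; for n = 7, mul_hi(7, mp) = 2 and
// (2 + 7) >> 2 = 2 = 7 / 3.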
static const sycl::uint3 init_fastdiv_values(uint32_t d) {
    GGML_ASSERT(d != 0);

    uint32_t L = 0;
    while (L < 32 && (uint32_t{ 1 } << L) < d) {
        L++;
    }

    uint32_t mp = (uint32_t) ((uint64_t{ 1 } << 32) * ((uint64_t{ 1 } << L) - d) / d + 1);
    return sycl::uint3(mp, L, d);
}


// Quotient n / d for d = fastdiv_values.z(), using the triple precomputed by
// init_fastdiv_values().
static __dpct_inline__ uint32_t fastdiv(uint32_t n, const sycl::uint3 fastdiv_values) {
    const uint32_t hi = sycl::mul_hi<unsigned>(n, fastdiv_values.x());
    return (hi + n) >> fastdiv_values.y();
}


// Returns (n / d, n % d) for d = fastdiv_values.z().
static __dpct_inline__ sycl::uint2 fast_div_modulo(uint32_t n, const sycl::uint3 fastdiv_values) {
    const uint32_t div_val = fastdiv(n, fastdiv_values);
    const uint32_t mod_val = n - div_val * fastdiv_values.z();
    return sycl::uint2(div_val, mod_val);
}

// Byte-wise dot product with accumulate: treats a and b as packed int8x4 and
// returns c + sum_i(a[i] * b[i]) (the SYCL counterpart of CUDA's __dp4a).
static __dpct_inline__ int ggml_sycl_dp4a(const int a, const int b, int c) {
    return dpct::dp4a(a, b, c);
}

// Convert an E8M0 (exponent-only, bias-127) scale to fp32: for x != 0 the
// value is 2^(x - 127), built by placing x in the fp32 exponent field.
// x == 0 encodes 2^-127, which is subnormal in fp32, hence the explicit bit
// pattern 0x00400000.
static __dpct_inline__ float ggml_sycl_e8m0_to_fp32(uint8_t x) {
    uint32_t bits;
    if (x == 0) {
        bits = 0x00400000;
    } else {
        bits = (uint32_t) x << 23;
    }

    float result;
    memcpy(&result, &bits, sizeof(float));
    return result;
}


#endif // GGML_SYCL_COMMON_HPP