summaryrefslogtreecommitdiff
path: root/llama.cpp/ggml/src/ggml-sycl/quants.hpp
diff options
context:
space:
mode:
authorMitja Felicijan <mitja.felicijan@gmail.com>2026-02-12 20:57:17 +0100
committerMitja Felicijan <mitja.felicijan@gmail.com>2026-02-12 20:57:17 +0100
commitb333b06772c89d96aacb5490d6a219fba7c09cc6 (patch)
tree211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/ggml/src/ggml-sycl/quants.hpp
downloadllmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz
Engage!
Diffstat (limited to 'llama.cpp/ggml/src/ggml-sycl/quants.hpp')
-rw-r--r--llama.cpp/ggml/src/ggml-sycl/quants.hpp110
1 files changed, 110 insertions, 0 deletions
diff --git a/llama.cpp/ggml/src/ggml-sycl/quants.hpp b/llama.cpp/ggml/src/ggml-sycl/quants.hpp
new file mode 100644
index 0000000..d0d5ac9
--- /dev/null
+++ b/llama.cpp/ggml/src/ggml-sycl/quants.hpp
@@ -0,0 +1,110 @@
+//
+// MIT license
+// Copyright (C) 2025 Codeplay Software Ltd.
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_QUANTS_HPP
+#define GGML_SYCL_QUANTS_HPP
+
+#include <utility>
+
+#include "ggml-common.h"
+#include "ggml.h"
+
+namespace ggml_sycl_reordered {
+
+// The reordered block moves quants (qs) and scales(d) to two
+// uniform regions of memory that is contiguous in the same tensor.
+// What this means is that instead of having:
+// [d0, qs0] [d1, qs1] [d2, qs2] ... [dN, qsN]
+// We have:
+// [qs0, qs1, qs2, ..., qsN] [d0, d1, d2, ..., dN]
+//
+// Notes: out-of-bounds qs will run into d values
+// Aligment relies on the allocated size of qs
+
+template <ggml_type type> struct block_q_t;
+
+// qk number of weights / quants in a block
+// qr number of weights in a byte (described as 'before dequantization')
+// for quantization types that has low and high bits split, qr is calculated with
+// using the lower bits, e.g for Q6 quants QR6 is 2
+// qi number of 32 bit integers needed to represent all the quants from a block (`qs` field)
+// See ggml-common.h to see how these are calculated
+template <> struct block_q_t<GGML_TYPE_Q4_0> {
+ struct traits {
+ static constexpr uint32_t qk = QK4_0;
+ static constexpr uint32_t qi = QI4_0;
+ static constexpr uint32_t qr = QR4_0;
+ static constexpr uint32_t vdr_mmvq = 2;
+ };
+
+ static constexpr std::pair<int, int> get_block_offset(const int block_index, const int /* nblocks */) {
+ return { block_index * (QK4_0 / QR4_0), 0 };
+ }
+
+ static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
+ return { (ncols / QR4_0 * nrows) + block_index * sizeof(ggml_half), 0 };
+ }
+
+ static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
+};
+
+template <> struct block_q_t<GGML_TYPE_Q4_K> {
+ struct traits {
+ static constexpr uint32_t qk = QK_K;
+ static constexpr uint32_t qi = QI4_K;
+ static constexpr uint32_t qr = QR4_K;
+ static constexpr uint32_t vdr_mmvq = 2;
+ };
+
+ static constexpr std::pair<int, int> get_block_offset(const int block_index, const int /* nblocks */) {
+ return { block_index * (traits::qk / traits::qr), 0 };
+ }
+
+ static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
+ auto nblocks = (nrows * (ncols / QK_K));
+ return { nblocks * (QK_K / 2) + (block_index * K_SCALE_SIZE),
+ (nblocks * QK_K / 2) + (nblocks * K_SCALE_SIZE) + (block_index * sizeof(ggml_half2)) };
+ }
+
+ static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
+};
+
+template <> struct block_q_t<GGML_TYPE_Q6_K> {
+ struct traits {
+ static constexpr uint32_t qk = QK_K;
+ static constexpr uint32_t qi = QI6_K;
+ static constexpr uint32_t qr = QR6_K;
+ static constexpr uint32_t vdr_mmvq = 1;
+ };
+
+ static constexpr std::pair<int, int> get_block_offset(const int block_index, const int n_blocks) {
+ auto low_bits_index = block_index * (QK_K / QR6_K);
+ // the index of high bits it's after all low bits
+ auto high_bits_index = n_blocks * (QK_K / 2) + (block_index * (QK_K / 4));
+ return { low_bits_index, high_bits_index };
+ }
+
+ static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
+ auto nblocks = (nrows * (ncols / QK_K));
+ auto total_qs_bytes = nblocks * (QK_K / 2) + nblocks * (QK_K / 4);
+ auto block_scales = total_qs_bytes + block_index * (QK_K / 16);
+ auto sb_scale = total_qs_bytes + nblocks * (QK_K / 16) + block_index * sizeof(ggml_half);
+ return { block_scales, sb_scale };
+ }
+
+ static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
+};
+
+} // namespace ggml_sycl_reordered
+
+#endif // GGML_SYCL_QUANTS_HPP