llama.cpp/ggml/src/ggml-sycl/quants.hpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110

//
// MIT license
// Copyright (C) 2025 Codeplay Software Ltd.
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: MIT
//

//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//

#ifndef GGML_SYCL_QUANTS_HPP
#define GGML_SYCL_QUANTS_HPP

#include <utility>

#include "ggml-common.h"
#include "ggml.h"

namespace ggml_sycl_reordered {

// The reordered layout moves the quants (qs) and the scales (d) into two
// uniform regions of memory that are contiguous within the same tensor.
// This means that instead of having:
// [d0, qs0] [d1, qs1] [d2, qs2] ... [dN, qsN]
// We have:
// [qs0, qs1, qs2, ..., qsN]  [d0, d1, d2, ..., dN]
//
// Notes: out-of-bounds reads of qs will run into the d values.
// Alignment relies on the allocated size of qs.

// Primary template: declared only, never defined in this header. The explicit
// specializations in this header supply the per-type traits and offset helpers.
template <ggml_type type> struct block_q_t;

// qk: number of weights / quants in a block
// qr: number of weights in a byte (described as 'before dequantization');
//     for quantization types that have their low and high bits split, qr is
//     calculated using the lower bits, e.g. for Q6 quants QR6 is 2
// qi: number of 32-bit integers needed to represent all the quants from a block (`qs` field)
// See ggml-common.h for how these are calculated
template <> struct block_q_t<GGML_TYPE_Q4_0> {
    struct traits {
        static constexpr uint32_t qk       = QK4_0;
        static constexpr uint32_t qi       = QI4_0;
        static constexpr uint32_t qr       = QR4_0;
        static constexpr uint32_t vdr_mmvq = 2;
    };

    static constexpr std::pair<int, int> get_block_offset(const int block_index, const int /* nblocks */) {
        return { block_index * (QK4_0 / QR4_0), 0 };
    }

    static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
        return { (ncols / QR4_0 * nrows) + block_index * sizeof(ggml_half), 0 };
    }

    static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
};

template <> struct block_q_t<GGML_TYPE_Q4_K> {
    // Compile-time parameters of the Q4_K block, taken from the ggml-common.h macros.
    struct traits {
        static constexpr uint32_t qk       = QK_K;
        static constexpr uint32_t qi       = QI4_K;
        static constexpr uint32_t qr       = QR4_K;
        static constexpr uint32_t vdr_mmvq = 2;
    };

    // Offset of the quants of block `block_index` inside the qs region.
    // Each block contributes qk / qr units; the second pair member is always 0.
    static constexpr std::pair<int, int> get_block_offset(const int block_index, const int /* nblocks */) {
        return { block_index * (traits::qk / traits::qr), 0 };
    }

    // Offsets of the two per-block scale entries of block `block_index`.
    //   first:  entry in the region that follows the qs region, K_SCALE_SIZE units per block
    //   second: entry in the region that follows those, sizeof(ggml_half2) units per block
    // The tensor is assumed to hold nrows * (ncols / QK_K) blocks.
    static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
        const auto nblocks = (nrows * (ncols / QK_K));
        // Size of the qs region. Halve QK_K *before* multiplying: `nblocks * QK_K / 2`
        // evaluates as `(nblocks * QK_K) / 2` and overflows int one bit earlier than
        // necessary. Both offsets share this value so they cannot disagree.
        const auto qs_region_size     = nblocks * (QK_K / 2);
        const auto scales_region_size = nblocks * K_SCALE_SIZE;
        return { qs_region_size + (block_index * K_SCALE_SIZE),
                 qs_region_size + scales_region_size + (block_index * sizeof(ggml_half2)) };
    }

    // Ratio between the weights held by one block of this type and QK8_1.
    static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
};

template <> struct block_q_t<GGML_TYPE_Q6_K> {
    struct traits {
        static constexpr uint32_t qk       = QK_K;
        static constexpr uint32_t qi       = QI6_K;
        static constexpr uint32_t qr       = QR6_K;
        static constexpr uint32_t vdr_mmvq = 1;
    };

    static constexpr std::pair<int, int> get_block_offset(const int block_index, const int n_blocks) {
        auto low_bits_index  = block_index * (QK_K / QR6_K);
        // the index of high bits it's after all low bits
        auto high_bits_index = n_blocks * (QK_K / 2) + (block_index * (QK_K / 4));
        return { low_bits_index, high_bits_index };
    }

    static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
        auto nblocks        = (nrows * (ncols / QK_K));
        auto total_qs_bytes = nblocks * (QK_K / 2) + nblocks * (QK_K / 4);
        auto block_scales   = total_qs_bytes + block_index * (QK_K / 16);
        auto sb_scale       = total_qs_bytes + nblocks * (QK_K / 16) + block_index * sizeof(ggml_half);
        return { block_scales, sb_scale };
    }

    static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
};

}  // namespace ggml_sycl_reordered

#endif  // GGML_SYCL_QUANTS_HPP