#pragma once

#include "llama.h"

#include "llama-impl.h"
#include "llama-arch.h"
#include "llama-mmap.h"

#include "ggml-cpp.h"

#include <cstddef>
#include <map>
#include <stdexcept>
#include <unordered_map>

using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;

enum llama_fver {
    GGUF_FILE_VERSION_V1 = 1,
    GGUF_FILE_VERSION_V2 = 2,
    GGUF_FILE_VERSION_V3 = 3,
};

const char * llama_file_version_name(llama_fver version);

struct llama_model_loader {
    // Holds information on a model weight
    struct llama_tensor_weight {
        uint16_t  idx; // source file index
        size_t   offs; // tensor data offset in the original file

        ggml_tensor * tensor;

        llama_tensor_weight(const llama_file * file, uint16_t idx, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
            const int tensor_idx = gguf_find_tensor(gguf_ctx, ggml_get_name(tensor));
            if (tensor_idx < 0) {
                throw std::runtime_error(format("tensor '%s' not found in the model", ggml_get_name(tensor)));
            }

            // absolute offset of the tensor data within the file; reject offsets that
            // overflow or point past the end of the file
            offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
            if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size()) {
                throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", ggml_get_name(tensor)));
            }
        }
    };

    // custom comparator to sort weights more nicely by layer
    struct weight_name_comparer {
        bool operator()(const std::string & a, const std::string & b) const {
            int a_layer = -1;
            int b_layer = -1;
            sscanf(a.c_str(), "blk.%d.", &a_layer);
            sscanf(b.c_str(), "blk.%d.", &b_layer);
            if (a_layer != b_layer) {
                return a_layer < b_layer;
            }
            return a < b;
        }
    };

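    // For example, the comparator above orders "blk.2.attn_q.weight" before
    // "blk.10.ffn_up.weight" by comparing the parsed layer indices (2 < 10),
    // whereas plain lexicographic comparison would put "blk.10..." first.
    // Names that do not match "blk.%d." (e.g. "token_embd.weight", "output.weight")
    // keep layer -1, so they sort before all block tensors and fall back to plain
    // string comparison among themselves.
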
    static const int TENSOR_NOT_REQUIRED = 1 << 0;
    static const int TENSOR_DUPLICATED   = 1 << 1;
    static const int TENSOR_SKIP         = 1 << 2;

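    // The TENSOR_* values are bit flags for the `flags` parameter of create_tensor()
    // below and can be OR-combined, e.g. (illustrative only):
    //
    //     int flags = TENSOR_NOT_REQUIRED | TENSOR_DUPLICATED;
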
    int n_kv      = 0;
    int n_tensors = 0;
    int n_created = 0;

    uint64_t n_elements = 0;
    size_t   n_bytes    = 0;

    bool use_mmap = false;
    bool use_direct_io = false;
    bool check_tensors;
    bool no_alloc;

    llama_files files;
    llama_ftype ftype;
    llama_fver  fver;

    llama_mmaps mappings;

    std::map<std::string, llama_tensor_weight, weight_name_comparer> weights_map;
    std::unordered_map<std::string, llama_model_kv_override> kv_overrides;
    const llama_model_tensor_buft_override * tensor_buft_overrides;

    gguf_context_ptr meta;
    std::vector<ggml_context_ptr> contexts;

    std::string arch_name;
    LLM_KV      llm_kv    = LLM_KV(LLM_ARCH_UNKNOWN);

    size_t size_done = 0;
    size_t size_data = 0;
    std::vector<std::pair<size_t, size_t>> mmaps_used;

    llama_model_loader(
        const std::string & fname,
        std::vector<std::string> & splits, // optional, only needed if the split files do not follow the default naming scheme
        bool use_mmap,
        bool use_direct_io,
        bool check_tensors,
        bool no_alloc,
        const llama_model_kv_override * param_overrides_p,
        const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);

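    // Minimal construction sketch for the constructor above (illustrative; the file
    // name and variable names are assumptions, not part of this header):
    //
    //     std::vector<std::string> splits; // empty: split files are derived from fname
    //     llama_model_loader ml(
    //             /*fname         =*/ "model-00001-of-00003.gguf",
    //             splits,
    //             /*use_mmap      =*/ true,
    //             /*use_direct_io =*/ false,
    //             /*check_tensors =*/ false,
    //             /*no_alloc      =*/ false,
    //             /*param_overrides_p             =*/ nullptr,
    //             /*param_tensor_buft_overrides_p =*/ nullptr);
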
    template<typename T>
    typename std::enable_if<std::is_integral<T>::value, bool>::type
    get_arr_n(const std::string & key, T & result, bool required = true);

    template<typename T>
    typename std::enable_if<std::is_integral<T>::value, bool>::type
    get_arr_n(enum llm_kv kid, T & result, bool required = true);

    template<typename T>
    bool get_arr(const std::string & key, std::vector<T> & result, bool required = true);

    template<typename T, size_t N_MAX>
    bool get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required = true);

    template<typename T>
    bool get_arr(enum llm_kv kid, T & result, bool required = true);

    template<typename T>
    bool get_key(const std::string & key, T & result, bool required = true);

    template<typename T>
    bool get_key(enum llm_kv kid, T & result, bool required = true);

    template<typename T, size_t N_MAX>
    bool get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required = true);

    template<typename T>
    bool get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required = true);

    bool get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required = true);

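    // Illustrative usage of the getters above (a sketch; the LLM_KV_* keys and the
    // local variables are assumptions based on typical call sites):
    //
    //     uint32_t n_embd = 0;
    //     ml.get_key(LLM_KV_EMBEDDING_LENGTH, n_embd);              // required: throws if the key is missing
    //
    //     float rope_freq_base = 10000.0f;
    //     ml.get_key(LLM_KV_ROPE_FREQ_BASE, rope_freq_base, false); // optional: returns false if missing
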
    std::string get_arch_name() const;

    enum llm_arch get_arch() const;

    const llama_tensor_weight * get_weight(const char * name) const;

    const llama_tensor_weight & require_weight(const char * name) const;

    struct ggml_tensor * get_tensor_meta(const char * name) const;

    struct ggml_tensor * require_tensor_meta(const std::string & name) const;

    const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const;

    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags = 0);

    struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required = true);

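    // Typical create_tensor() usage (a sketch; tensor names and dimension variables
    // are assumptions, not declared in this header):
    //
    //     ggml_tensor * wq = ml.create_tensor(ctx, "blk.0.attn_q.weight", {n_embd, n_embd});
    //     ggml_tensor * up = ml.create_tensor(ctx, "blk.0.ffn_up.weight", {n_embd, n_ff},
    //                                         TENSOR_NOT_REQUIRED); // may return nullptr
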
    void done_getting_tensors() const;

    void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr);

    void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const;

    // for backwards compatibility, does not support ggml-backend
    void load_data_for(struct ggml_tensor * cur) const;

    // Returns false if cancelled by progress_callback
    bool load_all_data(
            struct ggml_context * ctx,
            llama_buf_map & bufs,
            llama_mlocks * lmlocks,
            llama_progress_callback progress_callback,
            void * progress_callback_user_data);

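    // Progress callback sketch (illustrative): a captureless lambda converts to
    // llama_progress_callback; returning false cancels loading, in which case
    // load_all_data() returns false as noted above.
    //
    //     llama_progress_callback cb = [](float progress, void * user_data) {
    //         (void) user_data;
    //         fprintf(stderr, "\rloading: %.0f%%", progress * 100.0f);
    //         return true; // return false to cancel
    //     };
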
    std::string ftype_name() const;

    void print_info() const;
};
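
// Rough load sequence suggested by the interface above (a sketch, not a verbatim
// copy of the actual implementation; object names are placeholders):
//
//     llama_model_loader ml(fname, splits, /*use_mmap=*/true, /*use_direct_io=*/false,
//                           /*check_tensors=*/false, /*no_alloc=*/false, nullptr, nullptr);
//     // 1. read hyperparameters from the GGUF metadata via get_key()/get_arr()
//     // 2. create tensor metadata with create_tensor(), then verify that the
//     //    expected number of tensors was created:
//     ml.done_getting_tensors();
//     // 3. map the input file(s) and stream tensor data into backend buffers:
//     ml.init_mappings();
//     bool ok = ml.load_all_data(ctx, bufs, nullptr, progress_cb, progress_cb_ud);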