summary | refs | log | tree | commit | diff
path: root/llama.cpp/src/llama-model-loader.h
diff options
context:
space:
mode:
Diffstat (limited to 'llama.cpp/src/llama-model-loader.h')
-rw-r--r-- llama.cpp/src/llama-model-loader.h 176
1 files changed, 176 insertions, 0 deletions
diff --git a/llama.cpp/src/llama-model-loader.h b/llama.cpp/src/llama-model-loader.h
new file mode 100644
index 0000000..65953dd
--- /dev/null
+++ b/llama.cpp/src/llama-model-loader.h
@@ -0,0 +1,176 @@
+#pragma once
+
+#include "llama.h"
+
+#include "llama-impl.h"
+#include "llama-arch.h"
+#include "llama-mmap.h"
+
+#include "ggml-cpp.h"
+
+#include <cstddef>
+#include <map>
+#include <stdexcept>
+#include <unordered_map>
+
+using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>; // NOTE(review): the uint32_t key is presumably a file/context index — confirm at the call sites of load_all_data
+
+enum llama_fver { // file format version tags; each enumerator's value equals the version number in its name
+ GGUF_FILE_VERSION_V1 = 1,
+ GGUF_FILE_VERSION_V2 = 2,
+ GGUF_FILE_VERSION_V3 = 3,
+};
+
+const char * llama_file_version_name(llama_fver version); // declaration only; NOTE(review): presumably returns a printable name for `version` — confirm against the definition
+
+struct llama_model_loader { // loader state plus accessors for model metadata (key/values) and tensor weights; member functions are only declared here
+ // Holds information on a model weight
+ struct llama_tensor_weight {
+ uint16_t idx; // source file index
+ size_t offs; // tensor data offset in the original file
+ // raw pointer to the tensor this entry describes; this struct declares no destructor, so it never frees it
+ ggml_tensor * tensor;
+ // Resolves `tensor` by name in gguf_ctx and stores its data offset in `offs`; throws std::runtime_error when the name is unknown or the data range does not fit inside `file`
+ llama_tensor_weight(const llama_file * file, uint16_t idx, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
+ const int tensor_idx = gguf_find_tensor(gguf_ctx, ggml_get_name(tensor));
+ if (tensor_idx < 0) { // a negative index is treated as "not found"
+ throw std::runtime_error(format("tensor '%s' not found in the model", ggml_get_name(tensor)));
+ }
+
+ offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx); // data-section offset plus this tensor's offset, both as reported by the gguf API
+ if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size()) { // first test guards against the sum wrapping around; second rejects data ending past file->size()
+ throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", ggml_get_name(tensor)));
+ }
+ }
+ };
+
+ // custom comparator to sort weights more nicely by layer
+ struct weight_name_comparer {
+ bool operator()(const std::string & a, const std::string & b) const {
+ int a_layer = -1; // stays -1 when the name does not start with "blk.<int>."
+ int b_layer = -1;
+ sscanf(a.c_str(), "blk.%d.", &a_layer); // return value is not checked; on no match the -1 default remains
+ sscanf(b.c_str(), "blk.%d.", &b_layer);
+ if (a_layer != b_layer) {
+ return a_layer < b_layer; // lower layer number first; names without a layer prefix (-1) come before layer 0
+ }
+ return a < b; // same layer value: fall back to plain string comparison
+ }
+ };
+ // bit flags (distinct powers of two); NOTE(review): presumably combined into the `flags` argument of create_tensor — confirm against the definition
+ static const int TENSOR_NOT_REQUIRED = 1 << 0;
+ static const int TENSOR_DUPLICATED = 1 << 1;
+ static const int TENSOR_SKIP = 1 << 2;
+
+ int n_kv = 0;
+ int n_tensors = 0;
+ int n_created = 0; // NOTE(review): presumably counts create_tensor calls and is checked by done_getting_tensors — confirm
+
+ uint64_t n_elements = 0;
+ size_t n_bytes = 0;
+
+ bool use_mmap = false;
+ bool use_direct_io = false;
+ bool check_tensors; // no default initializer, unlike the two flags above; expected to be set by the constructor (definition not shown)
+ bool no_alloc; // likewise has no default initializer
+
+ llama_files files;
+ llama_ftype ftype; // no default initializer
+ llama_fver fver; // no default initializer
+
+ llama_mmaps mappings;
+
+ std::map<std::string, llama_tensor_weight, weight_name_comparer> weights_map; // ordered by weight_name_comparer: parsed layer number first, then string order
+ std::unordered_map<std::string, llama_model_kv_override> kv_overrides;
+ const llama_model_tensor_buft_override * tensor_buft_overrides; // raw pointer without a default initializer; NOTE(review): presumably set from the constructor argument of similar name — confirm
+
+ gguf_context_ptr meta;
+ std::vector<ggml_context_ptr> contexts;
+
+ std::string arch_name;
+ LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
+
+ size_t size_done = 0;
+ size_t size_data = 0;
+ std::vector<std::pair<size_t, size_t>> mmaps_used;
+ // Constructor, defined out of line; note that `splits` is taken by non-const reference
+ llama_model_loader(
+ const std::string & fname,
+ std::vector<std::string> & splits, // optional, only needed if the split does not follow the naming scheme
+ bool use_mmap,
+ bool use_direct_io,
+ bool check_tensors,
+ bool no_alloc,
+ const llama_model_kv_override * param_overrides_p,
+ const llama_model_tensor_buft_override * param_tensor_buft_overrides_p);
+ // The two get_arr_n overloads take part in overload resolution only for integral T (std::enable_if); NOTE(review): presumably they store the array length for the key in `result` — confirm
+ template<typename T>
+ typename std::enable_if<std::is_integral<T>::value, bool>::type
+ get_arr_n(const std::string & key, T & result, bool required = true);
+
+ template<typename T>
+ typename std::enable_if<std::is_integral<T>::value, bool>::type
+ get_arr_n(enum llm_kv kid, T & result, bool required = true);
+ // get_arr: overloads for std::vector and fixed-size std::array destinations; NOTE(review): the meaning of `required` and of the bool result is defined out of line — presumably "throw if missing" and "found" — confirm
+ template<typename T>
+ bool get_arr(const std::string & key, std::vector<T> & result, bool required = true);
+
+ template<typename T, size_t N_MAX>
+ bool get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required = true);
+
+ template<typename T>
+ bool get_arr(enum llm_kv kid, T & result, bool required = true);
+ // get_key: addressed either by literal key string or by llm_kv enum value
+ template<typename T>
+ bool get_key(const std::string & key, T & result, bool required = true);
+
+ template<typename T>
+ bool get_key(enum llm_kv kid, T & result, bool required = true);
+ // get_key_or_arr: takes an expected count `n`; NOTE(review): presumably accepts either a single value or an array of n values — confirm
+ template<typename T, size_t N_MAX>
+ bool get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required = true);
+
+ template<typename T>
+ bool get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required = true);
+
+ bool get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required = true);
+
+ std::string get_arch_name() const;
+
+ enum llm_arch get_arch() const;
+ // get_weight returns a pointer while require_weight returns a reference; NOTE(review): presumably nullptr vs. an exception when the name is missing — confirm
+ const llama_tensor_weight * get_weight(const char * name) const;
+
+ const llama_tensor_weight & require_weight(const char * name) const;
+
+ struct ggml_tensor * get_tensor_meta(const char * name) const;
+
+ struct ggml_tensor * require_tensor_meta(const std::string & name) const;
+
+ const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const;
+
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags = 0);
+
+ struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required = true);
+
+ void done_getting_tensors() const;
+
+ void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr);
+
+ void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const;
+
+ // for backwards compatibility, does not support ggml-backend
+ void load_data_for(struct ggml_tensor * cur) const;
+
+ // Returns false if cancelled by progress_callback
+ bool load_all_data(
+ struct ggml_context * ctx,
+ llama_buf_map & bufs,
+ llama_mlocks * lmlocks,
+ llama_progress_callback progress_callback,
+ void * progress_callback_user_data);
+
+ std::string ftype_name() const;
+
+ void print_info() const;
+};