| field | value | date |
|---|---|---|
| author | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
| committer | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
| commit | b333b06772c89d96aacb5490d6a219fba7c09cc6 (patch) | |
| tree | 211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/src/llama-adapter.cpp | |
| download | llmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz | |
Engage!
Diffstat (limited to 'llama.cpp/src/llama-adapter.cpp')
| mode | path | lines added |
|---|---|---|
| -rw-r--r-- | llama.cpp/src/llama-adapter.cpp | 488 |

1 file changed, 488 insertions(+), 0 deletions(-)
```diff
diff --git a/llama.cpp/src/llama-adapter.cpp b/llama.cpp/src/llama-adapter.cpp
new file mode 100644
index 0000000..d6a5800
--- /dev/null
+++ b/llama.cpp/src/llama-adapter.cpp
@@ -0,0 +1,488 @@
```

```cpp
#include "llama-adapter.h"

#include "llama-impl.h"
#include "llama-mmap.h"
#include "llama-model.h"

#include <map>
#include <cassert>
#include <sstream>
#include <stdexcept>

// vec

ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
    if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
        return nullptr;
    }

    return tensors[il];
}

ggml_tensor * llama_adapter_cvec::apply_to(ggml_context * ctx, ggml_tensor * cur, int il) const {
    ggml_tensor * layer_dir = tensor_for(il);
    if (layer_dir != nullptr) {
        cur = ggml_add(ctx, cur, layer_dir);
    }

    return cur;
}

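// Illustration, not part of the upstream file: during graph construction the
// per-layer hook is a single add at the end of each layer, roughly (variable
// names assumed):
//
//   cur = cvec.apply_to(ctx0, cur, il); // no-op outside [layer_start, layer_end]
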
bool llama_adapter_cvec::init(const llama_model & model) {
    const auto & hparams = model.hparams;

    GGML_ASSERT(tensors.empty());
    GGML_ASSERT(ctxs.empty());
    GGML_ASSERT(bufs.empty());

    // create a context for each buffer type
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            ggml_init_params params = {
                /*.mem_size   =*/ hparams.n_layer*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };

            ggml_context * ctx = ggml_init(params);
            if (!ctx) {
                return nullptr;
            }

            ctx_map[buft] = ctx;
            ctxs.emplace_back(ctx);

            return ctx;
        }

        return it->second;
    };

    // make tensors
    tensors.reserve(hparams.n_layer);
    tensors.push_back(nullptr); // there's never a tensor for layer 0
    for (size_t il = 1; il < hparams.n_layer; il++) {
        ggml_backend_buffer_type_t buft = model.select_buft(il);
        ggml_context * ctx = ctx_for_buft(buft);
        if (!ctx) {
            LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
            return false;
        }
        ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
        tensors.push_back(tensor);
    }

    // allocate tensors / buffers and zero
    bufs.reserve(ctx_map.size());
    for (auto it : ctx_map) {
        ggml_backend_buffer_type_t buft = it.first;
        ggml_context * ctx = it.second;
        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
        if (!buf) {
            LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
            return false;
        }
        ggml_backend_buffer_clear(buf, 0);
        bufs.emplace_back(buf);
    }

    return true;
}

bool llama_adapter_cvec::apply(
        const llama_model & model,
        const float * data,
        size_t len,
        int32_t n_embd,
        int32_t il_start,
        int32_t il_end) {
    const auto & hparams = model.hparams;

    if (data == nullptr) {
        // disable the current control vector (but leave allocated for later)
        layer_start = -1;
        layer_end   = -1;
        return true;
    }

    if (n_embd != (int) hparams.n_embd) {
        LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
        return false;
    }

    if (tensors.empty()) {
        if (!init(model)) {
            return false;
        }
    }

    layer_start = il_start;
    layer_end   = il_end;

    for (size_t il = 1; il < hparams.n_layer; il++) {
        assert(tensors[il] != nullptr);

        const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
        if (off + n_embd <= len) {
            ggml_backend_tensor_set(tensors[il], data + off, 0, n_embd * ggml_element_size(tensors[il]));
        }
    }

    return true;
}

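// Illustration, not part of the upstream file: host-side usage sketch. It
// assumes the llama_apply_adapter_cvec() wrapper in llama.h routes here; the
// buffer layout is inferred from the offset math above (no entry for layer 0):
//
//   std::vector<float> dirs((n_layer - 1) * n_embd); // one direction per layer
//   // ... fill dirs, e.g. from a control-vector GGUF ...
//   llama_apply_adapter_cvec(ctx, dirs.data(), dirs.size(), n_embd, 1, n_layer - 1);
//   llama_apply_adapter_cvec(ctx, nullptr, 0, n_embd, -1, -1); // disable, keep buffers
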
// lora

llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
    const std::string name(w->name);

    const auto pos = ab_map.find(name);
    if (pos != ab_map.end()) {
        return &pos->second;
    }

    return nullptr;
}

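// Illustration, not part of the upstream file: how an A/B pair is consumed at
// graph-build time (scale convention assumed from the alpha metadata loaded
// below):
//
//   // for a base matmul y = W x, each active adapter contributes
//   //   y += scale * (B * (A * x)),  scale = user_scale * alpha / rank
//   // where rank is the shared inner dimension of the A/B pair
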
static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
    LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

    ggml_context * ctx_init;
    gguf_init_params meta_gguf_params = {
        /* .no_alloc = */ true,
        /* .ctx      = */ &ctx_init,
    };

    gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) };
    if (!ctx_gguf) {
        throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora));
    }

    ggml_context_ptr ctx { ctx_init };

    // check metadata
    {
        const gguf_context * gguf_ctx = ctx_gguf.get();

        LLAMA_LOG_INFO("%s: Dumping metadata keys/values.\n", __func__);

        // get metadata as string
        for (int i = 0; i < gguf_get_n_kv(gguf_ctx); i++) {
            gguf_type type = gguf_get_kv_type(gguf_ctx, i);
            const std::string type_name =
                type == GGUF_TYPE_ARRAY
                ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(gguf_ctx, i)), gguf_get_arr_n(gguf_ctx, i))
                : gguf_type_name(type);
            const char * name       = gguf_get_key(gguf_ctx, i);
            const std::string value = gguf_kv_to_str(gguf_ctx, i);

            if (type != GGUF_TYPE_ARRAY) {
                adapter.gguf_kv.emplace(name, value);
            }

            const size_t MAX_VALUE_LEN = 40;
            std::string print_value = value.size() > MAX_VALUE_LEN ? format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()) : value;
            replace_all(print_value, "\n", "\\n");

            LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), print_value.c_str());
        }

        auto get_kv_str = [&](const std::string & key) -> std::string {
            int id = gguf_find_key(gguf_ctx, key.c_str());
            return id < 0 ? "" : std::string(gguf_get_val_str(gguf_ctx, id));
        };
        auto get_kv_f32 = [&](const std::string & key) -> float {
            int id = gguf_find_key(gguf_ctx, key.c_str());
            return id < 0 ? 0.0f : gguf_get_val_f32(gguf_ctx, id);
        };
        LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);

        auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE));
        if (general_type != "adapter") {
            throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
        }

        auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
        auto general_arch = llm_arch_from_string(general_arch_str);
        if (general_arch != model.arch) {
            throw std::runtime_error("model arch and LoRA arch mismatch");
        }

        auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE));
        if (adapter_type != "lora") {
            throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
        }

        adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));

        // parse alora invocation sequence vector
        const auto & key = llm_kv(LLM_KV_ADAPTER_ALORA_INVOCATION_TOKENS);
        const int kid = gguf_find_key(ctx_gguf.get(), key.c_str());
        if (kid >= 0) {
            if (gguf_get_kv_type(ctx_gguf.get(), kid) != GGUF_TYPE_ARRAY) {
                throw std::runtime_error("invalid gguf type for " + key);
            }
            const auto arr_type = gguf_get_arr_type(ctx_gguf.get(), kid);
            if (arr_type != GGUF_TYPE_UINT32) {
                throw std::runtime_error("invalid gguf element type for " + key);
            }
            const size_t seq_len = gguf_get_arr_n(ctx_gguf.get(), kid);
            const void * data = gguf_get_arr_data(ctx_gguf.get(), kid);
            adapter.alora_invocation_tokens.resize(seq_len);
            std::copy(
                (const llama_token *)data,
                (const llama_token *)data + seq_len,
                adapter.alora_invocation_tokens.begin());
        }
    }

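    // Summary of the metadata contract enforced above; the literal key
    // spellings are assumptions based on the LLM_KV identifiers:
    //
    //   general.type                    = "adapter"
    //   general.architecture            = must match the base model
    //   adapter.type                    = "lora"
    //   adapter.lora.alpha              = float, numerator of the LoRA scale
    //   adapter.alora.invocation_tokens = optional uint32 array (activated LoRA)
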
    int n_tensors = gguf_get_n_tensors(ctx_gguf.get());

    // contexts for each buffer type
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            // add a new context
            ggml_init_params params = {
                /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };
            ggml_context * buft_ctx = ggml_init(params);
            if (!buft_ctx) {
                return nullptr;
            }
            ctx_map[buft] = buft_ctx;
            adapter.ctxs.emplace_back(buft_ctx);
            return buft_ctx;
        };
        return it->second;
    };

    // bundle lora_a and lora_b into pairs
    std::map<std::string, llama_adapter_lora_weight> ab_map;
    auto str_endswith = [](const std::string & str, const std::string & suffix) {
        return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
    };

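    // Pairing illustration (example tensor name assumed): the loop below
    // collapses
    //   "blk.0.attn_q.weight.lora_a" and "blk.0.attn_q.weight.lora_b"
    // into a single entry
    //   ab_map["blk.0.attn_q.weight"] = { a, b }
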
    for (ggml_tensor * cur = ggml_get_first_tensor(ctx.get()); cur; cur = ggml_get_next_tensor(ctx.get(), cur)) {
        std::string name(cur->name);
        if (str_endswith(name, ".lora_a")) {
            replace_all(name, ".lora_a", "");
            if (ab_map.find(name) == ab_map.end()) {
                ab_map[name] = llama_adapter_lora_weight(cur, nullptr);
            } else {
                ab_map[name].a = cur;
            }
        } else if (str_endswith(name, ".lora_b")) {
            replace_all(name, ".lora_b", "");
            if (ab_map.find(name) == ab_map.end()) {
                ab_map[name] = llama_adapter_lora_weight(nullptr, cur);
            } else {
                ab_map[name].b = cur;
            }
        } else if (str_endswith(name, "_norm.weight")) {
            // TODO: add support for norm vector
            // for now, we don't really care because most adapters still work fine without it
            continue;
        } else {
            throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
        }
    }

    // get extra buffer types of the CPU
    // TODO: a more general solution for non-CPU extra buft should be implemented in the future
    // ref: https://github.com/ggml-org/llama.cpp/pull/12593#pullrequestreview-2718659948
    std::vector<ggml_backend_buffer_type_t> buft_extra;
    {
        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
        if (!cpu_dev) {
            throw std::runtime_error(format("%s: no CPU backend found", __func__));
        }
        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);

        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");

        if (ggml_backend_dev_get_extra_bufts_fn) {
            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
            while (extra_bufts && *extra_bufts) {
                buft_extra.emplace_back(*extra_bufts);
                ++extra_bufts;
            }
        }
    }

    // add tensors
    for (auto & it : ab_map) {
        const std::string & name = it.first;
        llama_adapter_lora_weight & w = it.second;
        bool is_token_embd = str_endswith(name, "token_embd.weight");

        if (!w.a || !w.b) {
            throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
        }

        // device buft and device ctx
        const auto * model_tensor = model.get_tensor(name.c_str());
        if (!model_tensor) {
            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
        }

        auto * buft = ggml_backend_buffer_get_type(model_tensor->buffer);

        // do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case
        for (auto & ex : buft_extra) {
            if (ex == buft) {
                LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));

                auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
                if (!cpu_dev) {
                    throw std::runtime_error(format("%s: no CPU backend found", __func__));
                }
                buft = ggml_backend_dev_buffer_type(cpu_dev);

                break;
            }
        }

        LLAMA_LOG_DEBUG("%s: lora for '%s' -> '%s'\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));

        ggml_context * dev_ctx = ctx_for_buft(buft);
        // validate tensor shape
        if (is_token_embd) {
            // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
            if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
            }
        } else {
            if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
            }
            if (w.a->ne[1] != w.b->ne[0]) {
                throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
            }
        }

        // save tensor to adapter
        ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
        ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
        ggml_set_name(tensor_a, w.a->name);
        ggml_set_name(tensor_b, w.b->name);
        adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
    }

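    // Shape contract checked above, in ggml's ne ordering (ne[0] = innermost):
    //   W: [n_in, n_out]   A: [n_in, rank]   B: [rank, n_out]
    // i.e. W->ne[0] == A->ne[0], W->ne[1] == B->ne[1], A->ne[1] == B->ne[0];
    // token_embd swaps the roles of A and B (see llm_build_inp_embd()).
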
    // allocate tensors / buffers and zero
    {
        adapter.ctxs.reserve(ctx_map.size());
        adapter.bufs.reserve(ctx_map.size());
        for (auto & it : ctx_map) {
            ggml_backend_buffer_type_t buft = it.first;
            ggml_context * ctx_dev = it.second;
            ggml_backend_buffer_ptr buf { ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) };
            if (!buf) {
                throw std::runtime_error("failed to allocate buffer for lora adapter\n");
            }
            LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0);
            adapter.bufs.emplace_back(std::move(buf));
        }
    }

    // set tensor data
    {
        llama_file gguf_file(path_lora, "rb");
        std::vector<uint8_t> read_buf;
        auto set_tensor = [&](ggml_tensor * orig, ggml_tensor * dev) {
            size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
            size_t size = ggml_nbytes(orig);
            read_buf.resize(size);
            gguf_file.seek(offs, SEEK_SET);
            gguf_file.read_raw(read_buf.data(), size);
            ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
        };
        for (auto & it : adapter.ab_map) {
            auto orig = ab_map[it.first];
            auto dev  = it.second;
            set_tensor(orig.a, dev.a);
            set_tensor(orig.b, dev.b);
        }
    }

    // register adapter with model
    model.loras.insert(&adapter);

    LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
}

llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
    llama_adapter_lora * adapter = new llama_adapter_lora();

    try {
        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
        return adapter;
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());

        delete adapter;
    }

    return nullptr;
}

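// Illustration, not part of the upstream file: typical caller-side sequence
// (assumes llama_set_adapter_lora() from llama.h is used to attach the loaded
// adapter to a context):
//
//   llama_adapter_lora * adapter = llama_adapter_lora_init(model, "my-adapter.gguf");
//   if (adapter) {
//       llama_set_adapter_lora(ctx, adapter, 1.0f); // scale multiplies alpha/rank
//   }
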
int32_t llama_adapter_meta_val_str(const llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size) {
    const auto & it = adapter->gguf_kv.find(key);
    if (it == adapter->gguf_kv.end()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    return snprintf(buf, buf_size, "%s", it->second.c_str());
}

int32_t llama_adapter_meta_count(const llama_adapter_lora * adapter) {
    return (int)adapter->gguf_kv.size();
}

int32_t llama_adapter_meta_key_by_index(const llama_adapter_lora * adapter, int i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)adapter->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    auto it = adapter->gguf_kv.begin();
    std::advance(it, i);
    return snprintf(buf, buf_size, "%s", it->first.c_str());
}

int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)adapter->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    auto it = adapter->gguf_kv.begin();
    std::advance(it, i);
    return snprintf(buf, buf_size, "%s", it->second.c_str());
}

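// Illustration, not part of the upstream file: iterating the metadata with the
// index-based getters above (buffer sizes are illustrative; values longer than
// the buffer are truncated, snprintf-style):
//
//   char key[128], val[256];
//   for (int32_t i = 0; i < llama_adapter_meta_count(adapter); i++) {
//       llama_adapter_meta_key_by_index(adapter, i, key, sizeof(key));
//       llama_adapter_meta_val_str_by_index(adapter, i, val, sizeof(val));
//       printf("%s = %s\n", key, val);
//   }
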
void llama_adapter_lora_free(llama_adapter_lora *) {
    // deprecated: adapters are freed by llama_model's destructor
}

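// Context for the two aLoRA getters below (behavioral summary, assumed from
// the metadata handling above): for "activated LoRA" adapters, callers scan
// the prompt for the invocation token sequence and enable the adapter only
// from that position onward.
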
uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
    if (!adapter) {
        return 0;
    }
    return adapter->alora_invocation_tokens.size();
}

const llama_token * llama_adapter_get_alora_invocation_tokens(const llama_adapter_lora * adapter) {
    GGML_ASSERT(adapter);
    return adapter->alora_invocation_tokens.data();
}
```
