1 files changed, 508 insertions, 0 deletions
diff --git a/llama.cpp/tools/cvector-generator/cvector-generator.cpp b/llama.cpp/tools/cvector-generator/cvector-generator.cpp
new file mode 100644
index 0000000..3ba7c52
--- /dev/null
+++ b/llama.cpp/tools/cvector-generator/cvector-generator.cpp
@@ -0,0 +1,508 @@
+#include "ggml.h"
+#include "gguf.h"
+#include "arg.h"
+#include "common.h"
+#include "llama.h"
+#include "pca.hpp"
+#include "mean.hpp"
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+#include <algorithm>
+#include <climits>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <tuple>
+#include <vector>
+//////////////////////////////////////////////////
+// utils
+// Detokenize every token in the half-open range [begin, end) and return the pieces
+// concatenated in iteration order.
+template <class Iter>
+static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
+    std::string text;
+    while (begin != end) {
+        text += common_token_to_piece(ctx, *begin);
+        ++begin;
+    }
+    return text;
+}
+// Print a few example command lines to stdout.
+// The first (unnamed) parameter is unused; argv[0] is substituted as the program name.
+static void print_usage(int, char ** argv) {
+    const char * prog = argv[0];
+    fputs("\nexample usage:\n", stdout);
+    printf("\n    CPU only:   %s -m ./llama-3.Q4_K_M.gguf\n", prog);
+    printf("\n    with GPU:   %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", prog);
+    printf("\n    advanced:   %s -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", prog);
+    printf("\n    using mean: %s -m ./llama-3.Q4_K_M.gguf --method mean\n", prog);
+    fputs("\n", stdout);
+}
+//////////////////////////////////////////////////
+// cb_eval is reused for each pair of positive - negative prompt
+//
+// Stores the tensors handed over by cb_eval for the positive and for the negative prompt
+// of one pair, and turns them into per-layer difference matrices (calc_diff).
+// Tensor metadata lives in ctx_ggml (created with no_alloc = true); each payload is
+// malloc'ed separately and released again by reset().
+struct callback_data {
+    ggml_context * ctx_ggml = nullptr;   // holds v_pos, v_neg, v_diff_filtered
+    int n_layers = 0;
+    int n_tokens = 0;
+    bool is_eval_pos = true;
+    // each element of the vector correspond to one layer
+    std::vector<struct ggml_tensor *> v_pos; // vector of matrices of size [n_embd, n_tokens]
+    std::vector<struct ggml_tensor *> v_neg; // vector of matrices of size [n_embd, n_tokens]
+    std::vector<struct ggml_tensor *> v_diff_filtered;   // vector of matrices of size [n_embd, n_nonzero_rows]. NOTE: n_nonzero_rows maybe different for each layer
+    // save a copy of tensor t into either v_pos or v_neg (decided by is_eval_pos)
+    // t must be F32; the copy is a 2D tensor of shape [t->ne[0], t->ne[1]] named like t
+    void save_tensor_for_layer(struct ggml_tensor * t) {
+        GGML_ASSERT(t->type == GGML_TYPE_F32);
+        if (ctx_ggml == nullptr) {
+            // alloc a new ctx_ggml if needed
+            // room for the metadata of 3 tensors per layer (v_pos, v_neg, v_diff_filtered)
+            struct ggml_init_params params_ggml = {
+                /*.mem_size   =*/ ggml_tensor_overhead() * n_layers * 3u,
+                /*.mem_buffer =*/ NULL,
+                /*.no_alloc   =*/ true,
+            };
+            ctx_ggml = ggml_init(params_ggml);
+        }
+        // copy tensor data
+        auto n_bytes = ggml_nbytes(t);
+        struct ggml_tensor * t_layer = ggml_new_tensor_2d(ctx_ggml, t->type, t->ne[0], t->ne[1]);
+        t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow
+        GGML_ASSERT(t_layer->data != nullptr); // out of memory
+        ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes);
+        ggml_set_name(t_layer, ggml_get_name(t));
+        //print_debug_tensor(t_layer);
+        if (is_eval_pos) {
+            v_pos.push_back(t_layer);
+        } else {
+            v_neg.push_back(t_layer);
+        }
+    }
+    // calculate diff (v_pos - v_neg) and place the result back to v_pos
+    // all zero rows in the diff tensor will also be removed
+    // NOTE: final layer is ignored. we only have (n_layers - 1) to process
+    // NOTE(review): this function itself processes every saved tensor; the (n_layers - 1)
+    //               count presumably comes from what cb_eval saves - confirm
+    // returns a copy of v_diff_filtered (the tensors stay owned by this object until reset())
+    std::vector<struct ggml_tensor *> calc_diff() {
+        // every positive activation needs a matching negative one, otherwise the
+        // subtraction below would read out of bounds
+        GGML_ASSERT(v_pos.size() == v_neg.size());
+        for (size_t il = 0; il < v_pos.size(); il++) {
+            GGML_ASSERT(ggml_nelements(v_pos[il]) == ggml_nelements(v_neg[il]));
+            float * a = (float *) v_pos[il]->data;
+            const float * b = (const float *) v_neg[il]->data;
+            const size_t n_elem = ggml_nelements(v_pos[il]);
+            for (size_t j = 0; j < n_elem; j++) {
+                a[j] -= b[j];
+            }
+            //print_debug_tensor(v_pos[i]);
+            auto diff_filtered = filter_nonzero_rows(v_pos[il]);
+            v_diff_filtered.push_back(diff_filtered);
+        }
+        return v_diff_filtered; // for convinient, we return the result std::vector
+    }
+    // delete zero rows from a given 2D tensor
+    // returns a newly allocated tensor [n_embd, n_nonzero_rows] holding only the rows of a
+    // that contain at least one element whose magnitude exceeds 1e-6
+    // asserts that at least one such row exists
+    struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) {
+        //printf("filter_nonzero_rows\n");
+        auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool {
+            // check if given row containing all zero elements
+            int n_cols = t->ne[0]; // hint: should be equal to n_embd
+            for (int col = 0; col < n_cols; ++col) {
+                // compare the magnitude: a negative difference is just as non-zero as a positive one
+                const float v = ggml_get_f32_nd(t, col, row, 0, 0);
+                if (v > eps || v < -eps) {
+                    return false;
+                }
+            }
+            return true;
+        };
+        std::vector<int> rows_to_copy; // the idx of non-zero cols (to be copied to row of diff_filtered)
+        for (int i_row = 0; i_row < a->ne[1]; i_row++) {
+            if (!is_row_all_zeros(a, i_row, 1e-6)) {
+                rows_to_copy.push_back(i_row);
+            }
+        }
+        // get "n_nonzero_rows" for the output "diff_filtered"
+        int n_nonzero_rows = rows_to_copy.size();
+        //printf("n_nonzero_rows: %d\n", n_nonzero_rows);
+        int n_embd = a->ne[0];
+        GGML_ASSERT(n_nonzero_rows > 0);
+        // diff_filtered: [n_embd, n_nonzero_rows]
+        struct ggml_tensor * diff_filtered = ggml_new_tensor_2d(
+            ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows);
+        ggml_format_name(diff_filtered, "diff_filtered_%s", a->name);
+        diff_filtered->data = malloc(ggml_nbytes(diff_filtered));
+        GGML_ASSERT(diff_filtered->data != nullptr); // out of memory
+        // copy non-zero rows
+        for (int dest_row = 0; dest_row < n_nonzero_rows; dest_row++) {
+            int src_row = rows_to_copy[dest_row];
+            for (int i = 0; i < n_embd; i++) {
+                float src_elem = ggml_get_f32_nd(a, i, src_row, 0, 0);
+                ggml_set_f32_nd(diff_filtered, i, dest_row, 0, 0, src_elem);
+            }
+        }
+        //print_debug_tensor(diff_filtered);
+        return diff_filtered;
+    }
+    // we don't implement destructor, because we want to reuse callback_data. we just want to free the tensors
+    // frees every malloc'ed payload, then the ggml context holding the tensor metadata
+    void reset() {
+        for (auto ptr : v_pos) free(ptr->data);
+        for (auto ptr : v_neg) free(ptr->data);
+        for (auto ptr : v_diff_filtered) free(ptr->data);
+        v_pos.clear();
+        v_neg.clear();
+        v_diff_filtered.clear();
+        if (ctx_ggml) {
+            ggml_free(ctx_ggml);
+        }
+        ctx_ggml = nullptr;
+    }
+};
+/**
+ * train_context is used to store the ggml context for pre-post processing the diff vectors
+ * in short, input => v_diff and output => v_final
+ *
+ * Tensor metadata lives in ctx_ggml (created with no_alloc = true); every tensor payload
+ * is malloc'ed here and freed by the destructor, which is why the type is non-copyable.
+ */
+struct train_context {
+    ggml_context * ctx_ggml;
+    int n_embd;
+    int n_layers;
+    /* pair of prompts to be used for generating final vector */
+    std::vector<std::string> positive_entries;
+    std::vector<std::string> negative_entries;
+    // each element of the vector correspond to one layer
+    // NOTE: the last layer is discard. therefore, we will have (n_layers - 1) elements here
+    // NOTE (2): v_diff is transposed from v_diff_tmp
+    std::vector<struct ggml_tensor *> v_diff;  // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows)
+    std::vector<struct ggml_tensor *> v_final; // vector of vectors of size [n_embd] to be written to file
+    // to easily re-alloc when concat v_diff, we temporary store v_diff in a vector instead of a tensor
+    // v_diff_tmp will get converted unto v_diff later on
+    std::vector<std::vector<uint8_t>> v_diff_tmp;
+    // creates the ggml context and, for each of the (n_layers - 1) layers, an empty
+    // byte buffer in v_diff_tmp and an F32 output vector of n_embd elements in v_final
+    train_context(int n_embd_, int n_layers_) {
+        // (n_layers - 1) is used as a size below and must not go negative
+        GGML_ASSERT(n_embd_ > 0 && n_layers_ > 0);
+        n_embd = n_embd_;
+        n_layers = n_layers_;
+        // metadata for 2 tensors per layer: one in v_final, one in v_diff
+        struct ggml_init_params params_ggml = {
+            /*.mem_size   =*/ ggml_tensor_overhead() * (n_layers - 1) * 2u,
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ true,
+        };
+        ctx_ggml = ggml_init(params_ggml);
+        for (int il = 0; il < n_layers - 1; il++) {
+            v_diff_tmp.emplace_back();
+            auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd);
+            t->data = malloc(ggml_nbytes(t)); // TODO: get rid of malloc if possible
+            GGML_ASSERT(t->data != nullptr); // out of memory
+            v_final.push_back(t);
+        }
+    }
+    // this object owns malloc'ed tensor payloads: a copy would free them twice
+    train_context(const train_context &) = delete;
+    train_context & operator=(const train_context &) = delete;
+    // add new rows into existing tensor in v_diff_tmp
+    // diff_filtered must hold exactly (n_layers - 1) tensors; the raw bytes of tensor il
+    // are appended to v_diff_tmp[il]
+    void concat_diff_tmp(const std::vector<struct ggml_tensor *> & diff_filtered) {
+        GGML_ASSERT((int) diff_filtered.size() == n_layers - 1);
+        for (int il = 0; il < n_layers - 1; il++) {
+            auto t = diff_filtered[il];
+            auto & diff_tmp = v_diff_tmp[il];
+            const size_t curr_size = diff_tmp.size();
+            const size_t add_size  = ggml_nbytes(t);
+            diff_tmp.resize(curr_size + add_size);
+            memcpy(diff_tmp.data() + curr_size, t->data, add_size);
+        }
+    }
+    // build the v_diff tensors from v_diff_tmp
+    // transpose == true : v_diff[il] has shape [n_rows, n_embd] (element (ir, ic) of the buffer lands at index (ir, ic))
+    // transpose == false: v_diff[il] has shape [n_embd, n_rows] and the buffer is copied verbatim
+    // the temporary buffers are released afterwards
+    void build_v_diff(bool transpose) {
+        printf("build_v_diff\n");
+        for (int il = 0; il < n_layers - 1; il++) {
+            auto & diff_tmp = v_diff_tmp[il];
+            const size_t n_elem = diff_tmp.size() / sizeof(float);
+            GGML_ASSERT(n_elem % (size_t) n_embd == 0);
+            // ggml_set_f32_nd takes int indices, so the row count has to fit in an int
+            GGML_ASSERT(n_elem / (size_t) n_embd <= (size_t) INT_MAX);
+            const int n_rows = (int) (n_elem / (size_t) n_embd);
+            struct ggml_tensor * diff = transpose
+                ? ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd)
+                : ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_embd, n_rows);
+            ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
+            diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
+            if (transpose) {
+                // copy data & transpose
+                const float * arr = (const float *) diff_tmp.data();
+                for (int ir = 0; ir < n_rows; ++ir) {
+                    for (int ic = 0; ic < n_embd; ++ic) {
+                        // index in size_t: ir*n_embd can exceed INT_MAX for large inputs
+                        float f = arr[(size_t) ir * (size_t) n_embd + (size_t) ic];
+                        ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
+                    }
+                }
+            } else {
+                // only copy
+                memcpy(diff->data, diff_tmp.data(), ggml_nbytes(diff));
+            }
+            v_diff.push_back(diff);
+            print_debug_tensor(diff);
+            // free memory of diff_tmp
+            // (resize(0) would keep the capacity; swapping with an empty vector releases it)
+            std::vector<uint8_t>().swap(diff_tmp);
+        }
+    }
+    ~train_context() {
+        for (auto ptr : v_final) free(ptr->data);
+        for (auto ptr : v_diff) free(ptr->data);
+        // no need to free v_diff_tmp, since we didn't use malloc
+        ggml_free(ctx_ggml);
+    }
+};
+// Token sequences of one positive / negative prompt pair.
+// Both sequences are padded to the same length (max_seq_len).
+struct tokenized_prompt {
+    std::vector<llama_token> tokens_pos;
+    std::vector<llama_token> tokens_neg;
+    size_t max_seq_len;
+    // tokenizes pos and neg (a BOS token is added when the vocab asks for it) and pads
+    // the shorter sequence up to the length of the longer one
+    tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
+        const llama_model * model = llama_get_model(ctx);
+        const llama_vocab * vocab = llama_model_get_vocab(model);
+        const bool add_bos = llama_vocab_get_add_bos(vocab);
+        tokens_pos = common_tokenize(ctx, pos, add_bos, true);
+        tokens_neg = common_tokenize(ctx, neg, add_bos, true);
+        max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
+        padding_seq(ctx, tokens_pos, max_seq_len);
+        padding_seq(ctx, tokens_neg, max_seq_len);
+    }
+    // append the padding token to tokens until it holds len entries
+    // the padding token is the last token produced by tokenizing " "
+    void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) {
+        if (tokens.size() >= len) {
+            return; // already long enough, no need to tokenize the padding string
+        }
+        // TODO: customize padding token
+        const std::vector<llama_token> pad_tokens = common_tokenize(ctx, " ", false);
+        // back() on an empty vector is undefined behaviour
+        GGML_ASSERT(!pad_tokens.empty());
+        tokens.resize(len, pad_tokens.back());
+    }
+};
+//////////////////////////////////////////////////
+// Format any stream-insertable value as a string using operator<<.
+template <typename T>
+static std::string to_string(const T & val) {
+    std::ostringstream out;
+    out << val;
+    return out.str();
+}
+// Read a prompt file, one prompt per line.
+//   path             : file to read; the process exits with status 1 if it cannot be opened
+//   skip_empty_lines : when true, empty lines are not returned
+// Each returned line has been passed through string_process_escapes.
+// A trailing '\r' (left over from CRLF line endings) is removed first, so that such lines
+// count as empty and prompts do not carry a stray carriage return.
+static std::vector<std::string> ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines) {
+    std::ifstream file(path);
+    if (!file.is_open()) {
+        fprintf(stderr, "error: unable to open file: %s\n", path.c_str());
+        exit(1);
+    }
+    std::vector<std::string> output;
+    std::string line;
+    while (std::getline(file, line)) {
+        if (!line.empty() && line.back() == '\r') {
+            line.pop_back();
+        }
+        if (skip_empty_lines && line.empty()) {
+            continue;
+        }
+        string_process_escapes(line);
+        output.push_back(line);
+    }
+    // the ifstream closes the file when it goes out of scope
+    return output;
+}
+//////////////////////////////////////////////////
+// Evaluation callback (installed as params.cb_eval, with a callback_data as user_data).
+//   ask == true : return whether the tensor's name starts with "l_out"
+//   ask == false: store the tensor if its name starts with "l_out" and its second
+//                 dimension equals the expected token count; always return true
+static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
+    static const char * l_out_name = "l_out";
+    const bool name_matches = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0;
+    if (ask) {
+        return name_matches;
+    }
+    auto * state = (callback_data *) user_data;
+    if (name_matches && t->ne[1] == state->n_tokens) {
+        // save the tensor to current context
+        state->save_tensor_for_layer(t);
+    }
+    return true;
+}
+// Clear the context's memory, then decode all tokens as a single batch.
+// Returns true on success; on failure a message is written to stderr and false is returned.
+static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
+    llama_memory_clear(llama_get_memory(ctx), true);
+    const auto batch = llama_batch_get_one(tokens.data(), tokens.size());
+    const bool failed = llama_decode(ctx, batch) != 0;
+    if (failed) {
+        fprintf(stderr, "%s : failed to eval\n", __func__);
+    }
+    return !failed;
+}
+// Write the control vector tensors to a GGUF file.
+//   v_ctrl     : tensors to store (each one is added and printed for debugging)
+//   fname      : output file path
+//   model_hint : stored under "controlvector.model_hint"
+// The architecture is recorded as "controlvector" and the number of tensors as
+// "controlvector.layer_count". A failed write is reported on stderr.
+static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const std::string fname, const std::string model_hint) {
+    struct gguf_context * ctx = gguf_init_empty();
+    const std::string arch = "controlvector";
+    gguf_set_val_str(ctx, "general.architecture", arch.c_str());
+    gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str());
+    gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), static_cast<int32_t>(v_ctrl.size()));
+    for (size_t i = 0; i < v_ctrl.size(); ++i) {
+        gguf_add_tensor(ctx, v_ctrl[i]);
+        print_debug_tensor(v_ctrl[i]);
+        printf("Added tensor: %s\n", v_ctrl[i]->name);
+    }
+    printf("%s: writing file...\n", __func__);
+    // only claim success when the file was actually written
+    if (gguf_write_to_file(ctx, fname.c_str(), false)) {
+        printf("%s: wrote file '%s'\n", __func__, fname.c_str());
+    } else {
+        fprintf(stderr, "%s: failed to write file '%s'\n", __func__, fname.c_str());
+    }
+    gguf_free(ctx);
+}
+/**
+ * Load the positive and the negative prompt file (empty lines are skipped) and store
+ * them in ctx_train as positive_entries / negative_entries.
+ * Entry i of one file is paired with entry i of the other.
+ *
+ * Returns 0 on success, 1 if the two files do not contain the same number of prompts
+ * or contain no prompt at all (an error message is written to stderr).
+ */
+static int prepare_entries(common_params & params, train_context & ctx_train) {
+    // load prompts
+    std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true);
+    std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true);
+    if (positive_prompts.size() != negative_prompts.size()) {
+        fprintf(stderr, "number of positive and negative prompts must be equal\n");
+        return 1;
+    }
+    if (positive_prompts.empty()) {
+        fprintf(stderr, "must provide at least one prompt pair\n");
+        return 1;
+    }
+    // the local vectors are not needed afterwards: hand them over instead of copying
+    ctx_train.positive_entries = std::move(positive_prompts);
+    ctx_train.negative_entries = std::move(negative_prompts);
+    return 0;
+}
+/**
+ * Entry point.
+ *  1. parse the command line and load the model (cb_eval is installed as eval callback)
+ *  2. load and tokenize the positive / negative prompt pairs
+ *  3. decode every pair and accumulate the per-layer differences in ctx_train
+ *  4. reduce the differences with PCA or mean and write the result as GGUF
+ * Returns 0 on success and 1 on any error.
+ */
+int main(int argc, char ** argv) {
+    common_params params;
+    params.out_file = "control_vector.gguf";
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
+        return 1;
+    }
+    // guard the modulo below against a division by zero
+    if (params.n_pca_batch <= 0) {
+        fprintf(stderr, "PCA batch size must be positive\n");
+        return 1;
+    }
+    if (params.n_pca_iterations % params.n_pca_batch != 0) {
+        fprintf(stderr, "PCA iterations must by multiply of PCA batch size\n");
+        return 1;
+    }
+    callback_data cb_data;
+    // pass the callback to the backend scheduler
+    // it will be executed for each node during the graph computation
+    params.cb_eval = cb_eval;
+    params.cb_eval_user_data = &cb_data;
+    params.warmup = false;
+    print_build_info();
+    llama_backend_init();
+    llama_numa_init(params.numa);
+    // load the model to get hparams
+    auto llama_init = common_init_from_params(params);
+    auto * model = llama_init->model();
+    auto * ctx   = llama_init->context();
+    if (model == nullptr || ctx == nullptr) {
+        fprintf(stderr, "%s : failed to load the model\n", __func__);
+        llama_backend_free();
+        return 1;
+    }
+    // int n_ctx = llama_n_ctx(ctx);
+    int n_layers = llama_model_n_layer(model);
+    int n_embd = llama_model_n_embd(model);
+    // get model hint param (a.k.a model arch name)
+    // zero-initialized so that the hint is an empty string if the key cannot be read
+    char model_hint[128] = {0};
+    llama_model_meta_val_str(model, "general.architecture", model_hint, sizeof(model_hint));
+    // init train_context
+    train_context ctx_train(n_embd, n_layers);
+    // load and prepare entries for training
+    if (prepare_entries(params, ctx_train) != 0) {
+        llama_backend_free();
+        return 1;
+    }
+    // we have to pretokenize everything because otherwise we don't know how much overhead to allocate ctx_diffs_wrapped
+    const size_t n_prompts = ctx_train.positive_entries.size();
+    std::vector<tokenized_prompt> tokenized_prompts;
+    tokenized_prompts.reserve(n_prompts);
+    size_t n_total_tokens = 0;
+    for (size_t i = 0; i < n_prompts; ++i) {
+        tokenized_prompts.emplace_back(ctx, ctx_train.positive_entries[i], ctx_train.negative_entries[i]);
+        n_total_tokens += 2 * tokenized_prompts.back().max_seq_len;
+    }
+    std::cout << "n_total_tokens: " << n_total_tokens << std::endl;
+    cb_data.n_layers = n_layers;
+    for (size_t i = 0; i < n_prompts; ++i) {
+        // by reference: get_hidden_layers needs a mutable vector, but no copy is required
+        tokenized_prompt & t = tokenized_prompts[i];
+        cb_data.n_tokens = (int) t.max_seq_len;
+        printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n",
+            (int) i+1, (int) n_prompts,
+            tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(),
+            tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(),
+            (int) t.max_seq_len);
+        cb_data.is_eval_pos = true;
+        bool success = get_hidden_layers(ctx, t.tokens_pos);
+        if (success) {
+            cb_data.is_eval_pos = false;
+            success = get_hidden_layers(ctx, t.tokens_neg);
+        }
+        if (!success) {
+            // do not build a control vector from partial (or no) data:
+            // release what was captured so far and report the failure
+            cb_data.reset();
+            fprintf(stderr, "%s : failed to evaluate prompt pair %d\n", __func__, (int) i+1);
+            llama_backend_free();
+            return 1;
+        }
+        // calculate diff and remove all zero rows
+        auto v_diff_filtered = cb_data.calc_diff();
+        // save & concat the filtered v_diff to ctx_train
+        ctx_train.concat_diff_tmp(v_diff_filtered);
+        // reset for next iteration
+        cb_data.reset();
+    }
+    // done with the model, we can now free it to make gain some memory
+    // NOTE(review): nothing is released explicitly here; presumably llama_init frees the
+    //               model and context when it goes out of scope - confirm
+    printf("Done evaluate prompts, unload model...\n");
+    bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;
+    // prepare ctx_train for PCA
+    ctx_train.build_v_diff(use_pca);
+    if (use_pca) {
+        // run PCA
+        PCA::pca_params pca_params;
+        pca_params.n_threads    = params.cpuparams.n_threads;
+        pca_params.n_batch      = params.n_pca_batch;
+        pca_params.n_iterations = params.n_pca_iterations;
+        PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
+    } else {
+        // run mean
+        mean::run(ctx_train.v_diff, ctx_train.v_final);
+    }
+    // write output vectors to gguf
+    export_gguf(ctx_train.v_final, params.out_file, model_hint);
+    llama_backend_free();
+    return 0;
+}

diff --git a/llama.cpp/tools/cvector-generator/cvector-generator.cpp b/llama.cpp/tools/cvector-generator/cvector-generator.cpp new file mode 100644 index 0000000..3ba7c52 --- /dev/null +++ b/llama.cpp/tools/cvector-generator/cvector-generator.cpp
@@ -0,0 +1,508 @@
	1	#include "ggml.h"
	2	#include "gguf.h"
	3
	4	#include "arg.h"
	5	#include "common.h"
	6	#include "llama.h"
	7	#include "pca.hpp"
	8	#include "mean.hpp"
	9
	10	#ifdef GGML_USE_CUDA
	11	#include "ggml-cuda.h"
	12	#endif
	13
	14	#ifdef GGML_USE_METAL
	15	#include "ggml-metal.h"
	16	#endif
	17
	18	#include <algorithm>
	19	#include <climits>
	20	#include <cstdio>
	21	#include <cstring>
	22	#include <fstream>
	23	#include <iostream>
	24	#include <string>
	25	#include <tuple>
	26	#include <vector>
	27
	28
	29	//////////////////////////////////////////////////
	30	// utils
	31
	32	template <class Iter>
	33	static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
	34	std::string ret;
	35	for (; begin != end; ++begin) {
	36	ret += common_token_to_piece(ctx, *begin);
	37	}
	38
	39	return ret;
	40	}
	41
	42	static void print_usage(int, char ** argv) {
	43	printf("\nexample usage:\n");
	44	printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
	45	printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
	46	printf("\n advanced: %s -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", argv[0]);
	47	printf("\n using mean: %s -m ./llama-3.Q4_K_M.gguf --method mean\n", argv[0]);
	48	printf("\n");
	49	}
	50
	51	//////////////////////////////////////////////////
	52
	53
	54	// cb_eval is reused for each pair of positive - negative prompt
	55	struct callback_data {
	56	ggml_context * ctx_ggml = nullptr; // holds v_pos, v_neg, v_diff_filtered
	57
	58	int n_layers = 0;
	59	int n_tokens = 0;
	60	bool is_eval_pos = true;
	61
	62	// each element of the vector correspond to one layer
	63	std::vector<struct ggml_tensor *> v_pos; // vector of matrices of size [n_embd, n_tokens]
	64	std::vector<struct ggml_tensor *> v_neg; // vector of matrices of size [n_embd, n_tokens]
	65	std::vector<struct ggml_tensor *> v_diff_filtered; // vector of matrices of size [n_embd, n_nonzero_rows]. NOTE: n_nonzero_rows maybe different for each layer
	66
	67	// save a tensor into either v_pos or v_neg (decided by is_eval_pos)
	68	void save_tensor_for_layer(struct ggml_tensor * t) {
	69	GGML_ASSERT(t->type == GGML_TYPE_F32);
	70
	71	if (ctx_ggml == nullptr) {
	72	// alloc a new ctx_ggml if needed
	73	struct ggml_init_params params_ggml = {
	74	/*.mem_size =*/ ggml_tensor_overhead() * n_layers * 3u,
	75	/*.mem_buffer =*/ NULL,
	76	/*.no_alloc =*/ true,
	77	};
	78	ctx_ggml = ggml_init(params_ggml);
	79	}
	80
	81	// copy tensor data
	82	auto n_bytes = ggml_nbytes(t);
	83	struct ggml_tensor * t_layer = ggml_new_tensor_2d(ctx_ggml, t->type, t->ne[0], t->ne[1]);
	84	t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow
	85	ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes);
	86	ggml_set_name(t_layer, ggml_get_name(t));
	87	//print_debug_tensor(t_layer);
	88
	89	if (is_eval_pos) {
	90	v_pos.push_back(t_layer);
	91	} else {
	92	v_neg.push_back(t_layer);
	93	}
	94	}
	95
	96	// calculate diff (v_pos - v_neg) and place the result back to v_pos
	97	// all zero rows in the diff tensor will also be removed
	98	// NOTE: final layer is ignored. we only have (n_layers - 1) to process
	99	std::vector<struct ggml_tensor *> calc_diff() {
	100	for (float il = 0; il < v_pos.size(); il++) {
	101	float * a = (float *) v_pos[il]->data;
	102	float * b = (float *) v_neg[il]->data;
	103	size_t n_elem = ggml_nelements(v_pos[il]);
	104	for (size_t j = 0; j < n_elem; j++) {
	105	a[j] -= b[j];
	106	}
	107	//print_debug_tensor(v_pos[i]);
	108	auto diff_filtered = filter_nonzero_rows(v_pos[il]);
	109	v_diff_filtered.push_back(diff_filtered);
	110	}
	111	return v_diff_filtered; // for convinient, we return the result std::vector
	112	}
	113
	114	// delete zero rows from a given 2D tensor
	115	struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) {
	116	//printf("filter_nonzero_rows\n");
	117	auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool {
	118	// check if given row containing all zero elements
	119	int n_cols = t->ne[0]; // hint: should be equal to n_embd
	120	for (int col = 0; col < n_cols; ++col) {
	121	if (ggml_get_f32_nd(t, col, row, 0, 0) > eps) {
	122	return false;
	123	}
	124	}
	125	return true;
	126	};
	127	std::vector<int> rows_to_copy; // the idx of non-zero cols (to be copied to row of diff_filtered)
	128	for (int i_row = 0; i_row < a->ne[1]; i_row++) {
	129	if (!is_row_all_zeros(a, i_row, 1e-6)) {
	130	rows_to_copy.push_back(i_row);
	131	}
	132	}
	133
	134	// get "n_nonzero_rows" for the output "diff_filtered"
	135	int n_nonzero_rows = rows_to_copy.size();
	136	//printf("n_nonzero_rows: %d\n", n_nonzero_rows);
	137	int n_embd = a->ne[0];
	138	GGML_ASSERT(n_nonzero_rows > 0);
	139
	140	// diff_filtered: [n_embd, n_nonzero_rows]
	141	struct ggml_tensor * diff_filtered = ggml_new_tensor_2d(
	142	ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows);
	143	ggml_format_name(diff_filtered, "diff_filtered_%s", a->name);
	144	diff_filtered->data = malloc(ggml_nbytes(diff_filtered));
	145
	146	// copy non-zero rows
	147	for (int dest_row = 0; dest_row < n_nonzero_rows; dest_row++) {
	148	int src_row = rows_to_copy[dest_row];
	149	for (int i = 0; i < n_embd; i++) {
	150	float src_elem = ggml_get_f32_nd(a, i, src_row, 0, 0);
	151	ggml_set_f32_nd(diff_filtered, i, dest_row, 0, 0, src_elem);
	152	}
	153	}
	154
	155	//print_debug_tensor(diff_filtered);
	156
	157	return diff_filtered;
	158	}
	159
	160	// we don't implement destructor, because we want to reuse callback_data. we just want to free the tensors
	161	void reset() {
	162	for (auto ptr : v_pos) free(ptr->data);
	163	for (auto ptr : v_neg) free(ptr->data);
	164	for (auto ptr : v_diff_filtered) free(ptr->data);
	165	v_pos.clear();
	166	v_neg.clear();
	167	v_diff_filtered.clear();
	168	if (ctx_ggml) {
	169	ggml_free(ctx_ggml);
	170	}
	171	ctx_ggml = nullptr;
	172	}
	173	};
	174
	175	/**
	176	* process_ctx is used to store the ggml context for pre-post processing the diff vectors
	177	* in short, input => v_diff and output => v_final
	178	*/
	179	struct train_context {
	180	ggml_context * ctx_ggml;
	181	int n_embd;
	182	int n_layers;
	183
	184	/* pair of prompts to be used for generating final vector */
	185	std::vector<std::string> positive_entries;
	186	std::vector<std::string> negative_entries;
	187
	188	// each element of the vector correspond to one layer
	189	// NOTE: the last layer is discard. therefore, we will have (n_layers - 1) elements here
	190	// NOTE (2): v_diff is transposed from v_diff_tmp
	191	std::vector<struct ggml_tensor *> v_diff; // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows)
	192	std::vector<struct ggml_tensor *> v_final; // vector of vectors of size [n_embd] to be written to file
	193
	194	// to easily re-alloc when concat v_diff, we temporary store v_diff in a vector instead of a tensor
	195	// v_diff_tmp will get converted unto v_diff later on
	196	std::vector<std::vector<uint8_t>> v_diff_tmp;
	197
	198	train_context(int n_embd_, int n_layers_) {
	199	n_embd = n_embd_;
	200	n_layers = n_layers_;
	201	struct ggml_init_params params_ggml = {
	202	/*.mem_size =*/ ggml_tensor_overhead() * (n_layers - 1) * 2u,
	203	/*.mem_buffer =*/ NULL,
	204	/*.no_alloc =*/ true,
	205	};
	206	ctx_ggml = ggml_init(params_ggml);
	207	for (int il = 0; il < n_layers - 1; il++) {
	208	std::vector<uint8_t> empty;
	209	v_diff_tmp.push_back(empty);
	210	auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd);
	211	t->data = malloc(ggml_nbytes(t)); // TODO: get rid of malloc if possible
	212	v_final.push_back(t);
	213	}
	214	}
	215
	216	// add new rows into existing tensor in v_diff_tmp
	217	void concat_diff_tmp(const std::vector<struct ggml_tensor *> & diff_filtered) {
	218	GGML_ASSERT((int) diff_filtered.size() == n_layers - 1);
	219	for (int il = 0; il < n_layers - 1; il++) {
	220	auto t = diff_filtered[il];
	221	auto & diff_tmp = v_diff_tmp[il];
	222	size_t curr_size = diff_tmp.size();
	223	diff_tmp.resize(curr_size + ggml_nbytes(t));
	224	memcpy(diff_tmp.data() + curr_size, t->data, ggml_nbytes(t));
	225	}
	226	}
	227
	228	// build the v_diff tensors from v_diff_tmp (v_diff need to be transposed)
	229	// TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method
	230	void build_v_diff(bool transpose) {
	231	printf("build_v_diff\n");
	232	for (int il = 0; il < n_layers - 1; il++) {
	233	auto & diff_tmp = v_diff_tmp[il];
	234	int n_elem = diff_tmp.size() / sizeof(float);
	235	GGML_ASSERT(n_elem % n_embd == 0);
	236	int n_rows = n_elem / n_embd;
	237	struct ggml_tensor * diff = transpose
	238	? ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd)
	239	: ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_embd, n_rows);
	240	ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
	241	diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
	242	if (transpose) {
	243	// copy data & transpose
	244	float * arr = (float *) diff_tmp.data();
	245	for (int ir = 0; ir < n_rows; ++ir) {
	246	for (int ic = 0; ic < n_embd; ++ic) {
	247	float f = arr[ir*n_embd + ic];
	248	ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
	249	}
	250	}
	251	} else {
	252	// only copy
	253	memcpy(diff->data, diff_tmp.data(), ggml_nbytes(diff));
	254	}
	255	v_diff.push_back(diff);
	256	print_debug_tensor(diff);
	257	// free memory of diff_tmp
	258	diff_tmp.resize(0);
	259	}
	260	}
	261
	262	~train_context() {
	263	for (auto ptr : v_final) free(ptr->data);
	264	for (auto ptr : v_diff) free(ptr->data);
	265	// no need to free v_diff_tmp, since we didn't use malloc
	266	ggml_free(ctx_ggml);
	267	}
	268	};
	269
	270	struct tokenized_prompt {
	271	std::vector<llama_token> tokens_pos;
	272	std::vector<llama_token> tokens_neg;
	273	size_t max_seq_len;
	274
	275	tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
	276	const llama_model * model = llama_get_model(ctx);
	277	const llama_vocab * vocab = llama_model_get_vocab(model);
	278	const bool add_bos = llama_vocab_get_add_bos(vocab);
	279	tokens_pos = common_tokenize(ctx, pos, add_bos, true);
	280	tokens_neg = common_tokenize(ctx, neg, add_bos, true);
	281	max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
	282	padding_seq(ctx, tokens_pos, max_seq_len);
	283	padding_seq(ctx, tokens_neg, max_seq_len);
	284	}
	285
	286	void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) {
	287	// TODO: customize padding token
	288	std::vector<llama_token> pad_tokens = common_tokenize(ctx, " ", false);
	289	llama_token pad_tok = pad_tokens.back();
	290	while (tokens.size() < len) {
	291	tokens.push_back(pad_tok);
	292	}
	293	}
	294	};
	295
	296	//////////////////////////////////////////////////
	297
	298	template <typename T>
	299	static std::string to_string(const T & val) {
	300	std::stringstream ss;
	301	ss << val;
	302	return ss.str();
	303	}
	304
	305	static std::vector<std::string> ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines) {
	306	std::vector<std::string> output;
	307	std::ifstream file(path);
	308	if (!file.is_open()) {
	309	fprintf(stderr, "error: unable to open file: %s\n", path.c_str());
	310	exit(1);
	311	}
	312	std::string line;
	313	while (std::getline(file, line)) {
	314	bool is_skip = skip_empty_lines && line.empty();
	315	if (!is_skip) {
	316	string_process_escapes(line);
	317	output.push_back(line);
	318	}
	319	}
	320	file.close();
	321	return output;
	322	}
	323
	324	//////////////////////////////////////////////////
	325
	326	static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
	327	auto * cb_data = (callback_data *) user_data;
	328	static const char * l_out_name = "l_out";
	329	const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0;
	330
	331	if (ask) {
	332	return is_l_out;
	333	}
	334
	335	if (!is_l_out || t->ne[1] != cb_data->n_tokens) {
	336	return true;
	337	}
	338
	339	// save the tensor to current context
	340	cb_data->save_tensor_for_layer(t);
	341	return true;
	342	}
	343
	344	static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
	345	llama_memory_clear(llama_get_memory(ctx), true);
	346	if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
	347	fprintf(stderr, "%s : failed to eval\n", __func__);
	348	return false;
	349	}
	350	return true;
	351	}
	352
	353	static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const std::string fname, const std::string model_hint) {
	354	struct gguf_context * ctx = gguf_init_empty();
	355
	356	const std::string arch = "controlvector";
	357	gguf_set_val_str(ctx, "general.architecture", arch.c_str());
	358	gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str());
	359	gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_ctrl.size());
	360
	361	for (size_t i = 0; i < v_ctrl.size(); ++i) {
	362	gguf_add_tensor(ctx, v_ctrl[i]);
	363	print_debug_tensor(v_ctrl[i]);
	364	printf("Added tensor: %s\n", v_ctrl[i]->name);
	365	}
	366
	367	printf("%s: writing file...\n", __func__);
	368	gguf_write_to_file(ctx, fname.c_str(), false);
	369	printf("%s: wrote file '%s'\n", __func__, fname.c_str());
	370	gguf_free(ctx);
	371	}
	372
	373	/**
	374	* Load prompt files and completion file.
	375	* Then format each pair of prompt + completion to make an entry.
	376	*/
	377	static int prepare_entries(common_params & params, train_context & ctx_train) {
	378	// load prompts
	379	std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true);
	380	std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true);
	381	if (positive_prompts.size() != negative_prompts.size()) {
	382	fprintf(stderr, "number of positive and negative prompts must be equal\n");
	383	return 1;
	384	}
	385	if (positive_prompts.empty()) {
	386	fprintf(stderr, "must provide at least one prompt pair\n");
	387	return 1;
	388	}
	389	ctx_train.positive_entries = positive_prompts;
	390	ctx_train.negative_entries = negative_prompts;
	391	return 0;
	392	}
	393
	394	int main(int argc, char ** argv) {
	395	common_params params;
	396
	397	params.out_file = "control_vector.gguf";
	398
	399	if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
	400	return 1;
	401	}
	402
	403	if (params.n_pca_iterations % params.n_pca_batch != 0) {
	404	fprintf(stderr, "PCA iterations must by multiply of PCA batch size\n");
	405	return 1;
	406	}
	407
	408
	409	callback_data cb_data;
	410
	411	// pass the callback to the backend scheduler
	412	// it will be executed for each node during the graph computation
	413	params.cb_eval = cb_eval;
	414	params.cb_eval_user_data = &cb_data;
	415	params.warmup = false;
	416
	417	print_build_info();
	418	llama_backend_init();
	419	llama_numa_init(params.numa);
	420
	421	// load the model to get hparams
	422	auto llama_init = common_init_from_params(params);
	423
	424	auto * model = llama_init->model();
	425	auto * ctx = llama_init->context();
	426
	427	// int n_ctx = llama_n_ctx(ctx);
	428	int n_layers = llama_model_n_layer(model);
	429	int n_embd = llama_model_n_embd(model);
	430
	431	// get model hint param (a.k.a model arch name)
	432	char model_hint[128];
	433	llama_model_meta_val_str(model, "general.architecture", model_hint, 128);
	434
	435	// init train_context
	436	train_context ctx_train(n_embd, n_layers);
	437
	438	// load and prepare entries for training
	439	prepare_entries(params, ctx_train);
	440
	441	// we have to pretokenize everything because otherwise we don't know how much overhead to allocate ctx_diffs_wrapped
	442	std::vector<tokenized_prompt> tokenized_prompts;
	443	size_t n_total_tokens = 0;
	444	for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
	445	tokenized_prompt t(ctx, ctx_train.positive_entries[i], ctx_train.negative_entries[i]);
	446	n_total_tokens += 2 * t.max_seq_len;
	447	tokenized_prompts.push_back(std::move(t));
	448	}
	449
	450	std::cout << "n_total_tokens: " << n_total_tokens << std::endl;
	451
	452	for(size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
	453	bool success = false;
	454	tokenized_prompt t = tokenized_prompts[i];
	455	cb_data.n_layers = n_layers;
	456	cb_data.n_tokens = t.max_seq_len;
	457
	458	printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n",
	459	(int) i+1, (int) ctx_train.positive_entries.size(),
	460	tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(),
	461	tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(),
	462	(int) t.max_seq_len);
	463
	464	cb_data.is_eval_pos = true;
	465	success = get_hidden_layers(ctx, t.tokens_pos);
	466	if (!success) break;
	467
	468	cb_data.is_eval_pos = false;
	469	success = get_hidden_layers(ctx, t.tokens_neg);
	470	if (!success) break;
	471
	472	// calculate diff and remove all zero rows
	473	auto v_diff_filtered = cb_data.calc_diff();
	474
	475	// save & concat the filtered v_diff to ctx_train
	476	ctx_train.concat_diff_tmp(v_diff_filtered);
	477
	478	// reset for next iteration
	479	cb_data.reset();
	480	}
	481
	482	// done with the model, we can now free it to make gain some memory
	483	printf("Done evaluate prompts, unload model...\n");
	484
	485	bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;
	486
	487	// prepare ctx_train for PCA
	488	ctx_train.build_v_diff(use_pca);
	489
	490	if (use_pca) {
	491	// run PCA
	492	PCA::pca_params pca_params;
	493	pca_params.n_threads = params.cpuparams.n_threads;
	494	pca_params.n_batch = params.n_pca_batch;
	495	pca_params.n_iterations = params.n_pca_iterations;
	496	PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
	497	} else {
	498	// run mean
	499	mean::run(ctx_train.v_diff, ctx_train.v_final);
	500	}
	501
	502	// write output vectors to gguf
	503	export_gguf(ctx_train.v_final, params.out_file, model_hint);
	504
	505	llama_backend_free();
	506
	507	return 0;
	508	}