1#include "ggml.h"
  2#include "gguf.h"
  3
  4#include "arg.h"
  5#include "common.h"
  6#include "llama.h"
  7#include "pca.hpp"
  8#include "mean.hpp"

#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif

#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif

#include <algorithm>
#include <climits>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <tuple>
#include <vector>

//////////////////////////////////////////////////
// utils

template <class Iter>
static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
    std::string ret;
    for (; begin != end; ++begin) {
        ret += common_token_to_piece(ctx, *begin);
    }

    return ret;
}

static void print_usage(int, char ** argv) {
    printf("\nexample usage:\n");
    printf("\n    CPU only:   %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
    printf("\n    with GPU:   %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
    printf("\n    advanced:   %s -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", argv[0]);
    printf("\n    using mean: %s -m ./llama-3.Q4_K_M.gguf --method mean\n", argv[0]);
    printf("\n");
}

//////////////////////////////////////////////////

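// Overview: for each (positive, negative) prompt pair, both prompts are decoded and the
// per-layer hidden states ("l_out") are captured through the eval callback below; the
// collected diffs (positive - negative) are then reduced to one control vector per layer
// using either PCA or the mean method.
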
// cb_eval is reused for each pair of positive/negative prompts
struct callback_data {
    ggml_context * ctx_ggml = nullptr;   // holds v_pos, v_neg, v_diff_filtered

    int n_layers = 0;
    int n_tokens = 0;
    bool is_eval_pos = true;

    // each element of the vector corresponds to one layer
    std::vector<struct ggml_tensor *> v_pos; // vector of matrices of size [n_embd, n_tokens]
    std::vector<struct ggml_tensor *> v_neg; // vector of matrices of size [n_embd, n_tokens]
    std::vector<struct ggml_tensor *> v_diff_filtered;   // vector of matrices of size [n_embd, n_nonzero_rows]. NOTE: n_nonzero_rows may be different for each layer

    // save a tensor into either v_pos or v_neg (decided by is_eval_pos)
    void save_tensor_for_layer(struct ggml_tensor * t) {
        GGML_ASSERT(t->type == GGML_TYPE_F32);

        if (ctx_ggml == nullptr) {
            // alloc a new ctx_ggml if needed
            struct ggml_init_params params_ggml = {
                /*.mem_size   =*/ ggml_tensor_overhead() * n_layers * 3u,
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };
            ctx_ggml = ggml_init(params_ggml);
        }

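        // the source tensor may live in a backend (e.g. GPU) buffer, so copy it into
        // host memory; ctx_ggml was created with no_alloc, hence the manual malloc below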
        // copy tensor data
        auto n_bytes = ggml_nbytes(t);
        struct ggml_tensor * t_layer = ggml_new_tensor_2d(ctx_ggml, t->type, t->ne[0], t->ne[1]);
        t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow
        ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes);
        ggml_set_name(t_layer, ggml_get_name(t));
        //print_debug_tensor(t_layer);

        if (is_eval_pos) {
            v_pos.push_back(t_layer);
        } else {
            v_neg.push_back(t_layer);
        }
    }

    // calculate diff (v_pos - v_neg) and place the result back into v_pos
    // all-zero rows in the diff tensor will also be removed
    // NOTE: the final layer is ignored, so we only have (n_layers - 1) layers to process
    std::vector<struct ggml_tensor *> calc_diff() {
        for (size_t il = 0; il < v_pos.size(); il++) {
            float * a = (float *) v_pos[il]->data;
            float * b = (float *) v_neg[il]->data;
            size_t n_elem = ggml_nelements(v_pos[il]);
            for (size_t j = 0; j < n_elem; j++) {
                a[j] -= b[j];
            }
            //print_debug_tensor(v_pos[il]);
            auto diff_filtered = filter_nonzero_rows(v_pos[il]);
            v_diff_filtered.push_back(diff_filtered);
        }
        return v_diff_filtered; // for convenience, we return the result std::vector
    }

    // delete zero rows from a given 2D tensor
    struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) {
        //printf("filter_nonzero_rows\n");
        auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool {
            // check if the given row contains only zero elements
            int n_cols = t->ne[0]; // hint: should be equal to n_embd
            for (int col = 0; col < n_cols; ++col) {
                // compare absolute values, since diff elements can be negative
                if (std::fabs(ggml_get_f32_nd(t, col, row, 0, 0)) > eps) {
                    return false;
                }
            }
            return true;
        };
        std::vector<int> rows_to_copy; // the indices of the non-zero rows (to be copied into diff_filtered)
        for (int i_row = 0; i_row < a->ne[1]; i_row++) {
            if (!is_row_all_zeros(a, i_row, 1e-6f)) {
                rows_to_copy.push_back(i_row);
            }
        }

        // get "n_nonzero_rows" for the output "diff_filtered"
        int n_nonzero_rows = (int) rows_to_copy.size();
        //printf("n_nonzero_rows: %d\n", n_nonzero_rows);
        int n_embd = a->ne[0];
        GGML_ASSERT(n_nonzero_rows > 0);

        // diff_filtered: [n_embd, n_nonzero_rows]
        struct ggml_tensor * diff_filtered = ggml_new_tensor_2d(
            ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows);
        ggml_format_name(diff_filtered, "diff_filtered_%s", a->name);
        diff_filtered->data = malloc(ggml_nbytes(diff_filtered));

        // copy the non-zero rows
        for (int dest_row = 0; dest_row < n_nonzero_rows; dest_row++) {
            int src_row = rows_to_copy[dest_row];
            for (int i = 0; i < n_embd; i++) {
                float src_elem = ggml_get_f32_nd(a, i, src_row, 0, 0);
                ggml_set_f32_nd(diff_filtered, i, dest_row, 0, 0, src_elem);
            }
        }

        //print_debug_tensor(diff_filtered);

        return diff_filtered;
    }

    // we don't implement a destructor, because we want to reuse callback_data; we just free the tensors
    void reset() {
        for (auto ptr : v_pos) free(ptr->data);
        for (auto ptr : v_neg) free(ptr->data);
        for (auto ptr : v_diff_filtered) free(ptr->data);
        v_pos.clear();
        v_neg.clear();
        v_diff_filtered.clear();
        if (ctx_ggml) {
            ggml_free(ctx_ggml);
        }
        ctx_ggml = nullptr;
    }
};
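
// typical lifecycle of callback_data for one prompt pair (see the loop in main):
//   is_eval_pos = true  -> decode the positive prompt (cb_eval fills v_pos)
//   is_eval_pos = false -> decode the negative prompt (cb_eval fills v_neg)
//   calc_diff()         -> v_pos - v_neg, with all-zero rows filtered out
//   reset()             -> free everything before the next pair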

/**
 * train_context stores the ggml context used for pre/post-processing the diff vectors
 * in short, input => v_diff and output => v_final
 */
struct train_context {
    ggml_context * ctx_ggml;
    int n_embd;
    int n_layers;

    /* pairs of prompts to be used for generating the final vector */
    std::vector<std::string> positive_entries;
    std::vector<std::string> negative_entries;

    // each element of the vector corresponds to one layer
    // NOTE: the last layer is discarded, therefore we will have (n_layers - 1) elements here
    // NOTE (2): v_diff is transposed from v_diff_tmp
    std::vector<struct ggml_tensor *> v_diff;  // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero rows)
    std::vector<struct ggml_tensor *> v_final; // vector of vectors of size [n_embd] to be written to file

    // to easily re-alloc when concatenating into v_diff, we temporarily store v_diff in a std::vector instead of a tensor
    // v_diff_tmp will get converted into v_diff later on
    std::vector<std::vector<uint8_t>> v_diff_tmp;

    train_context(int n_embd_, int n_layers_) {
        n_embd = n_embd_;
        n_layers = n_layers_;
        struct ggml_init_params params_ggml = {
            /*.mem_size   =*/ ggml_tensor_overhead() * (n_layers - 1) * 2u,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,
        };
        ctx_ggml = ggml_init(params_ggml);
        for (int il = 0; il < n_layers - 1; il++) {
            std::vector<uint8_t> empty;
            v_diff_tmp.push_back(empty);
            auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd);
            t->data = malloc(ggml_nbytes(t)); // TODO: get rid of this malloc if possible
            v_final.push_back(t);
        }
    }

    // add new rows into the existing tensors in v_diff_tmp
    void concat_diff_tmp(const std::vector<struct ggml_tensor *> & diff_filtered) {
        GGML_ASSERT((int) diff_filtered.size() == n_layers - 1);
        for (int il = 0; il < n_layers - 1; il++) {
            auto t = diff_filtered[il];
            auto & diff_tmp = v_diff_tmp[il];
            size_t curr_size = diff_tmp.size();
            diff_tmp.resize(curr_size + ggml_nbytes(t));
            memcpy(diff_tmp.data() + curr_size, t->data, ggml_nbytes(t));
        }
    }

    // build the v_diff tensors from v_diff_tmp (for PCA, v_diff needs to be transposed)
    void build_v_diff(bool transpose) {
        printf("build_v_diff\n");
        for (int il = 0; il < n_layers - 1; il++) {
            auto & diff_tmp = v_diff_tmp[il];
            int n_elem = diff_tmp.size() / sizeof(float);
            GGML_ASSERT(n_elem % n_embd == 0);
            int n_rows = n_elem / n_embd;
            struct ggml_tensor * diff = transpose
                ? ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd)
                : ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_embd, n_rows);
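            // ggml lists dimensions innermost-first (ne[0] = row length, ne[1] = number of
            // rows), so swapping n_rows and n_embd above produces the transposed layout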
            ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
            diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
            if (transpose) {
                // copy data & transpose
                float * arr = (float *) diff_tmp.data();
                for (int ir = 0; ir < n_rows; ++ir) {
                    for (int ic = 0; ic < n_embd; ++ic) {
                        float f = arr[ir*n_embd + ic];
                        ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
                    }
                }
            } else {
                // only copy
                memcpy(diff->data, diff_tmp.data(), ggml_nbytes(diff));
            }
            v_diff.push_back(diff);
            print_debug_tensor(diff);
            // release the memory of diff_tmp (resize(0) alone would keep the capacity allocated)
            std::vector<uint8_t>().swap(diff_tmp);
        }
    }

    ~train_context() {
        for (auto ptr : v_final) free(ptr->data);
        for (auto ptr : v_diff) free(ptr->data);
        // no need to free v_diff_tmp, since we didn't use malloc for it
        ggml_free(ctx_ggml);
    }
};

struct tokenized_prompt {
    std::vector<llama_token> tokens_pos;
    std::vector<llama_token> tokens_neg;
    size_t max_seq_len;

    tokenized_prompt(llama_context * ctx, const std::string & pos, const std::string & neg) {
        const llama_model * model = llama_get_model(ctx);
        const llama_vocab * vocab = llama_model_get_vocab(model);
        const bool add_bos = llama_vocab_get_add_bos(vocab);
        tokens_pos = common_tokenize(ctx, pos, add_bos, true);
        tokens_neg = common_tokenize(ctx, neg, add_bos, true);
        max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
        padding_seq(ctx, tokens_pos, max_seq_len);
        padding_seq(ctx, tokens_neg, max_seq_len);
    }

    void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) {
        // TODO: customize padding token
        std::vector<llama_token> pad_tokens = common_tokenize(ctx, " ", false);
        llama_token pad_tok = pad_tokens.back();
        while (tokens.size() < len) {
            tokens.push_back(pad_tok);
        }
    }
};
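
// NOTE: both sequences are padded to the same max_seq_len so that the positive and
// negative hidden states align token-by-token when calc_diff() subtracts them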

//////////////////////////////////////////////////

template <typename T>
static std::string to_string(const T & val) {
    std::stringstream ss;
    ss << val;
    return ss.str();
}

static std::vector<std::string> ctrlvec_load_prompt_file(const std::string & path, bool skip_empty_lines) {
    std::vector<std::string> output;
    std::ifstream file(path);
    if (!file.is_open()) {
        fprintf(stderr, "error: unable to open file: %s\n", path.c_str());
        exit(1);
    }
    std::string line;
    while (std::getline(file, line)) {
        bool is_skip = skip_empty_lines && line.empty();
        if (!is_skip) {
            string_process_escapes(line);
            output.push_back(line);
        }
    }
    file.close();
    return output;
}

//////////////////////////////////////////////////

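// the backend scheduler invokes this callback twice per graph node: first with
// ask == true ("does the user want to observe this node?"); if we return true,
// it is called again with ask == false once the node's data has been computed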
static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
    auto * cb_data = (callback_data *) user_data;
    static const char * l_out_name = "l_out";
    const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0;

    if (ask) {
        return is_l_out;
    }

    if (!is_l_out || t->ne[1] != cb_data->n_tokens) {
        return true;
    }

    // save the tensor to the current context
    cb_data->save_tensor_for_layer(t);
    return true;
}

static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
    llama_memory_clear(llama_get_memory(ctx), true);
    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
        fprintf(stderr, "%s : failed to eval\n", __func__);
        return false;
    }
    return true;
}
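
// NOTE: llama_memory_clear wipes the KV cache, so each prompt is decoded from
// position 0 and the captured hidden states are independent across prompt pairs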

static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const std::string & fname, const std::string & model_hint) {
    struct gguf_context * ctx = gguf_init_empty();

    const std::string arch = "controlvector";
    gguf_set_val_str(ctx, "general.architecture", arch.c_str());
    gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str());
    gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), (int32_t) v_ctrl.size());

    for (size_t i = 0; i < v_ctrl.size(); ++i) {
        gguf_add_tensor(ctx, v_ctrl[i]);
        print_debug_tensor(v_ctrl[i]);
        printf("Added tensor: %s\n", v_ctrl[i]->name);
    }

    printf("%s: writing file...\n", __func__);
    gguf_write_to_file(ctx, fname.c_str(), false);
    printf("%s: wrote file '%s'\n", __func__, fname.c_str());
    gguf_free(ctx);
}
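
// the resulting gguf can then be applied at inference time, e.g. with llama-cli
// (assuming the default output file name; --control-vector-scaled takes a strength):
//   llama-cli -m model.gguf --control-vector control_vector.gguf
//   llama-cli -m model.gguf --control-vector-scaled control_vector.gguf 0.8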

/**
 * Load the positive and negative prompt files.
 * Each line pair (positive[i], negative[i]) makes one training entry.
 */
static int prepare_entries(common_params & params, train_context & ctx_train) {
    // load prompts
    std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true);
    std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true);
    if (positive_prompts.size() != negative_prompts.size()) {
        fprintf(stderr, "number of positive and negative prompts must be equal\n");
        return 1;
    }
    if (positive_prompts.empty()) {
        fprintf(stderr, "must provide at least one prompt pair\n");
        return 1;
    }
    ctx_train.positive_entries = positive_prompts;
    ctx_train.negative_entries = negative_prompts;
    return 0;
}

int main(int argc, char ** argv) {
    common_params params;

    params.out_file = "control_vector.gguf";

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
        return 1;
    }

    if (params.n_pca_iterations % params.n_pca_batch != 0) {
        fprintf(stderr, "PCA iterations must be a multiple of PCA batch size\n");
        return 1;
    }
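    // (the PCA implementation in pca.hpp runs its power iterations in batches of
    // n_pca_batch, so the iteration count must divide evenly into batches)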

    callback_data cb_data;

    // pass the callback to the backend scheduler
    // it will be executed for each node during the graph computation
    params.cb_eval = cb_eval;
    params.cb_eval_user_data = &cb_data;
    params.warmup = false;

    print_build_info();
    llama_backend_init();
    llama_numa_init(params.numa);

    // load the model to get hparams
    auto llama_init = common_init_from_params(params);

    auto * model = llama_init->model();
    auto * ctx   = llama_init->context();

    // int n_ctx = llama_n_ctx(ctx);
    int n_layers = llama_model_n_layer(model);
    int n_embd = llama_model_n_embd(model);

    // get model hint param (a.k.a model arch name)
    char model_hint[128];
    llama_model_meta_val_str(model, "general.architecture", model_hint, sizeof(model_hint));

    // init train_context
    train_context ctx_train(n_embd, n_layers);

    // load and prepare entries for training
    if (prepare_entries(params, ctx_train) != 0) {
        return 1;
    }

    // we have to pretokenize everything up front, because otherwise we wouldn't know
    // how much memory overhead to allocate for ctx_diffs_wrapped
    std::vector<tokenized_prompt> tokenized_prompts;
    size_t n_total_tokens = 0;
    for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
        tokenized_prompt t(ctx, ctx_train.positive_entries[i], ctx_train.negative_entries[i]);
        n_total_tokens += 2 * t.max_seq_len;
        tokenized_prompts.push_back(std::move(t));
    }

    printf("n_total_tokens: %zu\n", n_total_tokens);

    for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) {
        bool success = false;
        tokenized_prompt & t = tokenized_prompts[i];
        cb_data.n_layers = n_layers;
        cb_data.n_tokens = t.max_seq_len;

        printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n",
            (int) i+1, (int) ctx_train.positive_entries.size(),
            tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(),
            tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(),
            (int) t.max_seq_len);

        cb_data.is_eval_pos = true;
        success = get_hidden_layers(ctx, t.tokens_pos);
        if (!success) break;

        cb_data.is_eval_pos = false;
        success = get_hidden_layers(ctx, t.tokens_neg);
        if (!success) break;

        // calculate the diff and remove all-zero rows
        auto v_diff_filtered = cb_data.calc_diff();

        // save & concat the filtered v_diff into ctx_train
        ctx_train.concat_diff_tmp(v_diff_filtered);

        // reset for the next iteration
        cb_data.reset();
    }

    // done with the model, we can now free it to gain back some memory
    printf("Done evaluating prompts, unloading model...\n");

    bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;

    // prepare ctx_train for PCA
    ctx_train.build_v_diff(use_pca);

    if (use_pca) {
        // run PCA
        PCA::pca_params pca_params;
        pca_params.n_threads    = params.cpuparams.n_threads;
        pca_params.n_batch      = params.n_pca_batch;
        pca_params.n_iterations = params.n_pca_iterations;
        PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
    } else {
        // run mean
        mean::run(ctx_train.v_diff, ctx_train.v_final);
    }
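
    // whichever branch ran, v_final now holds one direction vector of size [n_embd] per layer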

    // write the output vectors to gguf
    export_gguf(ctx_train.v_final, params.out_file, model_hint);

    llama_backend_free();

    return 0;
}