diff options
Diffstat (limited to 'llama.cpp/tools/cvector-generator/cvector-generator.cpp')
| -rw-r--r-- | llama.cpp/tools/cvector-generator/cvector-generator.cpp | 508 |
1 files changed, 508 insertions, 0 deletions
diff --git a/llama.cpp/tools/cvector-generator/cvector-generator.cpp b/llama.cpp/tools/cvector-generator/cvector-generator.cpp new file mode 100644 index 0000000..3ba7c52 --- /dev/null +++ b/llama.cpp/tools/cvector-generator/cvector-generator.cpp | |||
| @@ -0,0 +1,508 @@ | |||
| 1 | #include "ggml.h" | ||
| 2 | #include "gguf.h" | ||
| 3 | |||
| 4 | #include "arg.h" | ||
| 5 | #include "common.h" | ||
| 6 | #include "llama.h" | ||
| 7 | #include "pca.hpp" | ||
| 8 | #include "mean.hpp" | ||
| 9 | |||
| 10 | #ifdef GGML_USE_CUDA | ||
| 11 | #include "ggml-cuda.h" | ||
| 12 | #endif | ||
| 13 | |||
| 14 | #ifdef GGML_USE_METAL | ||
| 15 | #include "ggml-metal.h" | ||
| 16 | #endif | ||
| 17 | |||
| 18 | #include <algorithm> | ||
| 19 | #include <climits> | ||
| 20 | #include <cstdio> | ||
| 21 | #include <cstring> | ||
| 22 | #include <fstream> | ||
| 23 | #include <iostream> | ||
| 24 | #include <string> | ||
| 25 | #include <tuple> | ||
| 26 | #include <vector> | ||
| 27 | |||
| 28 | |||
| 29 | ////////////////////////////////////////////////// | ||
| 30 | // utils | ||
| 31 | |||
| 32 | template <class Iter> | ||
| 33 | static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { | ||
| 34 | std::string ret; | ||
| 35 | for (; begin != end; ++begin) { | ||
| 36 | ret += common_token_to_piece(ctx, *begin); | ||
| 37 | } | ||
| 38 | |||
| 39 | return ret; | ||
| 40 | } | ||
| 41 | |||
// Print example command lines to stdout. The first argument is unused;
// argv[0] is substituted as the program name in every example.
static void print_usage(int, char ** argv) {
    const char * prog = argv[0];

    fputs("\nexample usage:\n", stdout);
    printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", prog);
    printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", prog);
    printf("\n advanced: %s -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", prog);
    printf("\n using mean: %s -m ./llama-3.Q4_K_M.gguf --method mean\n", prog);
    fputs("\n", stdout);
}
| 50 | |||
| 51 | ////////////////////////////////////////////////// | ||
| 52 | |||
| 53 | |||
| 54 | // cb_eval is reused for each pair of positive - negative prompt | ||
| 55 | struct callback_data { | ||
| 56 | ggml_context * ctx_ggml = nullptr; // holds v_pos, v_neg, v_diff_filtered | ||
| 57 | |||
| 58 | int n_layers = 0; | ||
| 59 | int n_tokens = 0; | ||
| 60 | bool is_eval_pos = true; | ||
| 61 | |||
| 62 | // each element of the vector correspond to one layer | ||
| 63 | std::vector<struct ggml_tensor *> v_pos; // vector of matrices of size [n_embd, n_tokens] | ||
| 64 | std::vector<struct ggml_tensor *> v_neg; // vector of matrices of size [n_embd, n_tokens] | ||
| 65 | std::vector<struct ggml_tensor *> v_diff_filtered; // vector of matrices of size [n_embd, n_nonzero_rows]. NOTE: n_nonzero_rows maybe different for each layer | ||
| 66 | |||
| 67 | // save a tensor into either v_pos or v_neg (decided by is_eval_pos) | ||
| 68 | void save_tensor_for_layer(struct ggml_tensor * t) { | ||
| 69 | GGML_ASSERT(t->type == GGML_TYPE_F32); | ||
| 70 | |||
| 71 | if (ctx_ggml == nullptr) { | ||
| 72 | // alloc a new ctx_ggml if needed | ||
| 73 | struct ggml_init_params params_ggml = { | ||
| 74 | /*.mem_size =*/ ggml_tensor_overhead() * n_layers * 3u, | ||
| 75 | /*.mem_buffer =*/ NULL, | ||
| 76 | /*.no_alloc =*/ true, | ||
| 77 | }; | ||
| 78 | ctx_ggml = ggml_init(params_ggml); | ||
| 79 | } | ||
| 80 | |||
| 81 | // copy tensor data | ||
| 82 | auto n_bytes = ggml_nbytes(t); | ||
| 83 | struct ggml_tensor * t_layer = ggml_new_tensor_2d(ctx_ggml, t->type, t->ne[0], t->ne[1]); | ||
| 84 | t_layer->data = malloc(n_bytes); // TODO @ngxson : get rid of this malloc somehow | ||
| 85 | ggml_backend_tensor_get(t, t_layer->data, 0, n_bytes); | ||
| 86 | ggml_set_name(t_layer, ggml_get_name(t)); | ||
| 87 | //print_debug_tensor(t_layer); | ||
| 88 | |||
| 89 | if (is_eval_pos) { | ||
| 90 | v_pos.push_back(t_layer); | ||
| 91 | } else { | ||
| 92 | v_neg.push_back(t_layer); | ||
| 93 | } | ||
| 94 | } | ||
| 95 | |||
| 96 | // calculate diff (v_pos - v_neg) and place the result back to v_pos | ||
| 97 | // all zero rows in the diff tensor will also be removed | ||
| 98 | // NOTE: final layer is ignored. we only have (n_layers - 1) to process | ||
| 99 | std::vector<struct ggml_tensor *> calc_diff() { | ||
| 100 | for (float il = 0; il < v_pos.size(); il++) { | ||
| 101 | float * a = (float *) v_pos[il]->data; | ||
| 102 | float * b = (float *) v_neg[il]->data; | ||
| 103 | size_t n_elem = ggml_nelements(v_pos[il]); | ||
| 104 | for (size_t j = 0; j < n_elem; j++) { | ||
| 105 | a[j] -= b[j]; | ||
| 106 | } | ||
| 107 | //print_debug_tensor(v_pos[i]); | ||
| 108 | auto diff_filtered = filter_nonzero_rows(v_pos[il]); | ||
| 109 | v_diff_filtered.push_back(diff_filtered); | ||
| 110 | } | ||
| 111 | return v_diff_filtered; // for convinient, we return the result std::vector | ||
| 112 | } | ||
| 113 | |||
| 114 | // delete zero rows from a given 2D tensor | ||
| 115 | struct ggml_tensor * filter_nonzero_rows(struct ggml_tensor * a) { | ||
| 116 | //printf("filter_nonzero_rows\n"); | ||
| 117 | auto is_row_all_zeros = [](struct ggml_tensor * t, int row, float eps) -> bool { | ||
| 118 | // check if given row containing all zero elements | ||
| 119 | int n_cols = t->ne[0]; // hint: should be equal to n_embd | ||
| 120 | for (int col = 0; col < n_cols; ++col) { | ||
| 121 | if (ggml_get_f32_nd(t, col, row, 0, 0) > eps) { | ||
| 122 | return false; | ||
| 123 | } | ||
| 124 | } | ||
| 125 | return true; | ||
| 126 | }; | ||
| 127 | std::vector<int> rows_to_copy; // the idx of non-zero cols (to be copied to row of diff_filtered) | ||
| 128 | for (int i_row = 0; i_row < a->ne[1]; i_row++) { | ||
| 129 | if (!is_row_all_zeros(a, i_row, 1e-6)) { | ||
| 130 | rows_to_copy.push_back(i_row); | ||
| 131 | } | ||
| 132 | } | ||
| 133 | |||
| 134 | // get "n_nonzero_rows" for the output "diff_filtered" | ||
| 135 | int n_nonzero_rows = rows_to_copy.size(); | ||
| 136 | //printf("n_nonzero_rows: %d\n", n_nonzero_rows); | ||
| 137 | int n_embd = a->ne[0]; | ||
| 138 | GGML_ASSERT(n_nonzero_rows > 0); | ||
| 139 | |||
| 140 | // diff_filtered: [n_embd, n_nonzero_rows] | ||
| 141 | struct ggml_tensor * diff_filtered = ggml_new_tensor_2d( | ||
| 142 | ctx_ggml, GGML_TYPE_F32, n_embd, n_nonzero_rows); | ||
| 143 | ggml_format_name(diff_filtered, "diff_filtered_%s", a->name); | ||
| 144 | diff_filtered->data = malloc(ggml_nbytes(diff_filtered)); | ||
| 145 | |||
| 146 | // copy non-zero rows | ||
| 147 | for (int dest_row = 0; dest_row < n_nonzero_rows; dest_row++) { | ||
| 148 | int src_row = rows_to_copy[dest_row]; | ||
| 149 | for (int i = 0; i < n_embd; i++) { | ||
| 150 | float src_elem = ggml_get_f32_nd(a, i, src_row, 0, 0); | ||
| 151 | ggml_set_f32_nd(diff_filtered, i, dest_row, 0, 0, src_elem); | ||
| 152 | } | ||
| 153 | } | ||
| 154 | |||
| 155 | //print_debug_tensor(diff_filtered); | ||
| 156 | |||
| 157 | return diff_filtered; | ||
| 158 | } | ||
| 159 | |||
| 160 | // we don't implement destructor, because we want to reuse callback_data. we just want to free the tensors | ||
| 161 | void reset() { | ||
| 162 | for (auto ptr : v_pos) free(ptr->data); | ||
| 163 | for (auto ptr : v_neg) free(ptr->data); | ||
| 164 | for (auto ptr : v_diff_filtered) free(ptr->data); | ||
| 165 | v_pos.clear(); | ||
| 166 | v_neg.clear(); | ||
| 167 | v_diff_filtered.clear(); | ||
| 168 | if (ctx_ggml) { | ||
| 169 | ggml_free(ctx_ggml); | ||
| 170 | } | ||
| 171 | ctx_ggml = nullptr; | ||
| 172 | } | ||
| 173 | }; | ||
| 174 | |||
| 175 | /** | ||
| 176 | * process_ctx is used to store the ggml context for pre-post processing the diff vectors | ||
| 177 | * in short, input => v_diff and output => v_final | ||
| 178 | */ | ||
| 179 | struct train_context { | ||
| 180 | ggml_context * ctx_ggml; | ||
| 181 | int n_embd; | ||
| 182 | int n_layers; | ||
| 183 | |||
| 184 | /* pair of prompts to be used for generating final vector */ | ||
| 185 | std::vector<std::string> positive_entries; | ||
| 186 | std::vector<std::string> negative_entries; | ||
| 187 | |||
| 188 | // each element of the vector correspond to one layer | ||
| 189 | // NOTE: the last layer is discard. therefore, we will have (n_layers - 1) elements here | ||
| 190 | // NOTE (2): v_diff is transposed from v_diff_tmp | ||
| 191 | std::vector<struct ggml_tensor *> v_diff; // vector of matrices of size [m, n_embd] where m ~ n_tokens * n_completions (v_diff contains no zero-rows) | ||
| 192 | std::vector<struct ggml_tensor *> v_final; // vector of vectors of size [n_embd] to be written to file | ||
| 193 | |||
| 194 | // to easily re-alloc when concat v_diff, we temporary store v_diff in a vector instead of a tensor | ||
| 195 | // v_diff_tmp will get converted unto v_diff later on | ||
| 196 | std::vector<std::vector<uint8_t>> v_diff_tmp; | ||
| 197 | |||
| 198 | train_context(int n_embd_, int n_layers_) { | ||
| 199 | n_embd = n_embd_; | ||
| 200 | n_layers = n_layers_; | ||
| 201 | struct ggml_init_params params_ggml = { | ||
| 202 | /*.mem_size =*/ ggml_tensor_overhead() * (n_layers - 1) * 2u, | ||
| 203 | /*.mem_buffer =*/ NULL, | ||
| 204 | /*.no_alloc =*/ true, | ||
| 205 | }; | ||
| 206 | ctx_ggml = ggml_init(params_ggml); | ||
| 207 | for (int il = 0; il < n_layers - 1; il++) { | ||
| 208 | std::vector<uint8_t> empty; | ||
| 209 | v_diff_tmp.push_back(empty); | ||
| 210 | auto t = ggml_new_tensor_1d(ctx_ggml, GGML_TYPE_F32, n_embd); | ||
| 211 | t->data = malloc(ggml_nbytes(t)); // TODO: get rid of malloc if possible | ||
| 212 | v_final.push_back(t); | ||
| 213 | } | ||
| 214 | } | ||
| 215 | |||
| 216 | // add new rows into existing tensor in v_diff_tmp | ||
| 217 | void concat_diff_tmp(const std::vector<struct ggml_tensor *> & diff_filtered) { | ||
| 218 | GGML_ASSERT((int) diff_filtered.size() == n_layers - 1); | ||
| 219 | for (int il = 0; il < n_layers - 1; il++) { | ||
| 220 | auto t = diff_filtered[il]; | ||
| 221 | auto & diff_tmp = v_diff_tmp[il]; | ||
| 222 | size_t curr_size = diff_tmp.size(); | ||
| 223 | diff_tmp.resize(curr_size + ggml_nbytes(t)); | ||
| 224 | memcpy(diff_tmp.data() + curr_size, t->data, ggml_nbytes(t)); | ||
| 225 | } | ||
| 226 | } | ||
| 227 | |||
| 228 | // build the v_diff tensors from v_diff_tmp (v_diff need to be transposed) | ||
| 229 | // TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method | ||
| 230 | void build_v_diff(bool transpose) { | ||
| 231 | printf("build_v_diff\n"); | ||
| 232 | for (int il = 0; il < n_layers - 1; il++) { | ||
| 233 | auto & diff_tmp = v_diff_tmp[il]; | ||
| 234 | int n_elem = diff_tmp.size() / sizeof(float); | ||
| 235 | GGML_ASSERT(n_elem % n_embd == 0); | ||
| 236 | int n_rows = n_elem / n_embd; | ||
| 237 | struct ggml_tensor * diff = transpose | ||
| 238 | ? ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd) | ||
| 239 | : ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_embd, n_rows); | ||
| 240 | ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str()); | ||
| 241 | diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible | ||
| 242 | if (transpose) { | ||
| 243 | // copy data & transpose | ||
| 244 | float * arr = (float *) diff_tmp.data(); | ||
| 245 | for (int ir = 0; ir < n_rows; ++ir) { | ||
| 246 | for (int ic = 0; ic < n_embd; ++ic) { | ||
| 247 | float f = arr[ir*n_embd + ic]; | ||
| 248 | ggml_set_f32_nd(diff, ir, ic, 0, 0, f); | ||
| 249 | } | ||
| 250 | } | ||
| 251 | } else { | ||
| 252 | // only copy | ||
| 253 | memcpy(diff->data, diff_tmp.data(), ggml_nbytes(diff)); | ||
| 254 | } | ||
| 255 | v_diff.push_back(diff); | ||
| 256 | print_debug_tensor(diff); | ||
| 257 | // free memory of diff_tmp | ||
| 258 | diff_tmp.resize(0); | ||
| 259 | } | ||
| 260 | } | ||
| 261 | |||
| 262 | ~train_context() { | ||
| 263 | for (auto ptr : v_final) free(ptr->data); | ||
| 264 | for (auto ptr : v_diff) free(ptr->data); | ||
| 265 | // no need to free v_diff_tmp, since we didn't use malloc | ||
| 266 | ggml_free(ctx_ggml); | ||
| 267 | } | ||
| 268 | }; | ||
| 269 | |||
| 270 | struct tokenized_prompt { | ||
| 271 | std::vector<llama_token> tokens_pos; | ||
| 272 | std::vector<llama_token> tokens_neg; | ||
| 273 | size_t max_seq_len; | ||
| 274 | |||
| 275 | tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) { | ||
| 276 | const llama_model * model = llama_get_model(ctx); | ||
| 277 | const llama_vocab * vocab = llama_model_get_vocab(model); | ||
| 278 | const bool add_bos = llama_vocab_get_add_bos(vocab); | ||
| 279 | tokens_pos = common_tokenize(ctx, pos, add_bos, true); | ||
| 280 | tokens_neg = common_tokenize(ctx, neg, add_bos, true); | ||
| 281 | max_seq_len = std::max(tokens_pos.size(), tokens_neg.size()); | ||
| 282 | padding_seq(ctx, tokens_pos, max_seq_len); | ||
| 283 | padding_seq(ctx, tokens_neg, max_seq_len); | ||
| 284 | } | ||
| 285 | |||
| 286 | void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) { | ||
| 287 | // TODO: customize padding token | ||
| 288 | std::vector<llama_token> pad_tokens = common_tokenize(ctx, " ", false); | ||
| 289 | llama_token pad_tok = pad_tokens.back(); | ||
| 290 | while (tokens.size() < len) { | ||
| 291 | tokens.push_back(pad_tok); | ||
| 292 | } | ||
| 293 | } | ||
| 294 | }; | ||
| 295 | |||
| 296 | ////////////////////////////////////////////////// | ||
| 297 | |||
// Convert any value that supports stream insertion (operator<<) to a std::string.
template <typename T>
static std::string to_string(const T & val) {
    std::stringstream buffer;
    buffer << val;
    std::string text = buffer.str();
    return text;
}
| 304 | |||
| 305 | static std::vector<std::string> ctrlvec_load_prompt_file(std::string path, bool skip_empty_lines) { | ||
| 306 | std::vector<std::string> output; | ||
| 307 | std::ifstream file(path); | ||
| 308 | if (!file.is_open()) { | ||
| 309 | fprintf(stderr, "error: unable to open file: %s\n", path.c_str()); | ||
| 310 | exit(1); | ||
| 311 | } | ||
| 312 | std::string line; | ||
| 313 | while (std::getline(file, line)) { | ||
| 314 | bool is_skip = skip_empty_lines && line.empty(); | ||
| 315 | if (!is_skip) { | ||
| 316 | string_process_escapes(line); | ||
| 317 | output.push_back(line); | ||
| 318 | } | ||
| 319 | } | ||
| 320 | file.close(); | ||
| 321 | return output; | ||
| 322 | } | ||
| 323 | |||
| 324 | ////////////////////////////////////////////////// | ||
| 325 | |||
| 326 | static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { | ||
| 327 | auto * cb_data = (callback_data *) user_data; | ||
| 328 | static const char * l_out_name = "l_out"; | ||
| 329 | const bool is_l_out = strncmp(t->name, l_out_name, strlen(l_out_name)) == 0; | ||
| 330 | |||
| 331 | if (ask) { | ||
| 332 | return is_l_out; | ||
| 333 | } | ||
| 334 | |||
| 335 | if (!is_l_out || t->ne[1] != cb_data->n_tokens) { | ||
| 336 | return true; | ||
| 337 | } | ||
| 338 | |||
| 339 | // save the tensor to current context | ||
| 340 | cb_data->save_tensor_for_layer(t); | ||
| 341 | return true; | ||
| 342 | } | ||
| 343 | |||
| 344 | static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) { | ||
| 345 | llama_memory_clear(llama_get_memory(ctx), true); | ||
| 346 | if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) { | ||
| 347 | fprintf(stderr, "%s : failed to eval\n", __func__); | ||
| 348 | return false; | ||
| 349 | } | ||
| 350 | return true; | ||
| 351 | } | ||
| 352 | |||
| 353 | static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const std::string fname, const std::string model_hint) { | ||
| 354 | struct gguf_context * ctx = gguf_init_empty(); | ||
| 355 | |||
| 356 | const std::string arch = "controlvector"; | ||
| 357 | gguf_set_val_str(ctx, "general.architecture", arch.c_str()); | ||
| 358 | gguf_set_val_str(ctx, (arch + ".model_hint").c_str(), model_hint.c_str()); | ||
| 359 | gguf_set_val_i32(ctx, (arch + ".layer_count").c_str(), v_ctrl.size()); | ||
| 360 | |||
| 361 | for (size_t i = 0; i < v_ctrl.size(); ++i) { | ||
| 362 | gguf_add_tensor(ctx, v_ctrl[i]); | ||
| 363 | print_debug_tensor(v_ctrl[i]); | ||
| 364 | printf("Added tensor: %s\n", v_ctrl[i]->name); | ||
| 365 | } | ||
| 366 | |||
| 367 | printf("%s: writing file...\n", __func__); | ||
| 368 | gguf_write_to_file(ctx, fname.c_str(), false); | ||
| 369 | printf("%s: wrote file '%s'\n", __func__, fname.c_str()); | ||
| 370 | gguf_free(ctx); | ||
| 371 | } | ||
| 372 | |||
| 373 | /** | ||
| 374 | * Load prompt files and completion file. | ||
| 375 | * Then format each pair of prompt + completion to make an entry. | ||
| 376 | */ | ||
| 377 | static int prepare_entries(common_params & params, train_context & ctx_train) { | ||
| 378 | // load prompts | ||
| 379 | std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true); | ||
| 380 | std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true); | ||
| 381 | if (positive_prompts.size() != negative_prompts.size()) { | ||
| 382 | fprintf(stderr, "number of positive and negative prompts must be equal\n"); | ||
| 383 | return 1; | ||
| 384 | } | ||
| 385 | if (positive_prompts.empty()) { | ||
| 386 | fprintf(stderr, "must provide at least one prompt pair\n"); | ||
| 387 | return 1; | ||
| 388 | } | ||
| 389 | ctx_train.positive_entries = positive_prompts; | ||
| 390 | ctx_train.negative_entries = negative_prompts; | ||
| 391 | return 0; | ||
| 392 | } | ||
| 393 | |||
| 394 | int main(int argc, char ** argv) { | ||
| 395 | common_params params; | ||
| 396 | |||
| 397 | params.out_file = "control_vector.gguf"; | ||
| 398 | |||
| 399 | if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) { | ||
| 400 | return 1; | ||
| 401 | } | ||
| 402 | |||
| 403 | if (params.n_pca_iterations % params.n_pca_batch != 0) { | ||
| 404 | fprintf(stderr, "PCA iterations must by multiply of PCA batch size\n"); | ||
| 405 | return 1; | ||
| 406 | } | ||
| 407 | |||
| 408 | |||
| 409 | callback_data cb_data; | ||
| 410 | |||
| 411 | // pass the callback to the backend scheduler | ||
| 412 | // it will be executed for each node during the graph computation | ||
| 413 | params.cb_eval = cb_eval; | ||
| 414 | params.cb_eval_user_data = &cb_data; | ||
| 415 | params.warmup = false; | ||
| 416 | |||
| 417 | print_build_info(); | ||
| 418 | llama_backend_init(); | ||
| 419 | llama_numa_init(params.numa); | ||
| 420 | |||
| 421 | // load the model to get hparams | ||
| 422 | auto llama_init = common_init_from_params(params); | ||
| 423 | |||
| 424 | auto * model = llama_init->model(); | ||
| 425 | auto * ctx = llama_init->context(); | ||
| 426 | |||
| 427 | // int n_ctx = llama_n_ctx(ctx); | ||
| 428 | int n_layers = llama_model_n_layer(model); | ||
| 429 | int n_embd = llama_model_n_embd(model); | ||
| 430 | |||
| 431 | // get model hint param (a.k.a model arch name) | ||
| 432 | char model_hint[128]; | ||
| 433 | llama_model_meta_val_str(model, "general.architecture", model_hint, 128); | ||
| 434 | |||
| 435 | // init train_context | ||
| 436 | train_context ctx_train(n_embd, n_layers); | ||
| 437 | |||
| 438 | // load and prepare entries for training | ||
| 439 | prepare_entries(params, ctx_train); | ||
| 440 | |||
| 441 | // we have to pretokenize everything because otherwise we don't know how much overhead to allocate ctx_diffs_wrapped | ||
| 442 | std::vector<tokenized_prompt> tokenized_prompts; | ||
| 443 | size_t n_total_tokens = 0; | ||
| 444 | for (size_t i = 0; i < ctx_train.positive_entries.size(); ++i) { | ||
| 445 | tokenized_prompt t(ctx, ctx_train.positive_entries[i], ctx_train.negative_entries[i]); | ||
| 446 | n_total_tokens += 2 * t.max_seq_len; | ||
| 447 | tokenized_prompts.push_back(std::move(t)); | ||
| 448 | } | ||
| 449 | |||
| 450 | std::cout << "n_total_tokens: " << n_total_tokens << std::endl; | ||
| 451 | |||
| 452 | for(size_t i = 0; i < ctx_train.positive_entries.size(); ++i) { | ||
| 453 | bool success = false; | ||
| 454 | tokenized_prompt t = tokenized_prompts[i]; | ||
| 455 | cb_data.n_layers = n_layers; | ||
| 456 | cb_data.n_tokens = t.max_seq_len; | ||
| 457 | |||
| 458 | printf("Evaluating prompt[%d/%d]: \"%s\" - \"%s\" (%d tokens)\n", | ||
| 459 | (int) i+1, (int) ctx_train.positive_entries.size(), | ||
| 460 | tokens_to_str(ctx, t.tokens_pos.cbegin(), t.tokens_pos.cend()).c_str(), | ||
| 461 | tokens_to_str(ctx, t.tokens_neg.cbegin(), t.tokens_neg.cend()).c_str(), | ||
| 462 | (int) t.max_seq_len); | ||
| 463 | |||
| 464 | cb_data.is_eval_pos = true; | ||
| 465 | success = get_hidden_layers(ctx, t.tokens_pos); | ||
| 466 | if (!success) break; | ||
| 467 | |||
| 468 | cb_data.is_eval_pos = false; | ||
| 469 | success = get_hidden_layers(ctx, t.tokens_neg); | ||
| 470 | if (!success) break; | ||
| 471 | |||
| 472 | // calculate diff and remove all zero rows | ||
| 473 | auto v_diff_filtered = cb_data.calc_diff(); | ||
| 474 | |||
| 475 | // save & concat the filtered v_diff to ctx_train | ||
| 476 | ctx_train.concat_diff_tmp(v_diff_filtered); | ||
| 477 | |||
| 478 | // reset for next iteration | ||
| 479 | cb_data.reset(); | ||
| 480 | } | ||
| 481 | |||
| 482 | // done with the model, we can now free it to make gain some memory | ||
| 483 | printf("Done evaluate prompts, unload model...\n"); | ||
| 484 | |||
| 485 | bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA; | ||
| 486 | |||
| 487 | // prepare ctx_train for PCA | ||
| 488 | ctx_train.build_v_diff(use_pca); | ||
| 489 | |||
| 490 | if (use_pca) { | ||
| 491 | // run PCA | ||
| 492 | PCA::pca_params pca_params; | ||
| 493 | pca_params.n_threads = params.cpuparams.n_threads; | ||
| 494 | pca_params.n_batch = params.n_pca_batch; | ||
| 495 | pca_params.n_iterations = params.n_pca_iterations; | ||
| 496 | PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final); | ||
| 497 | } else { | ||
| 498 | // run mean | ||
| 499 | mean::run(ctx_train.v_diff, ctx_train.v_final); | ||
| 500 | } | ||
| 501 | |||
| 502 | // write output vectors to gguf | ||
| 503 | export_gguf(ctx_train.v_final, params.out_file, model_hint); | ||
| 504 | |||
| 505 | llama_backend_free(); | ||
| 506 | |||
| 507 | return 0; | ||
| 508 | } | ||
