llmnpc - llama.cpp/tools/imatrix/imatrix.cpp

Path: llmnpc / llama.cpp / tools / imatrix / imatrix.cpp (raw)
   1#include "arg.h"
   2#include "common.h"
   3#include "log.h"
   4#include "llama.h"
   5#include "gguf.h"
   6
   7#include <algorithm>
   8#include <chrono>
   9#include <cmath>
  10#include <cstdio>
  11#include <cstring>
  12#include <ctime>
  13#include <thread>
  14#include <mutex>
  15#include <vector>
  16#include <fstream>
  17#include <unordered_map>
  18#include <map>
  19#include <regex>
  20#include <numeric>
  21
  22#if defined(_MSC_VER)
  23#pragma warning(disable: 4244 4267) // possible loss of data
  24#endif
  25
  26static void print_usage(int, char ** argv) {
  27    LOG("\nexample usage:\n");
  28    LOG("\n    %s \\\n"
  29            "       -m model.gguf -f some-text.txt [-o imatrix.gguf] [--output-format {gguf,dat}] [--no-ppl] \\\n"
  30            "       [--process-output] [--chunk 123] [--save-frequency 0] [--output-frequency 10] \\\n"
  31            "       [--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...] [--parse-special] \\\n"
  32            "       [--show-statistics] [...]\n" , argv[0]);
  33    LOG("\n");
  34}
  35
  36static const char * const LLM_KV_IMATRIX_DATASETS    = "imatrix.datasets";
  37static const char * const LLM_KV_IMATRIX_CHUNK_COUNT = "imatrix.chunk_count";
  38static const char * const LLM_KV_IMATRIX_CHUNK_SIZE  = "imatrix.chunk_size";
  39
  40struct Stats {
  41    std::vector<float>   values;
  42    std::vector<int64_t> counts;
  43};
  44
  45struct tensor_statistics {
  46    std::string tensor;
  47    Stats stats;
  48    float total_sqract = 0.0f;
  49    float mean_sqract  = 0.0f;
  50    float max_sqract   = 0.0f;
  51    float min_sqract   = 0.0f;
  52    int elements       = 0;
  53    float stddev       = 0.0f;
  54    float active       = 0.0f;
  55    float entropy      = 0.0f;
  56    float zd           = 0.0f;
  57    float cossim       = 0.0f;
  58};
  59
  60class IMatrixCollector {
  61public:
  62    IMatrixCollector() = default;
  63    void set_params(common_params params) { m_params = std::move(params); }
  64    bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
  65    void save_imatrix_legacy(int32_t ncall = -1) const;
  66    void save_imatrix(int32_t n_chunk = -1) const;
  67    bool load_imatrix_legacy(const char * fname);
  68    bool load_imatrix(const char * file_name);
  69    const std::unordered_map<std::string, Stats> & get_mstats() const { return m_stats; }
  70private:
  71    std::unordered_map<std::string, Stats> m_stats;
  72    common_params                          m_params;
  73    std::mutex                             m_mutex;
  74    std::vector<std::string>               m_datasets;
  75    int32_t                                m_last_chunk = 0;
  76    std::vector<char>                      m_src1_data;
  77    std::vector<char>                      m_ids; // the expert ids from ggml_mul_mat_id
  78};
  79
  80// remove any prefix and suffixes from the name
  81// CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight
  82static std::string filter_tensor_name(const char * name) {
  83    std::string wname;
  84    const char * p = strchr(name, '#');
  85    if (p != NULL) {
  86        p = p + 1;
  87        const char * q = strchr(p, '#');
  88        if (q != NULL) {
  89            wname = std::string(p, q - p);
  90        } else {
  91            wname = p;
  92        }
  93    } else {
  94        wname = name;
  95    }
  96    return wname;
  97}
  98
  99static void process_tensor_name(const std::string & input, std::string & layer, std::string & tensor) {
 100    std::vector<std::string> name;
 101    std::istringstream stream(input);
 102    std::string item;
 103
 104    while (std::getline(stream, item, '.')) {
 105        name.push_back(item);
 106    }
 107    for (size_t i = 0; i < name.size(); ++i) {
 108        if (name[i] == "blk" && i + 1 < name.size()) {
 109            layer = name[i + 1];
 110            break;
 111        }
 112    }
 113    for (size_t i = 0; i < name.size(); ++i) {
 114        if (name[i] == "weight" && i > 0) {
 115            tensor = name[i - 1];
 116            break;
 117        }
 118    }
 119
 120    if (tensor.empty()) {
 121        tensor = input;
 122    }
 123    if (layer.empty()) {
 124        layer = "-";
 125    }
 126}
 127
 128static void compute_statistics(std::vector<tensor_statistics> & tstats, const std::string & name, const Stats & e) {
 129    if (e.values.size() % e.counts.size() != 0) {
 130        LOG_ERR("%s: activation size mismatch for tensor %s (%zu vs %zu)\n", __func__, name.c_str(), e.counts.size(), e.values.size());
 131        return;
 132    }
 133    if (e.counts.empty()) {
 134        LOG_ERR("%s: there are no activations for tensor %s. The imatrix may be suboptimal\n", __func__, name.c_str());
 135        return;
 136    }
 137
 138    const int n_mat = e.counts.size();
 139    const int row_size = e.values.size() / n_mat;
 140
 141    std::vector<float> activations;
 142    activations.reserve(e.values.size());
 143
 144    for (int i = 0; i < n_mat; ++i) {
 145        for (int j = 0; j < row_size; ++j) {
 146            activations.push_back(e.values[i*row_size + j] / e.counts[i]);
 147        }
 148    }
 149
 150    const float act_total     = std::accumulate(activations.begin(), activations.end(), 0.0f);
 151    const float act_max       = *std::max_element(activations.begin(), activations.end());
 152    const float act_min       = *std::min_element(activations.begin(), activations.end());
 153    const float act_mean      = act_total / activations.size();
 154    const float act_sqr_total = std::inner_product(activations.begin(), activations.end(), activations.begin(), 0.0f);
 155    const float act_var       = (act_sqr_total / activations.size()) - (act_mean * act_mean);
 156    const float act_dev       = std::sqrt(std::max(0.0f, act_var));
 157    float threshold           = 1e-5f;
 158    const int inactive_count  = std::count_if(activations.begin(), activations.end(),
 159                                               [threshold](const float v) { return fabsf(v) <= threshold; });
 160    const float active_ratio  = 1 - static_cast<float>(inactive_count) / activations.size();
 161
 162    float entropy = 0;
 163    if (act_total > 0) {
 164        for (const auto act : activations) {
 165            if (const float p = act / act_total; p > 0) {
 166                entropy -= p * std::log2(p);
 167            }
 168        }
 169    }
 170
 171    int z_score = 0;
 172    if (act_dev > 0.0f) {
 173        for (const auto act : activations) {
 174            if (const float p = (act - act_mean) / act_dev; p > 1) {
 175                z_score++;
 176            }
 177        }
 178    }
 179
 180    auto & ts = tstats.emplace_back();
 181    ts.tensor     = name;
 182    ts.stats      = e;
 183    ts.total_sqract = act_total;
 184    ts.mean_sqract  = act_mean;
 185    ts.max_sqract   = act_max;
 186    ts.min_sqract   = act_min;
 187    ts.elements   = static_cast<int>(activations.size());
 188    ts.stddev     = act_dev;
 189    ts.active     = active_ratio;
 190    ts.entropy    = entropy;
 191    ts.zd         = static_cast<float>(z_score) / ts.elements;
 192}
 193
 194static void compute_cossim(std::vector<tensor_statistics> & tstats) {
 195    static const std::regex pattern(R"(blk\.(\d+)\.)");
 196    for (auto & ts : tstats) {
 197        if (std::smatch match; std::regex_search(ts.tensor, match, pattern)) {
 198            const int blk = std::stoi(match[1]);
 199            std::string tname(ts.tensor);
 200            tname.replace(match.position(1), match.length(1), std::to_string(blk-1));
 201            auto prev = std::find_if(tstats.begin(), tstats.end(),
 202                [tname](const tensor_statistics & t) { return t.tensor == tname; });
 203            if (prev != tstats.end()) {
 204                const float dp = std::inner_product(ts.stats.values.begin(), ts.stats.values.end(),
 205                    prev->stats.values.begin(), 0.0f);
 206                const float curr_mag = std::sqrt(std::inner_product(ts.stats.values.begin(), ts.stats.values.end(),
 207                    ts.stats.values.begin(), 0.0f));
 208                const float prev_mag = std::sqrt(std::inner_product(prev->stats.values.begin(), prev->stats.values.end(),
 209                    prev->stats.values.begin(), 0.0f));
 210                const float cs = dp / (curr_mag * prev_mag);
 211                ts.cossim = cs;
 212            }
 213        } else {
 214            ts.cossim = 0;
 215        }
 216    }
 217}
 218
 219bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
 220    GGML_UNUSED(user_data);
 221
 222    const struct ggml_tensor * src0 = t->src[0];
 223    const struct ggml_tensor * src1 = t->src[1];
 224    std::string wname = filter_tensor_name(src0->name);
 225
 226    const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel;
 227
 228    // when ask is true, the scheduler wants to know if we are interested in data from this tensor
 229    // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
 230    if (ask) {
 231        if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
 232        if (t->op != GGML_OP_MUL_MAT) return false;
 233        // why are small batches ignored (<16 tokens)?
 234        if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
 235        if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == "output.weight"))) return false;
 236        return true;
 237    }
 238
 239    std::lock_guard<std::mutex> lock(m_mutex);
 240
 241    // copy the data from the GPU memory if needed
 242    const bool is_host = ggml_backend_buffer_is_host(src1->buffer);
 243
 244    if (!is_host) {
 245        const size_t src1_nbytes = ggml_nbytes(src1);
 246        m_src1_data.resize(src1_nbytes);
 247        ggml_backend_tensor_get(src1, m_src1_data.data(), 0, src1_nbytes);
 248    }
 249
 250    const char * data = is_host ? (const char *) src1->data : m_src1_data.data();
 251    GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));
 252
 253    // this has been adapted to the new format of storing merged experts in a single 3d tensor
 254    // ref: https://github.com/ggml-org/llama.cpp/pull/6387
 255    if (t->op == GGML_OP_MUL_MAT_ID) {
 256        //   ids  -> [n_experts_used, n_tokens]
 257        //   src1 -> [cols, n_expert_used, n_tokens]
 258        const ggml_tensor * ids = t->src[2];
 259        const int64_t n_as = src0->ne[2];
 260        const int64_t n_ids = ids->ne[0];
 261
 262        // the top-k selected expert ids are stored in the ids tensor
 263        // for simplicity, always copy ids to host, because it is small
 264        // take into account that ids is not contiguous!
 265
 266        GGML_ASSERT(ids->ne[1] == src1->ne[2]);
 267
 268        // the extra dimension would need to be stored somewhere to be reflected in the imatrix file
 269        if (ggml_nrows(src1) != src1->ne[1] * src1->ne[2]) {
 270            LOG_ERR("%s: tensor has more than 3 dimensions: %s", __func__, wname.c_str());
 271            GGML_ASSERT(false);
 272        }
 273
 274        m_ids.resize(ggml_nbytes(ids));
 275        ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));
 276
 277        auto & e = m_stats[wname];
 278
 279        if (e.counts.size() == 1 && n_as > 1) {
 280            // broadcast, when loading an old imatrix
 281            e.counts.resize(n_as, e.counts[0]);
 282        }
 283        if (e.values.empty()) {
 284            e.values.resize(src1->ne[0]*n_as, 0);
 285            e.counts.resize(n_as, 0);
 286        }
 287        else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
 288            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[0]*n_as));
 289            exit(1); //GGML_ABORT("fatal error");
 290        }
 291        else if (e.counts.size() != (size_t)n_as) {
 292            LOG_ERR("%s: inconsistent expert count for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.counts.size(), (int)n_as);
 293            exit(1); //GGML_ABORT("fatal error");
 294        }
 295        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
 296        // loop over all possible experts, regardless if they are used or not in the batch
 297        for (int64_t ex = 0; ex < n_as; ++ex) {
 298            size_t e_start = ex*src1->ne[0];
 299
 300            for (int64_t idx = 0; idx < n_ids; ++idx) {
 301                for (int64_t row = 0; row < src1->ne[2]; ++row) {
 302                    const int excur = *(const int32_t *) (m_ids.data() + row*ids->nb[1] + idx*ids->nb[0]);
 303
 304                    GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check
 305
 306                    if (excur != ex) continue;
 307
 308                    const int64_t i11 = idx % src1->ne[1];
 309                    const int64_t i12 = row;
 310                    const float * x = (const float *)(data + i11*src1->nb[1] + i12*src1->nb[2]);
 311
 312                    e.counts[ex]++;
 313
 314                    for (int64_t j = 0; j < src1->ne[0]; ++j) {
 315                        e.values[e_start + j] += x[j] * x[j];
 316                        if (!std::isfinite((float)e.values[e_start + j])) {
 317                            LOG_ERR("%f detected in %s\n", (float)e.values[e_start + j], wname.c_str());
 318                            exit(1);
 319                        }
 320                    }
 321                }
 322            }
 323            const int32_t n_chunk = e.counts[ex] / chunk_size;
 324            if (n_chunk > m_last_chunk) {
 325                const int32_t chunk_step = n_chunk - m_last_chunk;
 326                m_last_chunk = n_chunk;
 327                if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
 328                    save_imatrix();
 329                }
 330                if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
 331                    save_imatrix(m_last_chunk);
 332                }
 333            }
 334        }
 335    } else {
 336        auto & e = m_stats[wname];
 337        const int64_t n_mat = src0->ne[2] * src0->ne[3];
 338
 339        // use a single count per dense tensor
 340        // (necessary when merging older GGUF-imatrix files with 3d tensors)
 341        if (e.counts.size() > 1) {
 342            bool all_equal = true;
 343            for (size_t i = 1; i < e.counts.size(); ++i) {
 344                if (e.counts[0] != e.counts[i]) {
 345                    all_equal = false;
 346                    break;
 347                }
 348            }
 349            if (all_equal) {
 350                e.counts.resize(1);
 351            }
 352        }
 353        if (e.values.empty()) {
 354            e.values.resize(src1->ne[0] * n_mat, 0);
 355            e.counts.resize(1, 0);
 356        }
 357        else if (e.values.size() != (size_t)(src1->ne[0] * n_mat)) {
 358            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[0] * n_mat));
 359            exit(1); //GGML_ABORT("fatal error");
 360        }
 361        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->ne[2], (int)src1->type);
 362
 363        for (int64_t i3 = 0; i3 < src1->ne[3]; ++i3) {
 364            for (int64_t i2 = 0; i2 < src1->ne[2]; ++i2) {
 365                // handle 3D+ tensors, but flatten 3D+ activations when model tensor is 2D
 366                const int64_t mat_id = (i3 % src0->ne[3]) * src0->ne[2] + (i2 % src0->ne[2]);
 367                const int64_t mat_start = mat_id * src1->ne[0];
 368
 369                for (int64_t row = 0; row < src1->ne[1]; ++row) {
 370                    const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->nb[3]);
 371                    for (int64_t j = 0; j < src1->ne[0]; ++j) {
 372                        e.values[mat_start + j] += x[j] * x[j];
 373                        if (!std::isfinite((float)e.values[j])) {
 374                            LOG_ERR("%f detected in %s\n", (float)e.values[j], wname.c_str());
 375                            exit(1);
 376                        }
 377                    }
 378                }
 379            }
 380        }
 381        // only 1 count in practice, except when a tensor is used for both MUL_MAT_ID and MUL_MAT
 382        for (size_t i = 0; i < e.counts.size(); ++i) {
 383            e.counts[i] += ggml_nrows(src1) / n_mat;
 384            const int32_t n_chunk = e.counts[i] / chunk_size;
 385            if (n_chunk > m_last_chunk) {
 386                const int32_t chunk_step = n_chunk - m_last_chunk;
 387                m_last_chunk = n_chunk;
 388                if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
 389                    save_imatrix();
 390                }
 391                if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
 392                    save_imatrix(m_last_chunk);
 393                }
 394            }
 395        }
 396    }
 397
 398    return true;
 399}
 400
 401void IMatrixCollector::save_imatrix_legacy(int32_t ncall) const {
 402    auto fname = m_params.out_file;
 403
 404    if (ncall > 0) {
 405        fname += ".at_";
 406        fname += std::to_string(ncall);
 407    }
 408
 409    // warn when writing imatrix entries that do not have full data
 410    // this can happen with MoE models where some of the experts end up not being exercised by the provided training data
 411
 412    int n_entries = 0;
 413    std::vector<std::string> to_store;
 414
 415    bool is_first = true; // for printing
 416    for (const auto & kv : m_stats) {
 417        const int n_all = kv.second.counts.size();
 418
 419        if (n_all == 0) {
 420            continue;
 421        }
 422
 423        int n_zeros = 0;
 424        for (const int c : kv.second.counts) {
 425            if (c == 0) {
 426                n_zeros++;
 427            }
 428        }
 429
 430        if (n_zeros != 0 && is_first) {
 431            LOG_INF("\n");
 432            is_first = false;
 433        }
 434
 435        if (n_zeros == n_all) {
 436            LOG_WRN("%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
 437            continue;
 438        }
 439
 440        if (n_zeros > 0) {
 441            LOG_WRN("%s: entry '%40s' has partial data (%.2f%%)\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
 442        }
 443
 444        n_entries++;
 445        to_store.push_back(kv.first);
 446    }
 447
 448    if (to_store.size() < m_stats.size()) {
 449        LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
 450    }
 451
 452    // deterministic tensor name order
 453    std::sort(to_store.begin(), to_store.end());
 454
 455    const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel;
 456
 457    std::ofstream out(fname, std::ios::binary);
 458    out.write((const char *) &n_entries, sizeof(n_entries));
 459    for (const auto & name : to_store) {
 460        const auto & stat = m_stats.at(name);
 461        const int32_t len = name.size();
 462        out.write((const char *) &len, sizeof(len));
 463        out.write(name.c_str(), len);
 464        // ceiling division to avoid accidental zeros
 465        const int32_t ncall = (*std::max_element(stat.counts.begin(), stat.counts.end()) + (chunk_size - 1)) / chunk_size;
 466        out.write((const char *) &ncall, sizeof(ncall));
 467        const int32_t nval = stat.values.size();
 468        const int32_t nmat = stat.counts.size();
 469        out.write((const char *) &nval, sizeof(nval));
 470        if (nval > 0 && nmat > 0) {
 471            std::vector<float> tmp(nval);
 472            for (int32_t i = 0; i < nval; i++) {
 473                float count = static_cast<float>(stat.counts[i / (nval / nmat)]);
 474                float value = stat.values[i];
 475                if (count == 0.0f) {
 476                    // store 1 for partial data
 477                    value = 1.0f;
 478                    count = 1.0f;
 479                }
 480                tmp[i] = (value / count) * static_cast<float>(ncall);
 481            }
 482            out.write((const char *) tmp.data(), nval * sizeof(float));
 483        }
 484    }
 485
 486    // Write the number of call the matrix was computed with
 487    out.write((const char *) &m_last_chunk, sizeof(m_last_chunk));
 488
 489    // Write the input filename at the end of the file to later on specify it in quantize
 490    {
 491        const char * dataset_file = m_params.prompt_file.c_str();
 492        int32_t len = m_params.prompt_file.size();
 493        // When there is no prompt but there were other imatrix files loaded, use the last dataset
 494        if (m_params.prompt_file.empty() && !m_datasets.empty()) {
 495            const std::string & dataset_str = m_datasets[m_datasets.size() - 1];
 496            dataset_file = dataset_str.c_str();
 497            len = dataset_str.size();
 498        }
 499        out.write((const char *) &len, sizeof(len));
 500        out.write(dataset_file, len);
 501    }
 502
 503    LOGV(1, "\n");
 504    LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_chunk, fname.c_str());
 505}
 506
 507void IMatrixCollector::save_imatrix(int32_t n_chunk) const {
 508    auto fname = m_params.out_file;
 509    int8_t use_legacy_format = m_params.imat_dat;
 510
 511    if (use_legacy_format > 0) {
 512        this->save_imatrix_legacy(n_chunk);
 513        return;
 514    }
 515    // only warn when `--output-format gguf` is not specified
 516    if (use_legacy_format == 0 && !string_ends_with(fname, ".gguf")) {
 517        LOG_WRN("\n%s: saving imatrix using GGUF format with a different suffix than .gguf\n", __func__);
 518        LOG_WRN("%s: if you want the previous imatrix format, use --output-format dat\n", __func__);
 519    }
 520
 521    if (n_chunk > 0) {
 522        fname += ".at_";
 523        fname += std::to_string(n_chunk);
 524    }
 525
 526    // write imatrix entries even if they don't have full data. (can be corrected when reading)
 527    // this can happen with MoE models where some of the experts end up not being exercised by the provided training data
 528
 529    std::vector<std::string> to_store;
 530    size_t data_size = 0;
 531
 532    bool is_first = true; // for printing
 533    for (const auto & kv : m_stats) {
 534        const int n_all = kv.second.counts.size();
 535
 536        int n_zeros = 0;
 537        for (const auto c : kv.second.counts) {
 538            if (c == 0) {
 539                n_zeros++;
 540            }
 541        }
 542
 543        if (n_zeros != 0 && is_first) {
 544            LOG_INF("\n");
 545            is_first = false;
 546        }
 547
 548        if (n_zeros > 0) {
 549            LOG_WRN("%s: entry '%40s' has partial data (%.2f%%)\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
 550        }
 551
 552        to_store.push_back(kv.first);
 553        data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.values.size(), GGML_MEM_ALIGN);
 554        data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.counts.size(), GGML_MEM_ALIGN);
 555    }
 556
 557    // deterministic tensor name order
 558    std::sort(to_store.begin(), to_store.end());
 559
 560    struct ggml_init_params params = {
 561        /* .mem_size   = */ data_size,
 562        /* .mem_buffer = */ NULL,
 563        /* .no_alloc   = */ false,
 564    };
 565    struct ggml_context * ctx = ggml_init(params);
 566    struct gguf_context * ctx_gguf = gguf_init_empty();
 567
 568    {
 569        std::vector<const char *> datasets;
 570        datasets.reserve(m_datasets.size() + 1);
 571        for (size_t i = 0; i < m_datasets.size(); ++i) {
 572            datasets.push_back(m_datasets[i].c_str());
 573        }
 574        if (!m_params.prompt_file.empty()) {
 575            datasets.push_back(m_params.prompt_file.c_str());
 576        }
 577
 578        gguf_set_val_str(ctx_gguf, "general.type", "imatrix");
 579        // Write the dataset paths
 580        gguf_set_arr_str(ctx_gguf, LLM_KV_IMATRIX_DATASETS, datasets.data(), datasets.size());
 581        // Write the number of chunks the matrix was computed with
 582        gguf_set_val_u32(ctx_gguf, LLM_KV_IMATRIX_CHUNK_COUNT, m_last_chunk);
 583        gguf_set_val_u32(ctx_gguf, LLM_KV_IMATRIX_CHUNK_SIZE, m_params.n_ctx / m_params.n_parallel);
 584    }
 585
 586    for (const auto & name : to_store) {
 587        const auto & stat = m_stats.at(name);
 588        const int32_t nval = (int32_t) stat.values.size();
 589        const int32_t nmat = (int32_t) stat.counts.size();
 590        if (nval > 0 && nmat > 0) {
 591            struct ggml_tensor * in_sum2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nval / nmat, nmat);
 592            struct ggml_tensor * counts  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, nmat);
 593            ggml_format_name(in_sum2, "%s.in_sum2", name.c_str());
 594            ggml_format_name(counts, "%s.counts", name.c_str());
 595
 596            for (int32_t j = 0; j < nval; ++j) {
 597                ((float *) in_sum2->data)[j] = (float) stat.values[j];
 598            }
 599            for (int32_t j = 0; j < nmat; ++j) {
 600                ((float *) counts->data)[j] = (float) stat.counts[j];
 601            }
 602
 603            gguf_add_tensor(ctx_gguf, in_sum2);
 604            gguf_add_tensor(ctx_gguf, counts);
 605        }
 606    }
 607
 608    gguf_write_to_file(ctx_gguf, fname.c_str(), false);
 609
 610    LOGV(1, "\n");
 611    LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_chunk, fname.c_str());
 612
 613    gguf_free(ctx_gguf);
 614    ggml_free(ctx);
 615}
 616
 617bool IMatrixCollector::load_imatrix_legacy(const char * fname) {
 618    std::ifstream in(fname, std::ios::binary);
 619    if (!in) {
 620        LOG_ERR("%s: failed to open %s\n", __func__, fname);
 621        return false;
 622    }
 623    int n_entries;
 624    in.read((char *) &n_entries, sizeof(n_entries));
 625    if (in.fail() || n_entries < 1) {
 626        LOG_ERR("%s: no data in file %s\n", __func__, fname);
 627        return false;
 628    }
 629    // Guess the chunk size because it's not stored in the file
 630    const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel;
 631
 632    for (int i = 0; i < n_entries; ++i) {
 633        int32_t len = 0;
 634        in.read((char *) &len, sizeof(len));
 635        std::vector<char> name_as_vec(len + 1);
 636        in.read((char *) name_as_vec.data(), len);
 637        if (in.fail()) {
 638            LOG_ERR("%s: failed reading name for entry %d from %s\n", __func__, i + 1, fname);
 639            return false;
 640        }
 641        name_as_vec[len] = 0;
 642        std::string name{ name_as_vec.data() };
 643        auto & e = m_stats[std::move(name)];
 644        int32_t ncall = 0;
 645        in.read((char *) &ncall, sizeof(ncall));
 646        int32_t nval = 0;
 647        in.read((char *) &nval, sizeof(nval));
 648        if (in.fail() || nval < 1) {
 649            LOG_ERR("%s: failed reading number of values for entry %d\n", __func__, i);
 650            m_stats = {};
 651            return false;
 652        }
 653
 654        if (e.values.empty()) {
 655            e.values.resize(nval, 0.0f);
 656            e.counts.resize(1, 0);
 657        }
 658
 659        std::vector<float> tmp(nval);
 660        in.read((char *) tmp.data(), nval * sizeof(float));
 661        if (in.fail()) {
 662            LOG_ERR("%s: failed reading data for entry %d\n", __func__, i);
 663            m_stats = {};
 664            return false;
 665        }
 666
 667        // Recreate the state as expected by save_imatrix(), and correct for weighted sum.
 668        for (int i = 0; i < nval; i++) {
 669            e.values[i] += tmp[i] * chunk_size;
 670        }
 671        // The legacy format doesn't distinguish the counts for different experts
 672        for (size_t j = 0; j < e.counts.size(); ++j) {
 673            e.counts[j] += ncall * chunk_size;
 674        }
 675    }
 676
 677    {
 678        // TODO: extract into its own method; this is also used by the GGUF-based format
 679        // Calculate the last chunk count
 680        int64_t max_count = 0;
 681        for (const auto & stats : m_stats) {
 682            for (int64_t count : stats.second.counts) {
 683                if (count > max_count) {
 684                    max_count = count;
 685                }
 686            }
 687        }
 688        m_last_chunk = max_count / (chunk_size);
 689    }
 690
 691    {
 692        // Read the number of calls the matrix was computed with
 693        int32_t n_calls;
 694        in.read((char *) &n_calls, sizeof(n_calls));
 695        // ignore it because it's not important
 696    }
 697
 698    // Read the dataset path to include it when writing to GGUF
 699    if (!in.fail()){
 700        int32_t len = 0;
 701        in.read((char *) &len, sizeof(len));
 702        if (!in.fail()) {
 703            std::vector<char> dataset;
 704            dataset.resize(len + 1, 0);
 705            in.read(dataset.data(), len);
 706            if (!in.fail()) {
 707                m_datasets.push_back(dataset.data());
 708            }
 709        }
 710    }
 711
 712    return true;
 713}
 714
 715// Using GGUF as the file format, for greater extensibility
 716bool IMatrixCollector::load_imatrix(const char * file_name) {
 717    struct ggml_context * ctx = nullptr;
 718    struct gguf_init_params meta_gguf_params = {
 719        /* .no_alloc = */ false, // the data is needed
 720        /* .ctx      = */ &ctx,
 721    };
 722    struct gguf_context * ctx_gguf = gguf_init_from_file(file_name, meta_gguf_params);
 723    if (!ctx_gguf) {
 724        return this->load_imatrix_legacy(file_name);
 725    }
 726    const int32_t n_entries = gguf_get_n_tensors(ctx_gguf);
 727    if (n_entries < 1) {
 728        LOG_ERR("%s: no data in file %s\n", __func__, file_name);
 729        gguf_free(ctx_gguf);
 730        ggml_free(ctx);
 731        return false;
 732    }
 733
 734    const int64_t datasets_key = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_DATASETS);
 735    if (datasets_key != -1 && gguf_get_arr_type(ctx_gguf, datasets_key) == GGUF_TYPE_STRING) {
 736        const int64_t n = gguf_get_arr_n(ctx_gguf, datasets_key);
 737        m_datasets.reserve(m_datasets.size() + n);
 738        for (int64_t i = 0; i < n; ++i) {
 739            m_datasets.push_back(gguf_get_arr_str(ctx_gguf, datasets_key, i));
 740        }
 741    }
 742
 743    const std::string in_sum2_suffix{ ".in_sum2" };
 744    const std::string counts_suffix{ ".counts" };
 745
 746    // Could re-use m_stats instead, but this allows
 747    // checking for completeness of *each* loaded imatrix file
 748    // and also makes it easier to re-use a similar implementation in quantize.cpp
 749    // Using an ordered map to get a deterministic iteration order.
 750    std::map<std::string, std::pair<struct ggml_tensor *, struct ggml_tensor *>> sums_counts_for;
 751
 752    for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
 753        std::string name = cur->name;
 754
 755        if (name.empty()) { continue; }
 756
 757        if (string_remove_suffix(name, in_sum2_suffix)) {
 758            // in_sum2
 759            sums_counts_for[std::move(name)].first = cur;
 760        } else if (string_remove_suffix(name, counts_suffix)) {
 761            // counts
 762            sums_counts_for[std::move(name)].second = cur;
 763        } else {
 764            // ignore other tensors
 765        }
 766    }
 767
 768    for (const auto & sc : sums_counts_for) {
 769        const std::string &        name    = sc.first;
 770        const struct ggml_tensor * in_sum2 = sc.second.first;
 771        const struct ggml_tensor * counts  = sc.second.second;
 772
 773        if (!in_sum2 || !counts) {
 774            LOG_ERR("%s: mismatched sums and counts for %s\n", __func__, name.c_str());
 775            gguf_free(ctx_gguf);
 776            ggml_free(ctx);
 777            return false;
 778        }
 779
 780        auto & e = m_stats[name];
 781
 782        int64_t nval = ggml_nelements(in_sum2);
 783        if (e.values.empty()) {
 784            e.values.resize(nval, 0.0f);
 785        } else if ((size_t) nval != e.values.size()) {
 786            LOG_ERR("%s: mismatched sums size for %s: %zu != %zu\n", __func__, name.c_str(), (size_t) nval, e.values.size());
 787            gguf_free(ctx_gguf);
 788            ggml_free(ctx);
 789            return false;
 790        }
 791
 792        int64_t ncounts = ggml_nelements(counts);
 793        if (e.counts.empty()) {
 794            e.counts.resize(ncounts, 0);
 795        } else if (e.counts.size() == 1 && ncounts > 1) {
 796            // broadcast, when loading an old imatrix
 797            e.counts.resize(ncounts, e.counts[0]);
 798        } else if ((size_t) ncounts != e.counts.size()) {
 799            LOG_ERR("%s: mismatched counts size for %s: %zu != %zu\n", __func__, name.c_str(), (size_t) ncounts, e.counts.size());
 800            gguf_free(ctx_gguf);
 801            ggml_free(ctx);
 802            return false;
 803        }
 804
 805        // Recreate the state as expected by save_imatrix()
 806        for (int64_t j = 0; j < nval; j++) {
 807            e.values[j] += ((const float *) in_sum2->data)[j];
 808        }
 809        for (int64_t j = 0; j < ncounts; j++) {
 810            e.counts[j] += std::lround(((const float *) counts->data)[j]);
 811        }
 812    }
 813
 814    // TODO: extract into its own method; this is also used by the legacy format
 815    // Calculate the last chunk count
 816    int64_t max_count = 0;
 817    for (const auto & stats : m_stats) {
 818        for (int64_t count : stats.second.counts) {
 819            if (count > max_count) {
 820                max_count = count;
 821            }
 822        }
 823    }
 824    m_last_chunk = max_count / (m_params.n_ctx / m_params.n_parallel);
 825
 826    gguf_free(ctx_gguf);
 827    ggml_free(ctx);
 828    return true;
 829}
 830
 831static IMatrixCollector g_collector;
 832
 833static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
 834    return g_collector.collect_imatrix(t, ask, user_data);
 835}
 836
 837struct results_log_softmax {
 838    double log_softmax;
 839    float  logit;
 840    float  prob;
 841};
 842
 843static std::vector<float> softmax(const std::vector<float> & logits) {
 844    std::vector<float> probs(logits.size());
 845    float max_logit = logits[0];
 846    for (float v : logits) {
 847        max_logit = std::max(max_logit, v);
 848    }
 849    double sum_exp = 0.0;
 850    for (size_t i = 0; i < logits.size(); i++) {
 851        // Subtract the maximum logit value from the current logit value for numerical stability
 852        const float logit = logits[i] - max_logit;
 853        const float exp_logit = expf(logit);
 854        sum_exp += exp_logit;
 855        probs[i] = exp_logit;
 856    }
 857    for (size_t i = 0; i < probs.size(); i++) {
 858        probs[i] /= sum_exp;
 859    }
 860    return probs;
 861}
 862
 863static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
 864    float max_logit = logits[0];
 865    for (int i = 1; i < n_vocab; ++i) {
 866        max_logit = std::max(max_logit, logits[i]);
 867    }
 868    double sum_exp = 0.0;
 869    for (int i = 0; i < n_vocab; ++i) {
 870        sum_exp += expf(logits[i] - max_logit);
 871    }
 872    return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
 873}
 874
 875static void process_logits(
 876    int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
 877    double & nll, double & nll2, float * logit_history, float * prob_history) {
 878    std::mutex mutex;
 879    int counter = 0;
 880    auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
 881        double local_nll  = 0;
 882        double local_nll2 = 0;
 883        while (true) {
 884            std::unique_lock<std::mutex> lock(mutex);
 885            int i = counter++;
 886            if (i >= n_token) {
 887                nll += local_nll; nll2 += local_nll2;
 888                break;
 889            }
 890            lock.unlock();
 891            const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
 892            const double v = -results.log_softmax;
 893            local_nll += v;
 894            local_nll2 += v*v;
 895
 896            logit_history[i] = results.logit;
 897            prob_history[i]  = results.prob;
 898        }
 899    };
 900    for (auto & w : workers) {
 901        w = std::thread(compute);
 902    }
 903    compute();
 904    for (auto & w : workers) {
 905        w.join();
 906    }
 907}
 908
 909static bool compute_imatrix(llama_context * ctx, const common_params & params, const int32_t n_ctx) {
 910    const llama_model * model = llama_get_model(ctx);
 911    const llama_vocab * vocab = llama_model_get_vocab(model);
 912
 913    const bool add_bos = llama_vocab_get_add_bos(vocab);
 914
 915    GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
 916
 917    auto tim1 = std::chrono::high_resolution_clock::now();
 918    LOG_INF("%s: tokenizing the input ..\n", __func__);
 919
 920    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true, params.parse_special);
 921
 922    auto tim2 = std::chrono::high_resolution_clock::now();
 923    LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
 924
 925    if (params.i_chunk > 0) {
 926        if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
 927            LOG_ERR("%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
 928            return false;
 929        }
 930        LOG_INF("%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
 931        tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
 932    }
 933
 934    if (int(tokens.size()) < 2*n_ctx) {
 935        LOG_ERR("%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2*n_ctx, n_ctx);
 936        LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n", __func__, tokens.size());
 937        return false;
 938    }
 939
 940    std::vector<float> logit_history;
 941    std::vector<float> prob_history;
 942
 943    if (params.compute_ppl) {
 944        logit_history.resize(tokens.size());
 945        prob_history.resize(tokens.size());
 946    }
 947
 948    const int n_chunk_max = tokens.size() / n_ctx;
 949
 950    const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
 951    const int n_vocab = llama_vocab_n_tokens(vocab);
 952    const int n_batch = params.n_batch;
 953
 954    int count = 0;
 955    double nll = 0.0;
 956    double nll2 = 0.0;
 957
 958    const int num_batches = (n_ctx + n_batch - 1) / n_batch;
 959    const int n_seq = std::max(1, n_batch / n_ctx);
 960
 961    GGML_ASSERT(n_batch < n_ctx || n_batch % n_ctx == 0);
 962    GGML_ASSERT(params.n_ctx == n_seq * n_ctx);
 963
 964    llama_batch batch = llama_batch_init(std::min(n_batch, n_ctx*n_seq), 0, 1);
 965
 966    std::vector<float> logits;
 967    if (params.compute_ppl && num_batches > 1) {
 968        logits.reserve((size_t)n_ctx * n_vocab);
 969    }
 970
 971    LOG_INF("%s: computing over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
 972
 973    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
 974
 975    for (int i = 0; i < n_chunk; i += n_seq) {
 976        const int start =     i * n_ctx;
 977        const int end   = start + n_ctx;
 978
 979        const int n_seq_batch = std::min(n_seq, n_chunk - i);
 980
 981        const auto t_start = std::chrono::high_resolution_clock::now();
 982
 983        // clear the KV cache
 984        llama_memory_clear(llama_get_memory(ctx), true);
 985
 986        for (int j = 0; j < num_batches; ++j) {
 987            const int batch_start = start + j * n_batch;
 988            const int batch_size  = std::min(end - batch_start, n_batch);
 989
 990            // clear the batch
 991            common_batch_clear(batch);
 992
 993            for (int seq = 0; seq < n_seq_batch; seq++) {
 994                int seq_start = batch_start + seq*n_ctx;
 995
 996                // save original token and restore it after eval
 997                const auto token_org = tokens[seq_start];
 998
 999                // add BOS token for the first batch of each chunk
1000                if (add_bos && j == 0) {
1001                    tokens[seq_start] = llama_vocab_bos(vocab);
1002                }
1003                for (int k = 0; k < batch_size; ++k) {
1004                    // NOTE: specifying all logits to get activations for the output.weight tensor
1005                    //       and also for the perplexity calculation.
1006                    // TODO: only get outputs when (params.process_output || params.compute_ppl)
1007                    //       (not possible when this skips FFN computation of the last layer)
1008                    common_batch_add(batch, tokens[seq_start + k], j*n_batch + k, { seq }, true);
1009                }
1010
1011                // restore the original token in case it was set to BOS
1012                tokens[seq_start] = token_org;
1013            }
1014
1015            if (llama_decode(ctx, batch)) {
1016                LOG_ERR("%s : failed to eval\n", __func__);
1017                llama_batch_free(batch);
1018                return false;
1019            }
1020
1021            if (params.compute_ppl && num_batches > 1) {
1022                const auto * batch_logits = llama_get_logits(ctx);
1023                logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
1024            }
1025        }
1026
1027
1028        if (i == 0) {
1029            llama_synchronize(ctx);
1030            const auto t_end = std::chrono::high_resolution_clock::now();
1031            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
1032            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
1033            int total_seconds = (int)(t_total * n_chunk / n_seq);
1034            if (total_seconds >= 60*60) {
1035                LOG("%d hours ", total_seconds / (60*60));
1036                total_seconds = total_seconds % (60*60);
1037            }
1038            LOG("%.2f minutes\n", total_seconds / 60.0);
1039        }
1040
1041        if (params.compute_ppl) {
1042            const int first = n_ctx/2;
1043            for (int seq = 0; seq < n_seq_batch; seq++) {
1044                const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx);
1045
1046                llama_token * tokens_data = tokens.data() + start + seq*n_ctx + first;
1047
1048                process_logits(n_vocab, all_logits + first*n_vocab,
1049                        tokens_data, n_ctx - 1 - first,
1050                        workers, nll, nll2,
1051                        logit_history.data() + start + seq*n_ctx + first,
1052                        prob_history.data()  + start + seq*n_ctx + first);
1053
1054                count += n_ctx - first - 1;
1055
1056                LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
1057            }
1058            fflush(stdout);
1059
1060            logits.clear();
1061        }
1062    }
1063
1064    LOG("\n");
1065
1066    if (params.compute_ppl) {
1067        nll2 /= count;
1068        nll /= count;
1069        const double ppl = exp(nll);
1070        nll2 -= nll * nll;
1071        if (nll2 > 0) {
1072            nll2 = sqrt(nll2/(count-1));
1073            LOG("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
1074        } else {
1075            LOG("Unexpected negative standard deviation of log(prob)\n");
1076        }
1077    }
1078
1079    llama_batch_free(batch);
1080
1081    return true;
1082}
1083
1084static bool show_statistics(const common_params & params) {
1085    std::vector<tensor_statistics> ts;
1086    if (params.in_files.empty() || params.in_files.size() > 1) {
1087        LOG_ERR("\nError: a single imatrix file is required to compute tensor statistics\n\n");
1088        return false;
1089    }
1090    if (g_collector.load_imatrix(params.in_files[0].c_str())) {
1091        for (const auto & [name, stats] :g_collector.get_mstats()) {
1092            compute_statistics(ts, name, stats);
1093        }
1094    } else {
1095        LOG_ERR("\nError: %s is not a valid imatrix file\n\n", params.in_files[0].c_str());
1096        return false;
1097    }
1098    if (!ts.empty()) {
1099        compute_cossim(ts);
1100    } else {
1101        LOG_ERR("Error: cannot compute statistics for %s\n\n", params.in_files[0].c_str());
1102        return false;
1103    }
1104
1105    struct tensor_comparer {
1106        bool operator()(const tensor_statistics & a, const tensor_statistics & b) const {
1107            std::string layer, name_a, name_b;
1108            ;
1109            process_tensor_name(a.tensor, layer, name_a);
1110            process_tensor_name(b.tensor, layer, name_b);
1111            return name_a < name_b || (name_a == name_b && a.total_sqract > b.total_sqract);
1112        }
1113    };
1114    std::sort(ts.begin(), ts.end(), tensor_comparer());
1115
1116    struct weighted_stats {
1117        float weighted_bias   = 0.0f;
1118        float weighted_zd     = 0.0f;
1119        float weighted_cossim = 0.0f;
1120        int   total_elements  = 0;
1121    };
1122    std::map<int, weighted_stats> ws;
1123
1124    LOG_INF("\nComputing statistics for %s (%d tensors)\n", params.in_files[0].c_str(), static_cast<int>(ts.size()));
1125    LOG_INF("\n%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", " Layer", "       Tensor", "          Σ(Act²)",
1126            "  Min", "            Max", "           μ", "   σ", " % Active", "N", "   Entropy", "E (norm)", "ZD",
1127            "  CosSim");
1128    LOG_INF(
1129        "=============================================================================================================="
1130        "===========================================================\n");
1131    for (const auto & tstat : ts) {
1132        std::string layer, name;
1133        process_tensor_name(tstat.tensor, layer, name);
1134
1135        int blk;
1136        try {
1137            blk = std::stoi(layer);
1138        } catch (const std::exception & e) {
1139            blk = -1;  // not a block layer
1140        }
1141
1142        LOG_INF("%5s\t%-20s\t%10.2f\t%8.4f\t%11.4f\t%6.2f\t%6.2f\t%8.2f%%\t%6d\t%10.4f\t%6.2f%%\t%10.2f%%\t%8.4f\n",
1143                layer.c_str(), name.c_str(), tstat.total_sqract, tstat.min_sqract, tstat.max_sqract, tstat.mean_sqract,
1144                tstat.stddev, tstat.active * 100.0f, tstat.elements, tstat.entropy,
1145                100.0f * (tstat.entropy / std::log2(tstat.elements)), 100.0f * tstat.zd, tstat.cossim);
1146
1147        const float weighted_bias   = tstat.elements * tstat.total_sqract;
1148        const float weighted_zd     = tstat.elements * tstat.zd;
1149        const float weighted_cossim = tstat.elements * tstat.cossim;
1150
1151        if (ws.find(blk) != ws.end()) {
1152            ws[blk].weighted_bias += weighted_bias;
1153            ws[blk].weighted_zd += weighted_zd;
1154            ws[blk].weighted_cossim += weighted_cossim;
1155            ws[blk].total_elements += tstat.elements;
1156        } else {
1157            weighted_stats temp_ws;
1158            temp_ws.weighted_bias   = weighted_bias;
1159            temp_ws.weighted_zd     = weighted_zd;
1160            temp_ws.weighted_cossim = weighted_cossim;
1161            temp_ws.total_elements  = tstat.elements;
1162            ws[blk]                 = temp_ws;
1163        }
1164    }
1165
1166    const int layers = std::count_if(ws.begin(), ws.end(), [](const auto & kv) { return kv.first >= 0; });
1167    LOG_INF("\nComputing weighted average statistics per layer (%d layers)\n", layers);
1168    LOG_INF("\n%s\t%s\t%s\t%s\n", "  Layer", "     μΣ(Act²)", "      μZD", "μCosSim");
1169    LOG_INF("================================================\n");
1170    for (const auto & [first, second] : ws) {
1171        const auto & layer = first;
1172        const auto & stats = second;
1173
1174        if (stats.total_elements == 0) {
1175            continue;
1176        }
1177
1178        if (layer >= 0) {
1179            const float bias   = stats.weighted_bias / stats.total_elements;
1180            const float zd     = stats.weighted_zd / stats.total_elements;
1181            const float cossim = stats.weighted_cossim / stats.total_elements;
1182
1183            LOG_INF("%5d\t%14.2f\t%10.4f%%\t%6.4f\n", layer, bias, 100.0f * zd, cossim);
1184        }
1185    }
1186    LOG_INF("\n");
1187
1188    return true;
1189}
1190
1191int main(int argc, char ** argv) {
1192    common_params params;
1193
1194    params.out_file = "imatrix.gguf";
1195
1196    params.n_ctx = 512;
1197    params.escape = false;
1198
1199    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
1200        return 1;
1201    }
1202
1203    if (params.show_statistics) {
1204        if (!show_statistics(params)) {
1205            return 1;
1206        }
1207        return 0;
1208    }
1209
1210    common_init();
1211
1212    const int32_t n_ctx = params.n_ctx;
1213
1214    if (n_ctx <= 0) {
1215        LOG_ERR("%s: imatrix tool requires '--ctx-size' > 0\n", __func__);
1216        return 1;
1217    }
1218
1219    {
1220        const int32_t n_seq = std::max(1, params.n_batch / n_ctx);
1221        const int32_t n_kv = n_seq * n_ctx;
1222
1223        params.n_parallel = n_seq;
1224        params.n_ctx      = n_kv;
1225
1226        params.n_batch = std::min(params.n_batch, n_kv);
1227    }
1228
1229    g_collector.set_params(params);
1230
1231    for (const auto & in_file : params.in_files) {
1232        LOG_INF("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
1233        if (!g_collector.load_imatrix(in_file.c_str())) {
1234            LOG_ERR("%s : failed to load %s\n", __func__, in_file.c_str());
1235            return 1;
1236        }
1237    }
1238
1239    if (params.prompt.empty()) {
1240        LOG_INF("No prompt provided; combining precomputed matrices only.\n");
1241
1242        if (params.in_files.empty()) {
1243            LOG_ERR("Error: No prompt provided and no precomputed matrices (--in-file) to combine.\n");
1244            return 1;
1245        }
1246
1247        if (params.in_files.size() == 1) {
1248            LOG_INF("%s : saving imatrix to '%s'\n", __func__, params.out_file.c_str());
1249        } else if (params.in_files.size() > 1) {
1250            LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
1251        }
1252
1253        g_collector.save_imatrix();
1254
1255        return 0;
1256    }
1257
1258    llama_backend_init();
1259    llama_numa_init(params.numa);
1260
1261    // pass the callback to the backend scheduler
1262    // it will be executed for each node during the graph computation
1263    params.cb_eval = ik_collect_imatrix;
1264    params.cb_eval_user_data = NULL;
1265    params.warmup = false;
1266
1267    // init
1268    auto llama_init = common_init_from_params(params);
1269
1270    auto * model = llama_init->model();
1271    auto * ctx   = llama_init->context();
1272
1273    if (model == nullptr || ctx == nullptr) {
1274        LOG_ERR("%s : failed to init\n", __func__);
1275        return 1;
1276    }
1277
1278    const int n_ctx_train = llama_model_n_ctx_train(model);
1279    if (params.n_ctx > n_ctx_train) {
1280        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
1281                __func__, n_ctx_train, params.n_ctx);
1282    }
1283
1284    // print system information
1285    {
1286        LOG_INF("\n");
1287        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
1288    }
1289
1290    if (!compute_imatrix(ctx, params, n_ctx)) {
1291        return 1;
1292    }
1293
1294    g_collector.save_imatrix();
1295
1296    LOG("\n");
1297    llama_perf_context_print(ctx);
1298
1299    llama_backend_free();
1300
1301    return 0;
1302}