1#include "llama-vocab.h"
2
3#include "ggml.h"
4#include "gguf.h"
5#include "llama-impl.h"
6#include "llama-model-loader.h"
7
8#include "unicode.h"
9
10#include <algorithm>
11#include <cassert>
12#include <cctype>
13#include <cfloat>
14#include <cmath>
15#include <cstdarg>
16#include <cstring>
17#include <forward_list>
18#include <limits>
19#include <map>
20#include <queue>
21#include <set>
22#include <unordered_map>
23
24//
25// helpers
26//
27
28struct naive_trie {
29 naive_trie() : has_value(false), value(0) {
30 }
31 void insert(const char * key, size_t len, int32_t value = 0) {
32 if (len == 0) {
33 this->has_value = true;
34 this->value = value;
35 return;
36 }
37 char c = key[0];
38 auto res = children.find(c);
39 if (res != children.end()) {
40 res->second.insert(key + 1, len - 1, value);
41 } else {
42 auto res = children.insert(std::make_pair(c, naive_trie()));
43 res.first->second.insert(key + 1, len - 1, value);
44 }
45 }
46 std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) const {
47 if (len == 0 || offset == len) {
48 return std::make_pair(key, offset);
49 }
50 char c = key[offset];
51 auto res = children.find(c);
52 if (res != children.end()) {
53 return res->second.get_longest_prefix(key, len, offset + 1);
54 }
55
56 return std::make_pair(key, offset);
57 }
58 const struct naive_trie * traverse(const char c) const {
59 auto res = children.find(c);
60 if (res != children.end()) {
61 return &res->second;
62 }
63
64 return NULL;
65 }
66 std::map<char, struct naive_trie> children;
67 bool has_value;
68 llama_token value;
69};
70
71//
72// tokenizers
73//
74
// Abstract base for per-vocab tokenizer models. Holds no state of its own;
// the virtual destructor allows derived tokenizers to be deleted through a
// base pointer.
struct llm_tokenizer {
    llm_tokenizer() = default;
    virtual ~llm_tokenizer() = default;
};
79
// A slice of the text being tokenized. Symbols form an intrusive doubly
// linked list (prev/next are indices into a std::vector<llm_symbol>) so
// adjacent entries can be merged in O(1) during SPM/BPE merging.
struct llm_symbol {
    using index = int;
    index prev;        // index of the previous symbol, or -1 at the front
    index next;        // index of the next symbol, or -1 at the end
    const char * text; // non-owning pointer into the source string
    size_t n;          // length in bytes; set to 0 once merged into a neighbor
};

static_assert(std::is_trivially_copyable<llm_symbol>::value, "llm_symbol is not trivially copyable");
89
90//
91// SPM tokenizer
92// original implementation:
93// https://github.com/ggml-org/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
94//
95
96struct llm_bigram_spm {
97 struct comparator {
98 bool operator()(llm_bigram_spm & l, llm_bigram_spm & r) {
99 return (l.score < r.score) || (l.score == r.score && l.left > r.left);
100 }
101 };
102 using queue_storage = std::vector<llm_bigram_spm>;
103 using queue = std::priority_queue<llm_bigram_spm, queue_storage, comparator>;
104 llm_symbol::index left;
105 llm_symbol::index right;
106 float score;
107 size_t size;
108};
109
// SPM tokenizer "model": no precomputed per-vocab state is needed; all
// per-call state lives in llm_tokenizer_spm_session.
struct llm_tokenizer_spm : llm_tokenizer {
    llm_tokenizer_spm(const llama_vocab & /*vocab*/) {}
};
113
// Per-call state for SPM (SentencePiece-style) tokenization. Starts from
// individual UTF-8 characters and greedily merges the adjacent pair whose
// concatenation is the highest-scoring token in the vocab.
struct llm_tokenizer_spm_session {
    llm_tokenizer_spm_session(const llama_vocab & vocab) : vocab(vocab) {}

    // Tokenize `text` and append the resulting token ids to `output`.
    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        // split string into utf8 chars
        int index = 0;
        size_t offs = 0;
        while (offs < text.size()) {
            llm_symbol sym;
            size_t len = unicode_len_utf8(text[offs]);
            sym.text = text.c_str() + offs;
            // clamp to the remaining bytes in case of a truncated UTF-8 sequence
            sym.n = std::min(len, text.size() - offs);
            offs += sym.n;
            sym.prev = index - 1;
            sym.next = offs == text.size() ? -1 : index + 1;
            index++;
            symbols.emplace_back(sym);
        }

        // seed the work queue with all possible 2-character tokens.
        for (int i = 1; i < (int) symbols.size(); ++i) {
            try_add_bigram(i - 1, i);
        }

        // keep substituting the highest frequency pairs for as long as we can.
        while (!work_queue.empty()) {
            auto bigram = work_queue.top();
            work_queue.pop();

            auto & left_sym = symbols[bigram.left];
            auto & right_sym = symbols[bigram.right];

            // if one of the symbols already got merged, skip it.
            // (the size check rejects stale queue entries whose symbols have
            // since grown through other merges)
            if (left_sym.n == 0 || right_sym.n == 0 ||
                left_sym.n + right_sym.n != bigram.size) {
                continue;
            }

            // merge the right sym into the left one
            left_sym.n += right_sym.n;
            right_sym.n = 0;

            //LLAMA_LOG_INFO("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);

            // remove the right sym from the chain
            left_sym.next = right_sym.next;
            if (right_sym.next >= 0) {
                symbols[right_sym.next].prev = bigram.left;
            }

            // find more substitutions
            try_add_bigram(left_sym.prev, bigram.left);
            try_add_bigram(bigram.left, left_sym.next);
        }

        // walk the surviving linked list and emit tokens
        for (int i = 0; i != -1; i = symbols[i].next) {
            auto & symbol = symbols[i];
            resegment(symbol, output);
        }
    }

private:
    // Emit token ids for `symbol`: either directly (if its text is a vocab
    // token), by recursively splitting a recorded merge back into its two
    // halves, or - as a last resort - as individual byte tokens.
    void resegment(llm_symbol & symbol, std::vector<llama_token> & output) {
        auto text = std::string(symbol.text, symbol.n);
        auto token = vocab.text_to_token(text);

        // Do we need to support is_unused?
        if (token != LLAMA_TOKEN_NULL) {
            output.push_back(token);
            return;
        }

        const auto p = rev_merge.find(text);

        if (p == rev_merge.end()) {
            // output any symbols that did not form tokens as bytes.
            output.reserve(output.size() + symbol.n);
            for (int j = 0; j < (int)symbol.n; ++j) {
                llama_token id = vocab.byte_to_token(symbol.text[j]);
                output.push_back(id);
            }
            return;
        }

        resegment(symbols[p->second.first], output);
        resegment(symbols[p->second.second], output);
    }

    // Queue the pair (left, right) as a merge candidate if the concatenated
    // text is a valid vocab token. Records the pair in rev_merge so that
    // resegment() can later undo the merge if needed.
    void try_add_bigram(int left, int right) {
        if (left == -1 || right == -1) {
            return;
        }
        // the two symbols are adjacent slices of the same backing string, so a
        // single string spanning both can be built from the left pointer
        const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n);
        auto token = vocab.text_to_token(text);

        if (token == LLAMA_TOKEN_NULL) {
            return;
        }

        if (static_cast<uint32_t>(token) >= vocab.n_tokens()) {
            return;
        }

        const auto & tok_data = vocab.get_token_data(token);

        llm_bigram_spm bigram;
        bigram.left = left;
        bigram.right = right;
        bigram.score = tok_data.score;
        bigram.size = text.size();

        work_queue.push(bigram);

        // Do we need to support is_unused?
        rev_merge[text] = std::make_pair(left, right);
    }

    const llama_vocab & vocab;
    // currently unused
    // const llm_tokenizer_spm * spm_tokenizer;

    std::vector<llm_symbol> symbols;            // working symbol list / linked list storage
    llm_bigram_spm::queue work_queue;           // pending merge candidates, best score first
    std::map<std::string, std::pair<int, int>> rev_merge; // merged text -> the symbol pair it came from
};
239
240//
241// BPE tokenizer
242// adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License]
243// tried to simplify unicode stuff, so most likely does not work 100% correctly!
244//
245
246// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused
247
// Drop-in std::priority_queue extension that adds move-aware extraction.
// std::priority_queue::top() only exposes a const reference, so popping an
// element by value normally costs a copy; pop_move() moves it out instead.
template<typename T, typename Container = std::vector<T>, typename Compare = std::less<typename Container::value_type>>
class llama_priority_queue : public std::priority_queue<T, Container, Compare> {
public:
    using std::priority_queue<T, Container, Compare>::priority_queue;

    // Remove and return the highest-priority element without copying it.
    // Moving from the front before std::pop_heap is safe: the moved-from
    // element remains valid for the swap/reheapify that pop_heap performs.
    // [[nodiscard]]: discarding the result would silently lose the element.
    [[nodiscard]] T pop_move() {
        T item = std::move(this->c.front());
        std::pop_heap(this->c.begin(), this->c.end(), this->comp);
        this->c.pop_back();
        return item;
    }

    // plain pop() is deleted so callers must use pop_move() and cannot
    // accidentally drop an element on the floor
    void pop() = delete;
};
262
// A candidate merge of two adjacent symbols for the BPE tokenizer.
// Candidates are ordered by merge rank (lower rank pops first); ties are
// broken by the leftmost position to keep tokenization deterministic.
struct llm_bigram_bpe {
    struct comparator {
        bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
            return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
        }
    };

    using queue_storage = std::vector<llm_bigram_bpe>;
    using queue = llama_priority_queue<llm_bigram_bpe, queue_storage, comparator>;
    llm_symbol::index left;  // index of the left symbol of the pair
    llm_symbol::index right; // index of the right symbol of the pair
    std::string text;        // concatenated text at queue time; used to detect stale entries
    int rank;                // BPE merge rank from the vocab (lower merges earlier)
    size_t size;             // total size in bytes of the merged text
};
278
// BPE tokenizer "model": selects the set of pre-tokenization regexes for the
// vocabulary's pre-tokenizer type. unicode_regex_split() applies them in
// order before the BPE merge loop runs on each resulting fragment.
struct llm_tokenizer_bpe : llm_tokenizer {
    llm_tokenizer_bpe(const llama_vocab & vocab) {
        GGML_ASSERT(vocab.get_type() == LLAMA_VOCAB_TYPE_BPE);
        switch (vocab.get_pre_type()) {
            case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
                regex_exprs = {
                    // original regex from tokenizer.json
                    //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",

                    // adapted: https://github.com/ggml-org/llama.cpp/pull/6920#issuecomment-2080233989
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_DBRX:
            case LLAMA_VOCAB_PRE_TYPE_SMAUG:
                regex_exprs = {
                    // same as llama3
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
                regex_exprs = {
                    "[\r\n]",
                    "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
                    "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
                    "\\s+$",
                    "[一-龥ࠀ-一가-]+",
                    "\\p{N}+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
            case LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE:
                regex_exprs = {
                    "\\p{N}{1,3}",
                    "[一-龥-ゟ゠-ヿ]+",
                    "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_YOUTU:
                regex_exprs = {
                    "[가-힣ㄱ-ㆎ]+|[!…“”‘’—:;,、-〿︰-﹏]+|[ㄅ-ㄯ]+|[一-龥-ゟ゠-ヿ]+",
                    "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
                regex_exprs = {
                    "[\r\n]",
                    "\\s?\\p{L}+",
                    "\\s?\\p{P}+",
                    "[一-龥ࠀ-一가-]+",
                    "\\p{N}",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_FALCON:
                regex_exprs = {
                    "[\\p{P}\\$\\+<=>\\^~\\|`]+",
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                    "[0-9][0-9][0-9]",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_STARCODER:
            case LLAMA_VOCAB_PRE_TYPE_REFACT:
            case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
            case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
            case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
            case LLAMA_VOCAB_PRE_TYPE_EXAONE:
            case LLAMA_VOCAB_PRE_TYPE_MINERVA:
                regex_exprs = {
                    "\\p{N}",
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_GPT2:
            case LLAMA_VOCAB_PRE_TYPE_MPT:
            case LLAMA_VOCAB_PRE_TYPE_OLMO:
            case LLAMA_VOCAB_PRE_TYPE_JAIS:
            case LLAMA_VOCAB_PRE_TYPE_TRILLION:
            case LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING:
                regex_exprs = {
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
            case LLAMA_VOCAB_PRE_TYPE_QWEN2:
            case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
            case LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_QWEN35:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_PORO:
            case LLAMA_VOCAB_PRE_TYPE_BLOOM:
            case LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH:
                regex_exprs = {
                    " ?[^(\\s|.,!?…。,、।۔،)]+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_CHATGLM4:
                regex_exprs = {
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_VIKING:
                regex_exprs = {
                    " ?[^(\\s|.,!?…。,、।۔،)]+",
                    "\\p{N}",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
                // original regex from tokenizer.json
                // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
                regex_exprs = {
                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_CHAMELEON:
                // Note: in theory, the special token (sentinel and image token) regex_exprs below
                // are unnecessary, as they are split in `tokenizer_st_partition` anyway.
                // However, since the upstream pre-tokenizer uses them, they are also
                // included here (see https://huggingface.co/facebook/chameleon-7b).
                regex_exprs = {
                    "<sentinel:[0-9]+>", // Sentinel tokens
                    "(IMGIMG)((A|B|C|D|E|F|G|H|I){1,4})Z", // Image tokens
                    "([\\t\\n]| | )", // directly from tokenizer.json
                    "\\p{N}", // Individual digits
                    "[\\p{P}!-/:-@\\[-`{-~]", // Punctuation, Isolated
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_GPT4O:
            case LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_KIMI_K2:
                regex_exprs = {
                    // K2 trigger pattern - this will activate the custom K2 handler in unicode.cpp
                    // The custom handler implements all K2 patterns with proper Han character exclusion
                    "\\p{Han}+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_SUPERBPE:
                regex_exprs = {
                    "\\p{N}+",
                    "(?=(\\d{3})+(?!\\d))",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_BAILINGMOE:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
                    // FIXME? Changed possessive quantifiers (?+ and ++) to greedy to avoid errors and imatrix hanging (tried atomic grouping but it's not supported?)
                    "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_SEED_CODER:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\r\n]+|\\s*[\r\n]+|\\s+(?!\\S)|\\s+"
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_GROK_2:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_AFMOE:
                regex_exprs = {
                    // Digit handling - uses custom implementation in unicode.cpp
                    // Groups digits with leading 1-2 based on total length modulo 3
                    "\\p{AFMoE_digits}",
                    // CJK and Asian scripts (using direct Unicode literals)
                    "[一-鿿㐀-䶿豈--ゟ゠-ヿ・-゚⼀-เ--ក-က-႟ꩠ-ꩿꧠ-가-ᄀ-ᇿ]+",
                    // Main BPE pattern
                    "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\\r\\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
                };
                break;
            default:
                // default regex for BPE tokenization pre-processing
                regex_exprs = {
                    "[\\p{P}\\$\\+<=>\\^~\\|]+",
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                    "\\p{N}+",
                    "[0-9][0-9][0-9]",
                };
                break;
        }
    }

    // pre-tokenization regexes, applied in order by unicode_regex_split()
    std::vector<std::string> regex_exprs;
};
492
// Per-call state for BPE tokenization. A session first splits the text with
// the tokenizer's pre-tokenization regexes, then BPE-merges each fragment
// using the vocab's merge ranks (lowest rank merges first).
struct llm_tokenizer_bpe_session {
    llm_tokenizer_bpe_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}

    // Append a single token id to `output`.
    static void append(const llama_token token_id, std::vector<llama_token> & output) {
        output.push_back(token_id);
    }

    // Append BOS if the vocab requests it; returns true if a token was added.
    bool append_bos(std::vector<llama_token> & output) const {
        if (vocab.get_add_bos()) {
            GGML_ASSERT(vocab.token_bos() != LLAMA_TOKEN_NULL);
            output.push_back(vocab.token_bos());
            return true;
        }
        return false;
    }

    // Append EOS if the vocab requests it; returns true if a token was added.
    bool append_eos(std::vector<llama_token> & output) const {
        if (vocab.get_add_eos()) {
            GGML_ASSERT(vocab.token_eos() != LLAMA_TOKEN_NULL);
            output.push_back(vocab.token_eos());
            return true;
        }
        return false;
    }

    // Warn when an automatically added BOS/EOS duplicates one already present
    // in the prompt text itself.
    void check_double_bos_eos(const std::vector<llama_token> & output) const {
        if (vocab.get_add_bos() && output.size() >= 2 && output[1] == vocab.token_bos()) {
            LLAMA_LOG_WARN(
                "%s: Added a BOS token to the prompt as specified by the model but the prompt "
                "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
                "Are you sure this is what you want?\n", __FUNCTION__);
        }
        if (vocab.get_add_eos() && output.size() >= 2 && *(output.end()-2) == vocab.token_eos()) {
            LLAMA_LOG_WARN(
                "%s: Added a EOS token to the prompt as specified by the model but the prompt "
                "also ends with a EOS token. So now the final prompt ends with 2 EOS tokens. "
                "Are you sure this is what you want?\n", __FUNCTION__);
        }
    }

    // Tokenize `text` and append the resulting token ids to `output`.
    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        int final_prev_index = -1;
        // pre-tokenize: split the text into word fragments with the regexes
        const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);

        symbols_final.clear();

        for (const auto & word : word_collection) {
            work_queue = llm_bigram_bpe::queue();
            symbols.clear();

            int index = 0;
            size_t offset = 0;

            //if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
            // if the whole fragment is already a token, emit it as one symbol
            if (vocab.get_ignore_merges() && vocab.text_to_token(word) != LLAMA_TOKEN_NULL) {
                symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
                offset = word.size();
            }

            // otherwise start from individual UTF-8 characters
            while (offset < word.size()) {
                llm_symbol sym;
                size_t char_len = std::min(word.size() - offset, (size_t) unicode_len_utf8(word[offset]));
                sym.text = word.c_str() + offset;
                sym.n = char_len;
                offset += sym.n;
                sym.prev = index - 1;
                sym.next = offset == word.size() ? -1 : index + 1;
                index++;
                symbols.emplace_back(sym);
            }
            for (int i = 1; i < (int) symbols.size(); ++i) {
                add_new_bigram(i - 1, i);
            }

            // build token(s)
            while (!work_queue.empty()) {
                auto bigram = work_queue.pop_move();

                auto & left_symbol = symbols[bigram.left];
                auto & right_symbol = symbols[bigram.right];

                if (left_symbol.n == 0 || right_symbol.n == 0) {
                    continue;
                }
                std::string left_token = std::string(left_symbol.text, left_symbol.n);
                std::string right_token = std::string(right_symbol.text, right_symbol.n);
                if (left_token + right_token != bigram.text) {
                    continue; // Skip this bigram if it's outdated
                }

                // merge the right sym into the left one
                left_symbol.n += right_symbol.n;
                right_symbol.n = 0;

                // remove the right sym from the chain
                left_symbol.next = right_symbol.next;
                if (right_symbol.next >= 0) {
                    symbols[right_symbol.next].prev = bigram.left;
                }

                add_new_bigram(left_symbol.prev, bigram.left); // left side of current symbol
                add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol
            }

            // add the finished tokens to the final list keeping correct order for next and prev
            for (auto & sym : symbols) {
                if (sym.n > 0) {
                    sym.prev = final_prev_index;
                    sym.next = -1;
                    if (final_prev_index != -1) {
                        symbols_final[final_prev_index].next = symbols_final.size();
                    }
                    symbols_final.emplace_back(sym);
                    final_prev_index = symbols_final.size() - 1;
                }
            }
        }

        symbols = symbols_final;

        if (!symbols.empty()) {
            for (int i = 0; i != -1; i = symbols[i].next) {
                auto & symbol = symbols[i];
                if (symbol.n == 0) {
                    continue;
                }

                const std::string str = std::string(symbol.text, symbol.n);
                const auto token = vocab.text_to_token(str);

                if (token == LLAMA_TOKEN_NULL) {
                    // unknown fragment: fall back to emitting its individual bytes
                    for (auto j = str.begin(); j != str.end(); ++j) {
                        std::string byte_str(1, *j);
                        auto token_multibyte = vocab.text_to_token(byte_str);
                        if (token_multibyte != LLAMA_TOKEN_NULL) {
                            output.push_back(token_multibyte);
                        }
                    }
                } else {
                    output.push_back(token);
                }
            }
        }
    }

private:
    // Queue the pair (left, right) as a merge candidate if the concatenated
    // text has a BPE merge rank in the vocab.
    void add_new_bigram(int left, int right) {
        if (left == -1 || right == -1) {
            return;
        }
        std::string left_token = std::string(symbols[left].text, symbols[left].n);
        std::string right_token = std::string(symbols[right].text, symbols[right].n);

        int rank_found = -1;

        rank_found = vocab.find_bpe_rank(left_token, right_token);

        if (rank_found < 0) {
            return;
        }

        llm_bigram_bpe bigram;

        bigram.left = left;
        bigram.right = right;
        bigram.text = left_token + right_token;
        bigram.size = left_token.size() + right_token.size();
        bigram.rank = rank_found;

        work_queue.push(bigram);
    }

    const llama_vocab & vocab;
    const llm_tokenizer_bpe & tokenizer;

    std::vector<llm_symbol> symbols;       // working symbols for the current fragment
    std::vector<llm_symbol> symbols_final; // accumulated symbols across all fragments
    llm_bigram_bpe::queue work_queue;      // pending merge candidates, best rank first
};
672
673//
674// WPM tokenizer
675//
676
// WPM (WordPiece) tokenizer "model": no precomputed per-vocab state is
// needed; all per-call state lives in llm_tokenizer_wpm_session.
struct llm_tokenizer_wpm : llm_tokenizer {
    llm_tokenizer_wpm(const llama_vocab & /*vocab*/) {}
};
680
// Per-call state for WPM (WordPiece) tokenization: normalize the text,
// split it into words, then greedily match the longest vocab token at each
// position within a word; a word with no full match becomes a single UNK.
struct llm_tokenizer_wpm_session {
    llm_tokenizer_wpm_session(const llama_vocab & vocab) : vocab(vocab) {}

    // Tokenize `text` and append the resulting token ids to `output`.
    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        // normalize and split by whitespace
        std::vector<std::string> words = preprocess(text);
        // bos token prepended already

        // find the longest tokens that form the words
        for (const std::string & word : words) {
            // skip empty words
            if (word.size() == 0) {
                continue;
            }

            // prepend phantom space
            const std::string word1 = "\xe2\x96\x81" + word;
            const int n = word1.size();

            // remember where this word's tokens start so they can be discarded
            // wholesale if the word cannot be fully matched
            const size_t current_tokens = output.size();

            // we're at the start of a new word
            // move through character position in word
            for (int i = 0; i < n; ++i) {
                // loop through possible match length
                // (capped at the longest token in the vocab)
                bool match = false;
                for (int j = std::min(n, i + vocab.max_token_len() + 1); j > i; j--) {
                    auto id = vocab.text_to_token(word1.substr(i, j - i));
                    if (id != LLAMA_TOKEN_NULL) {
                        output.push_back(id);
                        match = true;
                        i = j - 1;
                        break;
                    }
                }

                if (!match) { // discard all
                    output.resize(current_tokens);
                    break; // and discard next tokens
                }
            }

            // we didn't find any matches for this word
            if (current_tokens == output.size()) {
                output.push_back(vocab.token_unk());
            }
        }
    }

    // TODO: reduce string copies by using cpts_offs array
    // Normalize `text` (NFD decomposition, lowercasing, control-char removal)
    // and split it into words; punctuation, ASCII symbols and CJK characters
    // become single-character words of their own.
    static std::vector<std::string> preprocess(const std::string & text) {
        const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
        std::vector<std::string> words(1, "");

        for (const uint32_t cpt : cpts_nfd) {
            const auto flags = unicode_cpt_flags_from_cpt(cpt);

            if (flags.is_whitespace) {
                if (words.back().size()) { // finish previous word if any
                    words.emplace_back();
                }
                continue;
            }

            assert (!flags.is_separator);
            if (cpt == 0 || cpt == 0xFFFD || flags.is_control) {
                continue;
            }

            const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
            if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) {
                if (words.back().size()) { // finish previous word if any
                    words.emplace_back();
                }
                words.back() = s; // single char word
                words.emplace_back(); // start a new word
            } else {
                words.back() += s; // append char to word
            }
        }

        // drop a trailing empty word left by the splitting above
        if (!words.back().size()) {
            words.pop_back();
        }

        return words;
    }

    // True when the code point falls in one of the CJK blocks listed below
    // (mirrors the ranges used by the HF BERT tokenizer implementation).
    static bool is_chinese_char(uint32_t cpt) {
        return
            (cpt >= 0x04E00 && cpt <= 0x09FFF) ||
            (cpt >= 0x03400 && cpt <= 0x04DBF) ||
            (cpt >= 0x20000 && cpt <= 0x2A6DF) ||
            (cpt >= 0x2A700 && cpt <= 0x2B73F) ||
            (cpt >= 0x2B740 && cpt <= 0x2B81F) ||
            (cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
            (cpt >= 0x0F900 && cpt <= 0x0FAFF) ||
            (cpt >= 0x2F800 && cpt <= 0x2FA1F);
            //(cpt >= 0x3000 && cpt <= 0x303F) ||
            //(cpt >= 0xFF00 && cpt <= 0xFFEF);
    }

private:
    const llama_vocab & vocab;
    // currently unused
    // const llm_tokenizer_wpm * wpm_tokenizer;
};
788
789//
790// UGM tokenizer
791//
792
// UGM (unigram) tokenizer model, following SentencePiece. Parses the optional
// precompiled character map used for input normalization and builds tries
// over the vocab tokens for the Viterbi search in llm_tokenizer_ugm_session.
struct llm_tokenizer_ugm : llm_tokenizer {
    llm_tokenizer_ugm(const llama_vocab & vocab, const std::vector<char> & precompiled_charsmap) {
        if (precompiled_charsmap.size() > 0) {
            size_t charsmap_offset = 0;

            // First four bytes of precompiled_charsmap contains length of binary
            // blob containing XOR-compressed compact double array (XCDA) entries
            // NOTE(review): unaligned pointer-cast read; presumably assumes a
            // little-endian host matching the serialized format - confirm if porting
            uint32_t xcda_blob_size = *(const uint32_t *) &precompiled_charsmap[0];
            charsmap_offset += sizeof(xcda_blob_size);
            if (xcda_blob_size + charsmap_offset >= precompiled_charsmap.size()) {
                throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
            }

            // Next xcda_blob_size bytes contain entries of XOR-compressed compact
            // double array (XCDA). Each entry is bit-packed into a 32-bit integer.
            xcda_array = (const uint32_t *) &precompiled_charsmap[charsmap_offset];
            xcda_array_size = xcda_blob_size / sizeof(uint32_t);
            charsmap_offset += xcda_blob_size;

            // Remaining bytes of precompiled charsmap contain null-terminated
            // replacement strings for prefixes matched by the XCDA.
            prefix_replacements = &precompiled_charsmap[charsmap_offset];
            prefix_replacements_size = precompiled_charsmap.size() - charsmap_offset;
        }

        for (uint32_t id = 0; id < vocab.n_tokens(); ++id) {
            const auto & token_data = vocab.get_token_data(id);

            // track the score range of normal tokens; min_score later derives
            // the unknown-token score
            if (vocab.is_normal(id)) {
                min_score = std::min<float>(min_score, token_data.score);
                max_score = std::max<float>(max_score, token_data.score);
            }

            if (vocab.is_normal(id) ||
                vocab.is_user_defined(id) ||
                vocab.is_unused(id)) {
                token_matcher.insert(token_data.text.data(), token_data.text.size(), id);
            }

            if (vocab.is_user_defined(id)) {
                user_defined_token_matcher.insert(token_data.text.data(), token_data.text.size());
            }
        }

        unknown_token_score = min_score - unknown_token_score_penalty;
    }

    // escaped space symbol - U+2581 (Lower One Eighth Block)
    const std::string escaped_space = "\xE2\x96\x81";

    // Views into the precompiled charsmap (non-owning; assumes the
    // precompiled_charsmap buffer outlives this tokenizer - TODO confirm)
    const char * prefix_replacements = NULL;
    size_t prefix_replacements_size = 0;

    const uint32_t * xcda_array = NULL;
    size_t xcda_array_size = 0;

    // trie of user-defined tokens only
    struct naive_trie user_defined_token_matcher;

    // score range over normal tokens
    float min_score = FLT_MAX;
    float max_score = -FLT_MAX;

    float unknown_token_score_penalty = 10.0;
    float unknown_token_score;

    // trie of all matchable tokens (normal, user-defined, unused), keyed by
    // their UTF-8 text and storing the token id at the terminal node
    struct naive_trie token_matcher;
};
859
860struct llm_tokenizer_ugm_session {
861 llm_tokenizer_ugm_session(const llama_vocab & vocab, const llm_tokenizer_ugm & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
862
863 /* This implementation is based on SentencePiece optimized Viterbi algorithm for
864 * unigram language models. The general idea is to:
865 * - move along the input sequence in steps of one UTF code point,
866 * - at each step find all possible tokenizations of the prefix by
867 * traversing the tokens trie,
868 * - for each tokenization store the best one so far (by higher score)
869 * - use the position in sequence after given token as an index to store
870 * results
871 * - if there was no valid tokenization of the current UTF code point
872 * then use unknown token with additional score penalty
873 * After processing the whole sequence we backtrack from the end to get
874 * the best tokenization.
875 */
    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        // get current size of output (only this call's tokens are reversed at the end)
        size_t output_size = output.size();

        // normalize the input first
        std::string normalized;
        normalize(text, &normalized);
        size_t input_len = normalized.size();
        if (input_len == 0) {
            return;
        }

        // initialize score_sum to -DBL_MAX so it will be always lower than sums of token scores
        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -DBL_MAX});
        // at the beginning tokenization score is zero
        tokenization_results[0] = { vocab.token_unk(), 0, 0 };

        for (size_t input_offset = 0; input_offset < input_len;) {
            size_t prefix_offset = input_offset;
            // calculate how many code units are in the currently processed UTF code point
            size_t n_utf8_code_units = std::min<size_t>(unicode_len_utf8(normalized[input_offset]), input_len - input_offset);

            // traverse the token matcher trie to find a matching token
            bool single_codepoint_token_found = false;
            const struct best_tokenization & current_best = tokenization_results[input_offset];
            const struct naive_trie * node = tokenizer.token_matcher.traverse(normalized[prefix_offset++]);

            while (prefix_offset <= input_len && node != NULL) {
                // check if we found valid token in prefix
                if (node->has_value) {
                    // check if it corresponds to the whole UTF code point
                    if (prefix_offset - input_offset == n_utf8_code_units) {
                        single_codepoint_token_found = true;
                    }
                    llama_token token_id = node->value;
                    const auto & token_data = vocab.get_token_data(token_id);

                    // we set the user-defined token scores to 0 to make them more likely to be selected
                    // (normal token scores are log probabilities, so they are negative)
                    // score type is double here to make tokenization results exactly
                    // the same as in the HF tokenizer using SentencePiece
                    const double token_score = vocab.is_user_defined(token_id) ? 0.0 : token_data.score;
                    const double challenger_score = current_best.score_sum + token_score;
                    struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                    if (challenger_score > current_champ.score_sum) {
                        struct best_tokenization challenger = { token_id, input_offset, challenger_score };
                        current_champ = challenger;
                    }
                }
                // when prefix_offset == input_len this reads normalized[input_len], which is
                // the terminating '\0' (well-defined for std::string); the
                // `prefix_offset <= input_len` bound then ends the loop on the next iteration
                node = node->traverse(normalized[prefix_offset++]);
            }

            // if we didn't find a valid token corresponding to the whole UTF code point
            // then use unknown token as the tokenization of this UTF code point
            if (!single_codepoint_token_found) {
                const double challenger_score = current_best.score_sum + tokenizer.unknown_token_score;
                prefix_offset = input_offset + n_utf8_code_units;
                struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                if (challenger_score > current_champ.score_sum) {
                    struct best_tokenization challenger = { vocab.token_unk(), input_offset, challenger_score };
                    current_champ = challenger;
                }
            }

            // move to the next UTF code point
            input_offset += n_utf8_code_units;
        }

        // now backtrack from the end to gather token ids of the best tokenization
        // merge sequences of consecutive unknown tokens into single unknown tokens
        // note: `tokenization` is a reference bound to tokenization_results[input_len];
        // the loop "increment" copies the predecessor entry into that slot, which is
        // safe because only the referenced slot is read afterwards
        bool is_prev_unknown = false;
        for (struct best_tokenization & tokenization = tokenization_results[input_len]; ; tokenization = tokenization_results[tokenization.input_offset]) {
            bool is_unknown = tokenization.token_id == vocab.token_unk();
            if (!(is_prev_unknown && is_unknown)) {
                output.push_back(tokenization.token_id);
            }
            if (tokenization.input_offset == 0) {
                break;
            }
            is_prev_unknown = is_unknown;
        }

        // reverse the output since we added tokens starting from the end of the input
        std::reverse(output.begin() + output_size, output.end());
    }
961
962private:
963
964 // helper structure for returning normalization results
965 struct normalization_result {
966 const char * normalized;
967 size_t normalized_len;
968 size_t consumed_input;
969 };
970
    // Apply SentencePiece-style normalization to `input` and write the result to
    // `normalized`: repeatedly normalizes a prefix via normalize_prefix(), then
    // rewrites whitespace according to the vocab flags (escaped space symbol,
    // optional prepended/appended space, optional merging of space runs).
    // normalize_prefix() always consumes at least one byte here (input_offset is
    // strictly below input.size()), so the outer loop terminates.
    void normalize(const std::string& input, std::string * normalized) {
        normalized->clear();
        normalized->reserve(input.size() * 3);

        // the string every space is replaced with (escaped space symbol or plain " ")
        const std::string space = vocab.get_escape_whitespaces() ? tokenizer.escaped_space : " ";

        const bool shall_prepend_space = !vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix();
        const bool shall_append_space = vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix();
        const bool shall_merge_spaces = vocab.get_remove_extra_whitespaces();

        bool is_space_prepended = false;
        bool processing_non_ws = false;  // true while inside a run of non-space characters

        size_t input_len = input.size();

        for (size_t input_offset = 0; input_offset < input_len; ) {
            auto norm_res = normalize_prefix(input, input_offset);
            for (size_t i = 0; i < norm_res.normalized_len; i++) {
                char c = norm_res.normalized[i];
                if (c != ' ') {
                    if (!processing_non_ws) {
                        processing_non_ws = true;
                        // when merging spaces, a run of spaces is emitted lazily as a
                        // single `space` right before the next non-space character
                        if ((shall_prepend_space && !is_space_prepended) || shall_merge_spaces) {
                            normalized->append(space);
                            is_space_prepended = true;
                        }
                    }
                    normalized->push_back(c);
                } else {
                    if (processing_non_ws) {
                        processing_non_ws = false;
                    }
                    if (!shall_merge_spaces) {
                        normalized->append(space);
                    }
                }
            }

            input_offset += norm_res.consumed_input;
        }

        if (shall_append_space) {
            normalized->append(space);
        }
    }
1016
1017 /*
1018 * This structure is a view wrapper for XOR-compressed double array (XCDA)
1019 * See Shunsuke Kanda (2018). Space- and Time-Efficient String Dictionaries.
1020 * Each bit-packed entry contains:
1021 * - BASE array value in bits 10-30
1022 * - LCHECK array value in bits 0-7
1023 * - LEAF array value in bit 9
1024 * Entries containing indexes of replacement sequences have set bit 31
1025 */
1026 struct xcda_array_view {
1027 public:
1028 xcda_array_view(const uint32_t * xcda_array, size_t xcda_array_size) : xcda_array(xcda_array), xcda_array_size(xcda_array_size) {
1029 }
1030 uint32_t get_base(size_t index) {
1031 uint32_t packed_node = get_node(index);
1032 return (packed_node >> 10) << ((packed_node & (1U << 9)) >> 6);
1033 }
1034 uint32_t get_lcheck(size_t index) {
1035 uint32_t packed_node = get_node(index);
1036 return packed_node & ((1U << 31) | 0xff);
1037 }
1038 bool get_leaf(size_t index) {
1039 uint32_t packed_node = get_node(index);
1040 return (packed_node >> 8) & 1;
1041 }
1042 uint32_t get_value(size_t index) {
1043 uint32_t packed_node = get_node(index);
1044 return packed_node & ((1U << 31) - 1);
1045 }
1046 private:
1047 uint32_t get_node(size_t index) {
1048 if (index >= xcda_array_size) {
1049 throw std::runtime_error("Index out of array bounds in XCDA array!");
1050 }
1051 return xcda_array[index];
1052 }
1053 const uint32_t * xcda_array;
1054 size_t xcda_array_size;
1055 };
1056
    // this structure stores the best tokenization so far at input_offset
    struct best_tokenization {
        llama_token token_id; // last token of the best tokenization ending at this offset
        size_t input_offset;  // start offset of that token (i.e. the previous DP entry)
        double score_sum;     // total score of the best tokenization ending at this offset
    };
1063
    // Normalize a single prefix of `input` starting at `input_offset`:
    //   1. if the prefix matches a user-defined token, return it unmodified;
    //   2. otherwise find the longest prefix present in the precompiled charsmap
    //      (walked as an XCDA trie) and return its replacement sequence;
    //   3. otherwise return the next UTF-8 code point unmodified, or U+FFFD while
    //      consuming one byte if the input is not valid UTF-8.
    // Returns a view of the normalized bytes and how much input was consumed.
    struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {
        if (input_offset == input.size()) {
            return { &input[input_offset], 0, 0 };
        }

        // if input prefix matches some user-defined token return this token as normalization result
        auto user_defined_token_match =
           tokenizer.user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
        if (user_defined_token_match.second > 0) {
            return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second };
        }

        size_t longest_prefix_length = 0;
        size_t longest_prefix_offset = 0;

        if (tokenizer.xcda_array_size > 0) {
            struct xcda_array_view xcda_view(tokenizer.xcda_array, tokenizer.xcda_array_size);

            // Find the longest normalized sequence matching the input prefix by walking
            // the XOR-compressed compact double array (XCDA) starting from the root node
            // We find the index of the next node by calculating BASE[s] ^ c where s is
            // the index of the previous node and c is a numerical character value
            uint32_t node_index = 0;
            // get BASE of the root node
            node_index = xcda_view.get_base(node_index);
            for (size_t prefix_offset = input_offset; prefix_offset < input.size(); prefix_offset++) {
                unsigned char c = input[prefix_offset];
                if (c == 0) {
                    break;
                }
                node_index ^= c;
                // if value of LCHECK is not c it means that this is not a child of
                // the previous node, so we stop matching
                if (xcda_view.get_lcheck(node_index) != c) {
                    break;
                }
                bool is_leaf = xcda_view.get_leaf(node_index);
                // get BASE of the current node
                node_index ^= xcda_view.get_base(node_index);
                // if LEAF of the current node is true, it means that its BASE points to the node
                // containing index of replacement sequence for currently matched input prefix
                if (is_leaf)
                {
                    longest_prefix_length = prefix_offset - input_offset + 1;
                    // get index of replacement sequence for currently matched input prefix
                    longest_prefix_offset = xcda_view.get_value(node_index);
                }
            }
        }

        if (longest_prefix_length > 0) {
            // we have a match, so return the replacement sequence
            if (longest_prefix_offset >= tokenizer.prefix_replacements_size) {
                throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
            }
            // replacement sequences are NUL-terminated strings inside prefix_replacements
            const char * prefix_replacement = &(tokenizer.prefix_replacements)[longest_prefix_offset];
            return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length };
        }

        // check if the input prefix contains a valid sequence of UTF-8 code units
        try {
            // if yes, return this sequence unmodified
            size_t prefix_offset = input_offset;
            unicode_cpt_from_utf8(input, prefix_offset);
            return { &input[input_offset], prefix_offset - input_offset, prefix_offset - input_offset };
        } catch (std::invalid_argument & /*ex*/) {
            // if no, consume 1 byte and return U+FFFD - REPLACEMENT CHARACTER
            return { "\xEF\xBF\xBD", 3, 1 };
        }
    }
1134
1135 const llama_vocab & vocab;
1136 const llm_tokenizer_ugm & tokenizer;
1137};
1138
1139//
1140// RWKV tokenizer
1141//
1142
// Decode an escaped RWKV vocab entry into its raw byte sequence.
// Supported escapes: \t, \n, \r, \xNN (two hex digits, upper- or lowercase),
// and \<any> which yields <any> literally (e.g. \\ -> backslash).
// All other characters are copied through unchanged.
static std::vector<uint8_t> llama_unescape_rwkv_token(const std::string & escaped) {
    std::vector<uint8_t> output;
    output.reserve(escaped.size());

    // Parser state
    bool escaping = false;     // previous character was an unconsumed backslash
    uint8_t hex_remaining = 0; // number of hex digits still expected after "\x"
    uint8_t hex_acc = 0;       // accumulator for the byte encoded by "\xNN"

    // Step through characters, performing parsing
    for (const char & c : escaped) {
        // If we're parsing a hex code, interpret the next character
        if (hex_remaining != 0) {
            // accept both lowercase and uppercase hex digits
            // (the previous version mapped 'A'-'F' through (c - '0'), producing wrong values)
            uint8_t value;
            if (c >= 'a' && c <= 'f') {
                value = c - 'a' + 10;
            } else if (c >= 'A' && c <= 'F') {
                value = c - 'A' + 10;
            } else {
                value = c - '0';
            }
            hex_acc = (hex_acc << 4) + value;

            hex_remaining -= 1;
            if (hex_remaining == 0) {
                output.push_back(hex_acc);
                hex_acc = 0;
            }

            continue;
        }

        // If we got an escape character, interpret it
        if (escaping) {
            if (c == 't') {
                output.push_back('\t');
            } else if (c == 'n') {
                output.push_back('\n');
            } else if (c == 'r') {
                output.push_back('\r');
            } else if (c == 'x') {
                hex_remaining = 2;
            } else {
                // unknown escape: emit the character literally (covers "\\")
                output.push_back(c);
            }

            escaping = false;
            continue;
        }

        if (c == '\\') {
            escaping = true;
            continue;
        }

        output.push_back(c);
    }

    return output;
}
1196
1197struct llm_tokenizer_rwkv : llm_tokenizer {
1198 llm_tokenizer_rwkv(const llama_vocab & vocab) {
1199 // RWKV supports arbitrary byte tokens, but the vocab struct only supports string tokens.
1200 // For now, we decode the vocab here into the lookup we'll use for tokenization.
1201
1202 // build trie
1203 for (uint32_t id = 0; id < vocab.n_tokens(); ++id) {
1204 const auto & data = vocab.get_token_data(id);
1205 const auto text = llama_unescape_rwkv_token(data.text);
1206 token_matcher.insert((const char *) text.data(), text.size(), id);
1207 }
1208 }
1209
1210 struct naive_trie token_matcher;
1211};
1212
1213struct llm_tokenizer_rwkv_session {
1214 llm_tokenizer_rwkv_session(const llama_vocab & vocab, const llm_tokenizer_rwkv & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
1215
1216 void tokenize(const std::string & text, std::vector<llama_token> & output) {
1217 uint32_t position = 0;
1218 while (position < text.size()) {
1219 const struct naive_trie * node = tokenizer.token_matcher.traverse(text[position]);
1220 if (node == NULL) {
1221 // no matching token found, add unknown token
1222 output.push_back(vocab.token_unk());
1223 position += 1;
1224 continue;
1225 }
1226
1227 // traverse the trie to find the longest matching token
1228 uint32_t token_id = 0;
1229 uint32_t token_length = 0;
1230 while (node != NULL) {
1231 if (node->has_value) {
1232 token_id = node->value;
1233 token_length = position + 1;
1234 }
1235 node = node->traverse(text[++position]);
1236 }
1237
1238 // add the longest matching token
1239 output.push_back(token_id);
1240 position = token_length;
1241 }
1242 }
1243
1244private:
1245 const llama_vocab & vocab;
1246 const llm_tokenizer_rwkv & tokenizer;
1247};
1248
// PLaMo-2 tokenizer: build() flattens the vocabulary into a suffix/piece table,
// then encode() runs a Viterbi-style dynamic program over that table (minimizing
// negated quantized scores) to find the best-scoring tokenization, falling back
// to byte tokens for unknown code points.
struct llm_tokenizer_plamo2 : llm_tokenizer {
    llm_tokenizer_plamo2(const llama_vocab & vocab) {
        build(vocab);
    }

    // Construct the lookup structures (tokens_, bytes_, to_suffix_id_, table_)
    // from the vocabulary. Throws if any of the 256 byte-fallback tokens is missing.
    void build(const llama_vocab & vocab) {
        // Reset internal structures
        tokens_.clear();
        bytes_.assign(256, 0);
        to_suffix_id_.clear();
        table_.clear();

        // Build token list and byte mapping
        std::unordered_map<std::string, float> suffix_to_score;
        std::unordered_map<std::string, llama_token> token_to_id;

        for (size_t token_id = 0; token_id < vocab.n_tokens(); ++token_id) {
            const auto & entry = vocab.get_token_data(token_id);
            tokens_.push_back(entry.text);
            token_to_id[entry.text] = static_cast<llama_token>(token_id);

            // Handle byte tokens (stored as "<0xNN>")
            if (vocab.is_byte(token_id)) {
                if (entry.text.length() == 6 && entry.text.substr(0, 3) == "<0x" && entry.text.back() == '>') {
                    std::string hex_str = entry.text.substr(3, 2);
                    int byte_val = std::stoi(hex_str, nullptr, 16);
                    bytes_[byte_val] = static_cast<llama_token>(token_id);
                }
                continue;
            }

            // Add token and all its suffixes to suffix_to_score
            suffix_to_score[entry.text] = entry.score;

            // Extract suffixes character by character (UTF-8 aware)
            std::vector<uint32_t> cpts = unicode_cpts_from_utf8(entry.text);
            for (size_t i = 1; i < cpts.size(); ++i) {
                std::string suffix;
                for (size_t j = i; j < cpts.size(); ++j) {
                    suffix += unicode_cpt_to_utf8(cpts[j]);
                }
                if (suffix_to_score.find(suffix) == suffix_to_score.end()) {
                    // NaN marks suffixes that are not tokens themselves; they get
                    // INVALID_SCORE in the flattened table below
                    suffix_to_score[suffix] = std::numeric_limits<float>::quiet_NaN();
                }
            }
        }

        // Check that all byte tokens are set
        // NOTE(review): this uses 0 as the "unset" sentinel in bytes_, which assumes
        // token id 0 is never itself a byte token — confirm against PLaMo-2 vocabs
        for (int i = 0; i < 256; ++i) {
            if (bytes_[i] == 0) {
                throw std::runtime_error("Byte token for <0x" + std::to_string(i) + "> is not set");
            }
        }

        // Build suffix list in lexicographical order of reversed strings
        std::vector<std::string> suffixes;
        suffixes.reserve(suffix_to_score.size() + 1);
        for (const auto & pair : suffix_to_score) {
            suffixes.push_back(pair.first);
        }
        suffixes.push_back(""); // Empty suffix

        std::sort(suffixes.begin(), suffixes.end(), [](const std::string & a, const std::string & b) {
            std::string rev_a(a.rbegin(), a.rend());
            std::string rev_b(b.rbegin(), b.rend());
            return rev_a < rev_b;
        });

        // Build suffix_to_id and to_suffix_id_
        std::unordered_map<std::string, int32_t> suffix_to_id;
        int32_t num_pieces = 0;

        for (const auto & suffix : suffixes) {
            suffix_to_id[suffix] = num_pieces;
            if (!suffix.empty()) {
                std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);

                std::string remaining;
                for (size_t i = 1; i < cpts.size(); ++i) {
                    remaining += unicode_cpt_to_utf8(cpts[i]);
                }

                // piece code packs (first code point, suffix id of the remainder);
                // to_suffix_id_ maps it to the table row of the longer suffix
                int64_t piece_code = (static_cast<int64_t>(cpts[0]) << 32) | suffix_to_id[remaining];
                to_suffix_id_[piece_code] = num_pieces;

                // Count number of pieces for this suffix
                int32_t pieces_for_suffix = 1; // sentinel row
                for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
                    std::string piece;
                    for (int32_t i = 0; i < piece_length; ++i) {
                        piece += unicode_cpt_to_utf8(cpts[i]);
                    }
                    if (suffix_to_score.find(piece) != suffix_to_score.end()) {
                        pieces_for_suffix++;
                    }
                }
                num_pieces += pieces_for_suffix;
            } else {
                num_pieces++; // Empty suffix contributes one piece (sentinel row)
            }
        }

        // Build flattened table
        table_.resize(num_pieces, std::vector<int32_t>(4, 0));
        int32_t table_idx = 0;

        for (const auto & suffix : suffixes) {
            // Add all prefixes of the suffix to the table (in decreasing order of length)
            std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);
            for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
                std::string piece;
                for (int32_t i = 0; i < piece_length; ++i) {
                    piece += unicode_cpt_to_utf8(cpts[i]);
                }

                auto score_it = suffix_to_score.find(piece);
                if (score_it == suffix_to_score.end()) {
                    continue;
                }

                table_[table_idx][TABLE_PIECE_LENGTH] = piece_length;
                auto token_it = token_to_id.find(piece);
                table_[table_idx][TABLE_TOKEN_ID] = (token_it != token_to_id.end()) ? token_it->second : -1;

                // scores are quantized to fixed point with 4 decimal places;
                // NaN (suffix that is not a token) becomes INVALID_SCORE
                float score = score_it->second;
                table_[table_idx][TABLE_SCORE] = std::isfinite(score) ?
                    static_cast<int32_t>(std::round(score * 1e4)) : INVALID_SCORE;
                table_[table_idx][TABLE_PIECE_ID] = suffix_to_id[piece];

                table_idx++;
            }

            // Add sentinel row
            table_[table_idx][TABLE_PIECE_LENGTH] = 1;
            table_[table_idx][TABLE_TOKEN_ID] = -1;
            table_[table_idx][TABLE_SCORE] = UNKNOWN_SCORE;
            table_idx++;
        }
    }

    // Tokenize `text` into token ids using a backwards dynamic program:
    // scores[i] holds the minimal accumulated (negated) score for the input
    // suffix starting at code point i; path[] records the choices for decoding.
    std::vector<llama_token> encode(const std::string & text) const {
        std::vector<uint32_t> unicode_data = unicode_cpts_from_utf8(text);
        // Skip the first code point if it is a BOM (Byte Order Mark)
        if (!unicode_data.empty() && unicode_data[0] == 0xFEFF) {
            unicode_data.erase(unicode_data.begin());
        }

        if (unicode_data.empty()) {
            return {};
        }

        const size_t data_len = unicode_data.size();

        // Initialize scores array (dynamic programming); 1<<60 acts as +infinity
        std::vector<int64_t> scores(data_len + 1, static_cast<int64_t>(1) << 60);
        scores[data_len] = 0;

        // Path array to track best tokenization
        std::vector<std::vector<int32_t>> path(data_len + 1, std::vector<int32_t>(3, 0));

        int32_t suffix_id = 0;

        // Process from end to beginning
        for (int i = static_cast<int>(data_len) - 1; i >= 0; --i) {
            uint32_t c = unicode_data[i];

            // Find next suffix ID
            for (size_t p = suffix_id; p < table_.size(); ++p) {
                int64_t piece_code = (static_cast<int64_t>(c) << 32) | table_[p][TABLE_PIECE_ID];
                auto it = to_suffix_id_.find(piece_code);
                suffix_id = (it != to_suffix_id_.end()) ? it->second : 0;

                if (suffix_id > 0 || table_[p][TABLE_SCORE] == UNKNOWN_SCORE) {
                    break;
                }
            }

            // Update best path over all pieces starting at suffix_id
            // (each suffix's rows end with an UNKNOWN_SCORE sentinel row)
            for (size_t p = suffix_id; p < table_.size(); ++p) {
                int32_t score = table_[p][TABLE_SCORE];
                if (score > INVALID_SCORE) {
                    int32_t piece_length = table_[p][TABLE_PIECE_LENGTH];
                    int64_t s = scores[i + piece_length] - score;

                    if (s < scores[i]) {
                        scores[i] = s;
                        path[i][PATH_TOKEN_LENGTH] = piece_length;
                        path[i][PATH_TOKEN_ID] = table_[p][TABLE_TOKEN_ID];
                        path[i][PATH_NUM_TOKENS] = path[i + piece_length][PATH_NUM_TOKENS] + 1;

                        if (score == UNKNOWN_SCORE) {
                            // Add UTF-8 byte count
                            path[i][PATH_NUM_TOKENS] += (c >= 0x80) + (c >= 0x800) + (c >= 0x10000);
                        }
                    }
                }

                if (score == UNKNOWN_SCORE) {
                    break;
                }
            }
        }

        // Decode the best path
        std::vector<llama_token> token_ids;
        token_ids.reserve(path[0][PATH_NUM_TOKENS]);

        int pos = 0;
        while (pos < static_cast<int>(data_len)) {
            if (path[pos][PATH_TOKEN_ID] >= 0) {
                token_ids.push_back(path[pos][PATH_TOKEN_ID]);
            } else {
                // Fall back to byte tokens: emit the UTF-8 encoding of the code point
                uint32_t c = unicode_data[pos];
                int s = 1 + (c >= 0x80) + (c >= 0x800) + (c >= 0x10000); // UTF-8 length

                for (int i = 0; i < s; ++i) {
                    uint8_t b;
                    if (s == 1) {
                        b = c;
                    } else {
                        if (i == 0) {
                            // leading-byte marker: 0xC0/0xE0/0xF0 for s = 2/3/4
                            b = (0xF00 >> s) & 0xFF;
                        } else {
                            b = 0x80; // continuation-byte marker
                        }
                    }
                    token_ids.push_back(bytes_[b | ((c >> ((s - i - 1) * 6)) & 0x3F)]);
                }
            }

            assert(path[pos][PATH_TOKEN_LENGTH] > 0);
            pos += path[pos][PATH_TOKEN_LENGTH];
        }

        return token_ids;
    }
private:
    // Constants for table structure
    static constexpr int32_t TABLE_PIECE_LENGTH = 0;
    static constexpr int32_t TABLE_TOKEN_ID = 1;
    static constexpr int32_t TABLE_SCORE = 2;
    static constexpr int32_t TABLE_PIECE_ID = 3;

    // Constants for path array
    static constexpr int32_t PATH_TOKEN_LENGTH = 0;
    static constexpr int32_t PATH_TOKEN_ID = 1;
    static constexpr int32_t PATH_NUM_TOKENS = 2;

    // Score constants (quantized fixed-point, 4 decimal places)
    static constexpr int32_t INVALID_SCORE = -20000000; // suffix that is not a token
    static constexpr int32_t UNKNOWN_SCORE = -10000000; // sentinel row / byte fallback

    // List of tokens in the vocabulary
    std::vector<std::string> tokens_;

    // Mapping from byte code point to token ID (for byte fallback)
    std::vector<llama_token> bytes_;

    // Mapping from piece code to suffix ID
    std::unordered_map<int64_t, int32_t> to_suffix_id_;

    // Flattened table representing the Trie structure
    // Each row contains: [piece_length, token_id, score, piece_id]
    std::vector<std::vector<int32_t>> table_;
};
1515
1516struct llm_tokenizer_plamo2_session {
1517 llm_tokenizer_plamo2_session(const llm_tokenizer_plamo2 & tokenizer) : tokenizer(tokenizer) {}
1518
1519 void tokenize(const std::string & text, std::vector<llama_token> & output) {
1520 std::vector<llama_token> tokens = tokenizer.encode(text);
1521 output.insert(output.end(), tokens.begin(), tokens.end());
1522 }
1523
1524private:
1525 const llm_tokenizer_plamo2 & tokenizer;
1526};
1527
1528//
1529// impl
1530//
1531
// tag for fragment_buffer_variant: either an already-resolved special token id
// or a span of raw text that still needs tokenization
typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
    FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
    FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
} FRAGMENT_BUFFER_VARIANT_TYPE;
1536
// one fragment of the tokenizer input: either a resolved special token id, or a
// [offset, offset + length) span of raw text that still needs to be tokenized
struct fragment_buffer_variant {
    // construct a TOKEN fragment; raw_text is bound to the empty _dummy member
    fragment_buffer_variant(llama_token _token)
    :
        type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
        token(_token),
        raw_text(_dummy),
        offset(0),
        length(0) {}

    // construct a RAW_TEXT fragment referencing [_offset, _offset + _length) of _raw_text
    // note: only a reference is stored — _raw_text must outlive this fragment
    fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
    :
        type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
        token((llama_token) - 1),
        raw_text(_raw_text),
        offset(_offset),
        length(_length){
            GGML_ASSERT(_offset >= 0);
            GGML_ASSERT(_length >= 1);
            GGML_ASSERT(offset + length <= raw_text.length());
        }

    const FRAGMENT_BUFFER_VARIANT_TYPE type;
    const llama_token token;      // valid for TOKEN fragments; -1 for RAW_TEXT
    const std::string _dummy;     // empty string for raw_text to reference in TOKEN fragments
    const std::string & raw_text; // meaningful only for RAW_TEXT fragments
    const uint64_t offset;        // start of the span within raw_text
    const uint64_t length;        // length of the span
};
1565
// private implementation of llama_vocab (pimpl): tokenizer model configuration,
// special token ids, token tables, caches, and the tokenizer instance itself
struct llama_vocab::impl {
    uint32_t n_token_types = 0; // for BERT-style token types

    std::string tokenizer_model; // value of LLM_KV_TOKENIZER_MODEL (e.g. "llama", "gpt2")
    std::string tokenizer_pre;   // value of LLM_KV_TOKENIZER_PRE (pre-tokenizer variant)

    enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
    enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;

    int max_token_len = 0; // used for optimizing longest token search

    // default LLaMA special tokens
    // TODO: should we set all of these to LLAMA_TOKEN_NULL?
    llama_token special_bos_id = 1;
    llama_token special_eos_id = 2;
    llama_token special_eot_id = LLAMA_TOKEN_NULL;
    llama_token special_eom_id = LLAMA_TOKEN_NULL;
    llama_token special_unk_id = 0;
    llama_token special_sep_id = LLAMA_TOKEN_NULL;
    llama_token special_pad_id = LLAMA_TOKEN_NULL;
    llama_token special_mask_id = LLAMA_TOKEN_NULL;

    llama_token linefeed_id = 13;

    // fim (fill-in-the-middle) tokens
    llama_token special_fim_pre_id = LLAMA_TOKEN_NULL;
    llama_token special_fim_suf_id = LLAMA_TOKEN_NULL;
    llama_token special_fim_mid_id = LLAMA_TOKEN_NULL;
    llama_token special_fim_pad_id = LLAMA_TOKEN_NULL;
    llama_token special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
    llama_token special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator

    // tokenizer flags
    bool add_space_prefix = false;
    bool add_bos = false;
    bool add_eos = false;
    bool add_sep = false;
    bool ignore_merges = false;
    bool clean_spaces = false;  // clean_up_tokenization_spaces
    bool remove_extra_whitespaces = false;
    bool escape_whitespaces = true;
    bool treat_whitespace_as_suffix = false;

    // token lookup in both directions
    std::unordered_map<std::string, llama_token> token_to_id;
    std::vector<token_data> id_to_token;

    std::vector<llama_token> cache_special_tokens;
    std::vector<std::string> cache_token_to_piece; // llama_token_to_piece(special = true);
    // hash for a (left, right) merge pair; combines the two string hashes,
    // shifting the second so that swapped pairs hash differently
    struct pair_hash {
        size_t operator()(const std::pair<std::string, std::string> & p) const {
            return std::hash<std::string>{}(p.first) ^
                   (std::hash<std::string>{}(p.second) << 1);
        }
    };
    // BPE merge ranks: (left, right) -> merge priority (lower = applied earlier)
    std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> bpe_ranks;

    // set of all tokens that cause "end of generation"
    std::set<llama_token> special_eog_ids;

    std::unique_ptr<llm_tokenizer> tokenizer;

    // raw precompiled charsmap blob for the UGM tokenizer (may be empty)
    std::vector<char> precompiled_charsmap;

    impl(const llama_vocab & vocab) : vocab(vocab) {
    }

    ~impl() = default;

    void load(llama_model_loader & ml, const LLM_KV & kv);

    enum llama_vocab_type get_type() const;

    std::string type_name() const;

    bool is_normal      (llama_token id) const;
    bool is_unknown     (llama_token id) const;
    bool is_control     (llama_token id) const;
    bool is_byte        (llama_token id) const;
    bool is_user_defined(llama_token id) const;
    bool is_unused      (llama_token id) const;
    bool is_eog         (llama_token id) const;

    uint8_t token_to_byte(llama_token id) const;

    llama_token_attr token_get_attr(llama_token id) const;

    void init_tokenizer(enum llama_vocab_type type);

    // split raw-text fragments around occurrences of special tokens
    void tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const;

    std::string token_to_piece_for_cache(
                  llama_token   token,
                         bool   special) const;


    std::vector<llama_token> tokenize(
            const std::string & raw_text,
                         bool   add_special,
                         bool   parse_special = false) const;

    int32_t tokenize(
                   const char * text,
                      int32_t   text_len,
                  llama_token * tokens,
                      int32_t   n_tokens_max,
                         bool   add_special,
                         bool   parse_special) const;

    // does not write null-terminator to buf
    int32_t token_to_piece(
                  llama_token   token,
                         char * buf,
                      int32_t   length,
                      int32_t   lstrip,
                         bool   special) const;

    // use cached data
    const std::string & token_to_piece(llama_token token) const;

    int32_t detokenize(
            const llama_token * tokens,
                      int32_t   n_tokens,
                         char * text,
                      int32_t   text_len_max,
                         bool   remove_special,
                         bool   unparse_special) const;

    std::string detokenize(
            const std::vector<llama_token> & tokens,
                                      bool   special) const;

    void print_info() const;

private:
    const llama_vocab & vocab;
};
1702
1703void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1704 struct gguf_context * ctx = ml.meta.get();
1705
1706 // determine vocab type
1707 {
1708 ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
1709 ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
1710
1711 ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, n_token_types, false);
1712
1713 if (tokenizer_model == "no_vocab" || tokenizer_model == "none") {
1714 type = LLAMA_VOCAB_TYPE_NONE;
1715
1716 // default special tokens
1717 special_bos_id = LLAMA_TOKEN_NULL;
1718 special_eos_id = LLAMA_TOKEN_NULL;
1719 special_unk_id = LLAMA_TOKEN_NULL;
1720 special_sep_id = LLAMA_TOKEN_NULL;
1721 special_pad_id = LLAMA_TOKEN_NULL;
1722 special_mask_id = LLAMA_TOKEN_NULL;
1723 linefeed_id = LLAMA_TOKEN_NULL;
1724
1725 // read vocab size from metadata
1726 uint32_t n_tokens = 0;
1727 if (ml.get_key(LLM_KV_VOCAB_SIZE, n_tokens, false)) {
1728 LLAMA_LOG_WARN("%s: adding %u dummy tokens\n", __func__, n_tokens);
1729 id_to_token.resize(n_tokens);
1730 }
1731
1732 return;
1733 }
1734
1735 if (tokenizer_model == "llama") {
1736 type = LLAMA_VOCAB_TYPE_SPM;
1737
1738 // default special tokens
1739 special_bos_id = 1;
1740 special_eos_id = 2;
1741 special_unk_id = 0;
1742 special_sep_id = LLAMA_TOKEN_NULL;
1743 special_pad_id = LLAMA_TOKEN_NULL;
1744 special_mask_id = LLAMA_TOKEN_NULL;
1745 } else if (tokenizer_model == "bert") {
1746 type = LLAMA_VOCAB_TYPE_WPM;
1747
1748 // default special tokens
1749 special_bos_id = 101;
1750 special_eos_id = LLAMA_TOKEN_NULL;
1751 special_unk_id = 100;
1752 special_sep_id = 102;
1753 special_pad_id = 0;
1754 special_mask_id = 103;
1755
1756 add_sep = true;
1757 } else if (tokenizer_model == "gpt2") {
1758 type = LLAMA_VOCAB_TYPE_BPE;
1759
1760 // read bpe merges and populate bpe ranks
1761 const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
1762 // Kimi-K2 uses custom tokenization without traditional BPE merges
1763 const bool is_kimi_k2 = (tokenizer_pre == "kimi-k2");
1764
1765 if (merges_keyidx == -1) {
1766 if (!is_kimi_k2) {
1767 throw std::runtime_error("cannot find tokenizer merges in model file\n");
1768 }
1769 // Kimi-K2 doesn't need merges, skip
1770 LLAMA_LOG_INFO("%s: Kimi-K2 tokenizer detected, skipping BPE merges\n", __func__);
1771 } else {
1772 const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
1773 for (int i = 0; i < n_merges; i++) {
1774 const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
1775 //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
1776
1777 std::string first;
1778 std::string second;
1779
1780 const size_t pos = word.find(' ', 1);
1781
1782 if (pos != std::string::npos) {
1783 first = word.substr(0, pos);
1784 second = word.substr(pos + 1);
1785 }
1786
1787 bpe_ranks.emplace(std::make_pair(first, second), i);
1788 }
1789 }
1790
1791 // default special tokens
1792 special_bos_id = 11;
1793 special_eos_id = 11;
1794 special_unk_id = LLAMA_TOKEN_NULL;
1795 special_sep_id = LLAMA_TOKEN_NULL;
1796 special_pad_id = LLAMA_TOKEN_NULL;
1797 special_mask_id = LLAMA_TOKEN_NULL;
1798 } else if (tokenizer_model == "t5") {
1799 type = LLAMA_VOCAB_TYPE_UGM;
1800
1801 // default special tokens
1802 special_bos_id = LLAMA_TOKEN_NULL;
1803 special_eos_id = 1;
1804 special_unk_id = 2;
1805 special_sep_id = LLAMA_TOKEN_NULL;
1806 special_pad_id = 0;
1807 special_mask_id = LLAMA_TOKEN_NULL;
1808
1809 const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
1810 if (precompiled_charsmap_keyidx != -1) {
1811 const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
1812 GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
1813
1814 const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
1815 const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
1816 precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
1817#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
1818 // correct endiannes of data in precompiled_charsmap binary blob
1819 uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
1820 *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
1821 assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
1822 size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
1823 uint32_t * xcda_array = (uint32_t *) &precompiled_charsmap[sizeof(uint32_t)];
1824 for (size_t i = 0; i < xcda_array_size; ++i) {
1825 xcda_array[i] = __builtin_bswap32(xcda_array[i]);
1826 }
1827#endif
1828 }
1829 } else if (tokenizer_model == "rwkv") {
1830 type = LLAMA_VOCAB_TYPE_RWKV;
1831
1832 // default special tokens
1833 special_bos_id = LLAMA_TOKEN_NULL;
1834 special_eos_id = LLAMA_TOKEN_NULL;
1835 special_unk_id = LLAMA_TOKEN_NULL;
1836 special_sep_id = LLAMA_TOKEN_NULL;
1837 special_pad_id = LLAMA_TOKEN_NULL;
1838 } else if (tokenizer_model == "plamo2") {
1839 type = LLAMA_VOCAB_TYPE_PLAMO2;
1840
1841 // PLaMo-2 default special tokens (these will be overridden by model config)
1842 special_bos_id = 1; // <|plamo:bos|>
1843 special_eos_id = 2; // <|plamo:eos|>
1844 special_unk_id = 0; // <|plamo:unk|>
1845 special_sep_id = LLAMA_TOKEN_NULL;
1846 special_pad_id = 3; // <|plamo:pad|>
1847 special_mask_id = LLAMA_TOKEN_NULL;
1848 } else {
1849 throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
1850 }
1851
1852 // for now, only BPE models have pre-tokenizers
1853 if (type == LLAMA_VOCAB_TYPE_BPE) {
1854 add_space_prefix = false;
1855 clean_spaces = true;
1856 if (tokenizer_pre.empty()) {
1857 LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
1858 LLAMA_LOG_WARN("%s: \n", __func__);
1859 LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
1860 LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
1861 LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
1862 LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
1863 LLAMA_LOG_WARN("%s: \n", __func__);
1864 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1865 } else if (tokenizer_pre == "default") {
1866 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1867 } else if (
1868 tokenizer_pre == "llama3" ||
1869 tokenizer_pre == "llama-v3" ||
1870 tokenizer_pre == "llama-bpe"||
1871 tokenizer_pre == "falcon3" ||
1872 tokenizer_pre == "falcon-h1" ||
1873 tokenizer_pre == "pixtral" ||
1874 tokenizer_pre == "midm-2.0" ||
1875 tokenizer_pre == "lfm2") {
1876 pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
1877 ignore_merges = true;
1878 add_bos = true;
1879 } else if (
1880 tokenizer_pre == "deepseek-llm") {
1881 pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
1882 clean_spaces = false;
1883 } else if (
1884 tokenizer_pre == "deepseek-coder") {
1885 pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
1886 clean_spaces = false;
1887 } else if (
1888 tokenizer_pre == "deepseek-v3") {
1889 pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
1890 clean_spaces = false;
1891 } else if (
1892 tokenizer_pre == "youtu") {
1893 pre_type = LLAMA_VOCAB_PRE_TYPE_YOUTU;
1894 clean_spaces = false;
1895 ignore_merges = true;
1896 } else if (
1897 tokenizer_pre == "falcon") {
1898 pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON;
1899 } else if (
1900 tokenizer_pre == "mpt") {
1901 pre_type = LLAMA_VOCAB_PRE_TYPE_MPT;
1902 } else if (
1903 tokenizer_pre == "starcoder") {
1904 pre_type = LLAMA_VOCAB_PRE_TYPE_STARCODER;
1905 } else if (
1906 tokenizer_pre == "gpt-2" ||
1907 tokenizer_pre == "phi-2" ||
1908 tokenizer_pre == "jina-es" ||
1909 tokenizer_pre == "jina-de" ||
1910 tokenizer_pre == "gigachat" ||
1911 tokenizer_pre == "jina-v2-es" ||
1912 tokenizer_pre == "jina-v2-de" ||
1913 tokenizer_pre == "a.x-4.0" ||
1914 tokenizer_pre == "mellum" ||
1915 tokenizer_pre == "modern-bert" ) {
1916 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1917 } else if (
1918 tokenizer_pre == "jina-v1-en" ||
1919 tokenizer_pre == "jina-v2-code" ||
1920 tokenizer_pre == "roberta-bpe") {
1921 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1922 add_sep = true;
1923 } else if (
1924 tokenizer_pre == "refact") {
1925 pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
1926 } else if (
1927 tokenizer_pre == "command-r") {
1928 pre_type = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
1929 clean_spaces = false;
1930 } else if (
1931 tokenizer_pre == "qwen2" ||
1932 tokenizer_pre == "deepseek-r1-qwen" ||
1933 tokenizer_pre == "kormo") {
1934 pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
1935 clean_spaces = false;
1936 } else if (
1937 tokenizer_pre == "qwen35") {
1938 pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN35;
1939 clean_spaces = false;
1940 } else if (
1941 tokenizer_pre == "stablelm2") {
1942 pre_type = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
1943 } else if (
1944 tokenizer_pre == "olmo") {
1945 pre_type = LLAMA_VOCAB_PRE_TYPE_OLMO;
1946 } else if (
1947 tokenizer_pre == "dbrx") {
1948 pre_type = LLAMA_VOCAB_PRE_TYPE_DBRX;
1949 } else if (
1950 tokenizer_pre == "smaug-bpe") {
1951 pre_type = LLAMA_VOCAB_PRE_TYPE_SMAUG;
1952 } else if (
1953 tokenizer_pre == "poro-chat") {
1954 pre_type = LLAMA_VOCAB_PRE_TYPE_PORO;
1955 clean_spaces = false;
1956 } else if (
1957 tokenizer_pre == "glm4" ||
1958 tokenizer_pre == "chatglm-bpe") {
1959 pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
1960 special_bos_id = LLAMA_TOKEN_NULL;
1961 } else if (
1962 tokenizer_pre == "viking") {
1963 pre_type = LLAMA_VOCAB_PRE_TYPE_VIKING;
1964 clean_spaces = false;
1965 } else if (
1966 tokenizer_pre == "jais") {
1967 pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS;
1968 } else if (
1969 tokenizer_pre == "tekken") {
1970 pre_type = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
1971 clean_spaces = false;
1972 ignore_merges = true;
1973 add_bos = true;
1974 } else if (
1975 tokenizer_pre == "smollm") {
1976 pre_type = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
1977 clean_spaces = false;
1978 } else if (
1979 tokenizer_pre == "codeshell") {
1980 pre_type = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
1981 } else if (
1982 tokenizer_pre == "bloom") {
1983 pre_type = LLAMA_VOCAB_PRE_TYPE_BLOOM;
1984 } else if (
1985 tokenizer_pre == "gpt3-finnish") {
1986 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
1987 } else if (
1988 tokenizer_pre == "exaone") {
1989 pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE;
1990 } else if (
1991 tokenizer_pre == "exaone4") {
1992 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1993 } else if (
1994 tokenizer_pre == "exaone-moe") {
1995 pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE;
1996 } else if (
1997 tokenizer_pre == "chameleon") {
1998 pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
1999 add_bos = true;
2000 clean_spaces = false;
2001 } else if (
2002 tokenizer_pre == "minerva-7b") {
2003 pre_type = LLAMA_VOCAB_PRE_TYPE_MINERVA;
2004 } else if (
2005 tokenizer_pre == "megrez") {
2006 pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
2007 } else if (
2008 tokenizer_pre == "gpt-4o" ||
2009 tokenizer_pre == "llama4") {
2010 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
2011 clean_spaces = false;
2012 } else if (
2013 tokenizer_pre == "superbpe") {
2014 pre_type = LLAMA_VOCAB_PRE_TYPE_SUPERBPE;
2015 clean_spaces = false;
2016 } else if (
2017 tokenizer_pre == "trillion") {
2018 pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
2019 clean_spaces = false;
2020 } else if (
2021 tokenizer_pre == "granite-docling") {
2022 pre_type = LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING;
2023 clean_spaces = false;
2024 } else if (
2025 tokenizer_pre == "bailingmoe" ||
2026 tokenizer_pre == "bailingmoe2" ||
2027 tokenizer_pre == "llada-moe") {
2028 pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
2029 clean_spaces = false;
2030 } else if (
2031 tokenizer_pre == "seed-coder") {
2032 pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
2033 clean_spaces = false;
2034 } else if (
2035 tokenizer_pre == "hunyuan") {
2036 pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
2037 clean_spaces = false;
2038 } else if (
2039 tokenizer_pre == "hunyuan-dense") {
2040 pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE;
2041 clean_spaces = false;
2042 } else if (
2043 tokenizer_pre == "kimi-k2") {
2044 pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
2045 clean_spaces = false;
2046 } else if (
2047 tokenizer_pre == "grok-2") {
2048 pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
2049 clean_spaces = false;
2050 } else if (
2051 tokenizer_pre == "afmoe") {
2052 pre_type = LLAMA_VOCAB_PRE_TYPE_AFMOE;
2053 clean_spaces = false;
2054 } else if (
2055 tokenizer_pre == "minimax-m2") {
2056 pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
2057 clean_spaces = false;
2058 } else if (
2059 tokenizer_pre == "solar-open") {
2060 pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
2061 clean_spaces = false;
2062 } else {
2063 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
2064 }
2065 } else if (type == LLAMA_VOCAB_TYPE_SPM) {
2066 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2067 add_space_prefix = true;
2068 clean_spaces = false;
2069 add_bos = true;
2070 add_eos = false;
2071 } else if (type == LLAMA_VOCAB_TYPE_WPM) {
2072 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2073 add_space_prefix = false;
2074 clean_spaces = true;
2075 add_bos = true;
2076 add_eos = false;
2077 add_sep = true;
2078 } else if (type == LLAMA_VOCAB_TYPE_UGM) {
2079 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2080 add_bos = false;
2081 add_eos = true;
2082 } else if (type == LLAMA_VOCAB_TYPE_RWKV) {
2083 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2084 add_space_prefix = false;
2085 clean_spaces = false;
2086 add_bos = false;
2087 add_eos = false;
2088 } else {
2089 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2090 }
2091
2092 ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, add_space_prefix, false);
2093 ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
2094 }
2095
2096 const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
2097 if (token_idx == -1) {
2098 throw std::runtime_error("cannot find tokenizer vocab in model file\n");
2099 }
2100
2101 const float * scores = nullptr;
2102 const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
2103 if (score_idx != -1) {
2104 scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
2105 }
2106
2107 const int * toktypes = nullptr;
2108 const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
2109 if (toktype_idx != -1) {
2110 toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
2111 }
2112
2113 uint32_t n_tokens = gguf_get_arr_n(ctx, token_idx);
2114 id_to_token.resize(n_tokens);
2115
2116 for (uint32_t i = 0; i < n_tokens; i++) {
2117 std::string word = gguf_get_arr_str(ctx, token_idx, i);
2118 if (word.empty()) {
2119 LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i);
2120 word = "[EMPTY_" + std::to_string(i) + "]";
2121 }
2122
2123 token_to_id[word] = i;
2124 max_token_len = std::max(max_token_len, (int) word.size());
2125
2126 auto & token_data = id_to_token[i];
2127 token_data.text = std::move(word);
2128 token_data.score = scores ? scores[i] : 0.0f;
2129 token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
2130
2131 if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
2132 switch(toktypes[i]) {
2133 case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
2134 case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
2135 case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
2136 case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
2137 case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
2138 case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
2139 case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
2140 default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
2141 }
2142 }
2143 }
2144 GGML_ASSERT(id_to_token.size() == token_to_id.size());
2145
2146 init_tokenizer(type);
2147
2148 // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
2149 if (type == LLAMA_VOCAB_TYPE_SPM) {
2150 try {
2151 linefeed_id = vocab.byte_to_token('\n');
2152 } catch (const std::exception & e) {
2153 LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
2154 linefeed_id = special_pad_id;
2155 }
2156 } else if (type == LLAMA_VOCAB_TYPE_WPM) {
2157 linefeed_id = special_pad_id;
2158 } else if (type == LLAMA_VOCAB_TYPE_RWKV) {
2159 const std::vector<int> ids = tokenize("\n", false);
2160 GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
2161 linefeed_id = ids[0];
2162 } else {
2163 const std::vector<int> ids = tokenize("\n", false);
2164
2165 //GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
2166 if (ids.empty()) {
2167 LLAMA_LOG_WARN("%s: model vocab missing newline token, using special_pad_id instead\n", __func__);
2168 linefeed_id = special_pad_id;
2169 } else {
2170 linefeed_id = ids[0];
2171 }
2172 }
2173
2174 // special tokens
2175 {
2176 const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
2177 { LLM_KV_TOKENIZER_BOS_ID, special_bos_id },
2178 { LLM_KV_TOKENIZER_EOS_ID, special_eos_id },
2179 { LLM_KV_TOKENIZER_EOT_ID, special_eot_id },
2180 { LLM_KV_TOKENIZER_EOM_ID, special_eom_id },
2181 { LLM_KV_TOKENIZER_UNK_ID, special_unk_id },
2182 { LLM_KV_TOKENIZER_SEP_ID, special_sep_id },
2183 { LLM_KV_TOKENIZER_PAD_ID, special_pad_id },
2184 { LLM_KV_TOKENIZER_MASK_ID, special_mask_id },
2185 { LLM_KV_TOKENIZER_FIM_PRE_ID, special_fim_pre_id },
2186 { LLM_KV_TOKENIZER_FIM_SUF_ID, special_fim_suf_id },
2187 { LLM_KV_TOKENIZER_FIM_MID_ID, special_fim_mid_id },
2188 { LLM_KV_TOKENIZER_FIM_PAD_ID, special_fim_pad_id },
2189 { LLM_KV_TOKENIZER_FIM_REP_ID, special_fim_rep_id },
2190 { LLM_KV_TOKENIZER_FIM_SEP_ID, special_fim_sep_id },
2191
2192 // deprecated
2193 { LLM_KV_TOKENIZER_PREFIX_ID, special_fim_pre_id },
2194 { LLM_KV_TOKENIZER_SUFFIX_ID, special_fim_suf_id },
2195 { LLM_KV_TOKENIZER_MIDDLE_ID, special_fim_mid_id },
2196 };
2197
2198 for (const auto & it : special_token_types) {
2199 const std::string & key = kv(std::get<0>(it));
2200 int32_t & id = std::get<1>(it);
2201
2202 uint32_t new_id;
2203 if (!ml.get_key(std::get<0>(it), new_id, false)) {
2204 continue;
2205 }
2206 if (new_id >= id_to_token.size()) {
2207 LLAMA_LOG_WARN("%s: bad special token: '%s' = %u, using default id %d\n",
2208 __func__, key.c_str(), new_id, id);
2209 } else {
2210 id = new_id;
2211 }
2212 }
2213
2214 // Handle add_bos, add_eos and add_sep
2215 {
2216 bool temp = true;
2217
2218 if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
2219 add_bos = temp;
2220 }
2221 if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
2222 add_eos = temp;
2223 }
2224 if (ml.get_key(LLM_KV_TOKENIZER_ADD_SEP, temp, false)) {
2225 add_sep = temp;
2226 }
2227 }
2228
2229 // auto-detect special tokens by text
2230 // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
2231 // for now, we apply this workaround to find the tokens based on their text
2232
2233 for (const auto & t : token_to_id) {
2234 auto & attr = id_to_token[t.second].attr;
2235
2236 // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
2237 if (special_eot_id == LLAMA_TOKEN_NULL) {
2238 if (false
2239 || t.first == "<|eot_id|>"
2240 || t.first == "<|im_end|>"
2241 || t.first == "<|end|>"
2242 || t.first == "<end_of_turn>"
2243 || t.first == "<|endoftext|>"
2244 || t.first == "<|end_of_text|>" // granite
2245 || t.first == "<EOT>"
2246 || t.first == "_<EOT>"
2247 || t.first == "[EOT]" // Kimi-K2
2248 || t.first == "<|end▁of▁sentence|>" // DeepSeek
2249 || t.first == "<end_of_utterance>" // smoldocling
2250 ) {
2251 special_eot_id = t.second;
2252 if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2253 LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2254 __func__, t.second, t.first.c_str());
2255 attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
2256 }
2257 }
2258 }
2259
2260 // find EOM token: "<|eom_id|>"
2261 if (special_eom_id == LLAMA_TOKEN_NULL) {
2262 if (false
2263 || t.first == "<|eom_id|>"
2264 ) {
2265 special_eom_id = t.second;
2266 if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2267 LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2268 __func__, t.second, t.first.c_str());
2269 attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
2270 }
2271 }
2272 }
2273
2274 // find FIM_PRE token: "<|fim_prefix|>", "<fim-prefix>", "<PRE>", etc.
2275 if (special_fim_pre_id == LLAMA_TOKEN_NULL) {
2276 if (false
2277 || t.first == "<|fim_prefix|>" // Qwen
2278 || t.first == "<fim-prefix>"
2279 || t.first == "<fim_prefix>" // Granite
2280 || t.first == "<|fim▁begin|>" // DeepSeek
2281 || t.first == "<PRE>"
2282 || t.first == "▁<PRE>" // CodeLlama
2283 || t.first == "<|code_prefix|>" // GLM-4.5
2284 || t.first == "<|prefix|>" // Falcon-H1-Tiny-Coder
2285 ) {
2286 special_fim_pre_id = t.second;
2287 if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2288 LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2289 __func__, t.second, t.first.c_str());
2290 attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
2291 }
2292 }
2293 }
2294
2295 // find FIM_SUF token: "<|fim_suffix|>", "<fim-suffix>", "<SUF>", etc.
2296 if (special_fim_suf_id == LLAMA_TOKEN_NULL) {
2297 if (false
2298 || t.first == "<|fim_suffix|>" // Qwen
2299 || t.first == "<fim-suffix>"
2300 || t.first == "<fim_suffix>" // Granite
2301 || t.first == "<|fim▁hole|>" // DeepSeek
2302 || t.first == "<SUF>"
2303 || t.first == "▁<SUF>" // CodeLlama
2304 || t.first == "<|code_suffix|>" // GLM-4.5
2305 || t.first == "<|suffix|>" // Falcon-H1-Tiny-Coder
2306 ) {
2307 special_fim_suf_id = t.second;
2308 if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2309 LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2310 __func__, t.second, t.first.c_str());
2311 attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
2312 }
2313 }
2314 }
2315
2316 // find FIM_MID token: "<|fim_middle|>", "<fim-middle>", "<MID>", etc.
2317 if (special_fim_mid_id == LLAMA_TOKEN_NULL) {
2318 if (false
2319 || t.first == "<|fim_middle|>" // Qwen
2320 || t.first == "<fim-middle>"
2321 || t.first == "<fim_middle>" // Granite
2322 || t.first == "<|fim▁end|>" // DeepSeek
2323 || t.first == "<MID>"
2324 || t.first == "▁<MID>" // CodeLlama
2325 || t.first == "<|code_middle|>" // GLM-4.5
2326 || t.first == "<|middle|>" // Falcon-H1-Tiny-Coder
2327 ) {
2328 special_fim_mid_id = t.second;
2329 if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2330 LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2331 __func__, t.second, t.first.c_str());
2332 attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
2333 }
2334 }
2335 }
2336
2337 // find FIM_PAD token: "<|fim_pad|>", "<fim-pad>", "<PAD>", etc.
2338 if (special_fim_pad_id == LLAMA_TOKEN_NULL) {
2339 if (false
2340 || t.first == "<|fim_pad|>" // Qwen
2341 || t.first == "<fim-pad>"
2342 || t.first == "<fim_pad>" // Granite
2343 || t.first == "<PAD>"
2344 || t.first == "[PAD]" // Kimi-K2
2345 ) {
2346 special_fim_pad_id = t.second;
2347 if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2348 LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2349 __func__, t.second, t.first.c_str());
2350 attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
2351 }
2352 }
2353 }
2354
2355 // find FIM_REP token: "<|fim_repo|>", "<fim-repo>", "<REP>", etc.
2356 if (special_fim_rep_id == LLAMA_TOKEN_NULL) {
2357 if (false
2358 || t.first == "<|fim_repo|>" // Qwen
2359 || t.first == "<|repo_name|>"
2360 || t.first == "<fim-repo>"
2361 || t.first == "<REPO>"
2362 || t.first == "<reponame>" // Granite
2363 ) {
2364 special_fim_rep_id = t.second;
2365 if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2366 LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2367 __func__, t.second, t.first.c_str());
2368 attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
2369 }
2370 }
2371 }
2372
2373 // find FIM_SEP token: "<|file_sep|>"
2374 if (special_fim_sep_id == LLAMA_TOKEN_NULL) {
2375 if (false
2376 || t.first == "<|file_sep|>" // Qwen
2377 ) {
2378 special_fim_sep_id = t.second;
2379 if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2380 LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2381 __func__, t.second, t.first.c_str());
2382 attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
2383 }
2384 }
2385 }
2386 }
2387
2388 // auto-detect unused tokens: e.g. control tokens with the word "unused"
2389 // ideally, these tokens should be marked as unused during conversion
2390 {
2391 uint32_t n_unused = 0;
2392
2393 for (const auto & t : token_to_id) {
2394 auto & attr = id_to_token[t.second].attr;
2395
2396 if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2397 continue;
2398 }
2399
2400 if ((attr & LLAMA_TOKEN_ATTR_UNUSED) == 0) {
2401 if (strstr(t.first.c_str(), "unused") != NULL) {
2402 attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_UNUSED);
2403 }
2404 }
2405
2406 if (attr & LLAMA_TOKEN_ATTR_UNUSED) {
2407 n_unused++;
2408 }
2409 }
2410
2411 LLAMA_LOG_INFO("%s: %u unused tokens\n", __func__, n_unused);
2412 }
2413
2414 // maintain a list of tokens that cause end-of-generation
2415 // this is currently determined based on the token text, which is obviously not ideal
2416 // ref: https://github.com/ggml-org/llama.cpp/issues/9606
2417 special_eog_ids.clear();
2418
2419 if (special_fim_pad_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_pad_id) == 0) {
2420 special_eog_ids.insert(special_fim_pad_id);
2421 }
2422
2423 if (special_fim_rep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_rep_id) == 0) {
2424 special_eog_ids.insert(special_fim_rep_id);
2425 }
2426
2427 if (special_fim_sep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_sep_id) == 0) {
2428 special_eog_ids.insert(special_fim_sep_id);
2429 }
2430
2431 for (const auto & t : token_to_id) {
2432 auto & attr = id_to_token[t.second].attr;
2433
2434 if (false
2435 || t.first == "<|eot_id|>"
2436 || t.first == "<|im_end|>"
2437 || t.first == "<|end|>"
2438 || t.first == "<|return|>" // o200k_harmony
2439 || t.first == "<|call|>" // o200k_harmony
2440 || t.first == "<|flush|>" // solar-open
2441 || t.first == "<|calls|>" // solar-open
2442 || t.first == "<end_of_turn>"
2443 || t.first == "<|endoftext|>"
2444 || t.first == "<|eom_id|>"
2445 || t.first == "<EOT>"
2446 || t.first == "_<EOT>"
2447 || t.first == "[EOT]" // Kimi-K2
2448 || t.first == "[EOS]" // Kimi-K2
2449 || t.first == "<|end_of_text|>"
2450 || t.first == "<end_of_utterance>" // smoldocling
2451 ) {
2452 special_eog_ids.insert(t.second);
2453 if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2454 LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2455 __func__, t.second, t.first.c_str());
2456 attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
2457 }
2458 } else {
2459 if (attr & LLAMA_TOKEN_ATTR_CONTROL && !(attr & LLAMA_TOKEN_ATTR_UNUSED)) {
2460 // token is control, but not marked as EOG -> print a debug log
2461 if (special_eog_ids.count(t.second) == 0) {
2462 LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
2463 __func__, t.second, t.first.c_str());
2464 }
2465 }
2466 }
2467 }
2468
2469 // @ngxson : quick hack for gpt-oss, always render these tokens
2470 for (const auto & t : token_to_id) {
2471 auto & attr = id_to_token[t.second].attr;
2472
2473 if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
2474 LLAMA_LOG_WARN("%s: setting token '%s' (%d) attribute to USER_DEFINED (%u), old attributes: %u\n",
2475 __func__, t.first.c_str(), t.second, LLAMA_TOKEN_ATTR_USER_DEFINED, attr);
2476
2477 attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
2478 }
2479 }
2480
2481 // sanity checks
2482 if (special_eos_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eos_id) == 0) {
2483 special_eog_ids.insert(special_eos_id);
2484 LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
2485 }
2486
2487 if (special_eot_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eot_id) == 0) {
2488 special_eog_ids.insert(special_eot_id);
2489 LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
2490 }
2491
2492 if (special_eom_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eom_id) == 0) {
2493 special_eog_ids.insert(special_eom_id);
2494 LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
2495 }
2496
2497 // TODO: workaround for o200k_harmony and solar-open tokenizer: the "<|end|>" token should not be EOG
2498 // we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens ("<|calls|>" and "<|flush|>" for solar-open),
2499 // we remove the "<|end|>" token from the EOG list
2500 {
2501 bool has_return = false;
2502 bool has_call = false;
2503 bool has_end = false;
2504 bool has_flush = false;
2505
2506 llama_token end_id = LLAMA_TOKEN_NULL;
2507
2508 LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__);
2509 for (auto tid : special_eog_ids) {
2510 auto & text = id_to_token[tid].text;
2511
2512 LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, text.c_str());
2513
2514 if (text == "<|return|>") {
2515 has_return = true;
2516 } else if (text == "<|call|>" || text == "<|calls|>") {
2517 has_call = true;
2518 } else if (text == "<|flush|>") {
2519 has_flush = true;
2520 } else if (text == "<|end|>") {
2521 has_end = true;
2522 end_id = tid;
2523 }
2524 }
2525
2526 if ((has_return && has_call && has_end) || (has_call && has_flush && has_end)) {
2527 special_eog_ids.erase(end_id);
2528
2529 auto & attr = id_to_token[end_id].attr;
2530 attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
2531
2532 LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>', or '<|calls|>' and '<|flush|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
2533 }
2534 }
2535 }
2536
2537 // build special tokens cache
2538 {
2539 for (llama_token id = 0; id < (llama_token) n_tokens; ++id) {
2540 if (id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) {
2541 cache_special_tokens.push_back(id);
2542 }
2543 }
2544
2545 std::sort(cache_special_tokens.begin(), cache_special_tokens.end(),
2546 [&] (const llama_token a, const llama_token b) {
2547 return id_to_token[a].text.size() > id_to_token[b].text.size();
2548 }
2549 );
2550
2551 LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t) cache_special_tokens.size());
2552 }
2553
2554 // build token to piece cache
2555 {
2556 size_t size_cache = 0;
2557
2558 std::vector<std::string> cache(n_tokens);
2559
2560 for (uint32_t id = 0; id < n_tokens; ++id) {
2561 cache[id] = token_to_piece_for_cache(id, true);
2562
2563 size_cache += cache[id].size();
2564 }
2565
2566 std::swap(cache_token_to_piece, cache);
2567
2568 LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
2569 }
2570
2571 // Handle per token attributes
2572 //NOTE: Each model customizes per token attributes.
2573 //NOTE: Per token attributes are missing from the GGUF file.
2574 //TODO: Extract attributes from GGUF file.
2575 {
2576 auto _contains_any = [] (const std::string & str, const std::vector<std::string_view> & substrs) -> bool {
2577 for (const auto & substr : substrs) {
2578 if (str.find(substr) != std::string::npos) {
2579 return true;
2580 }
2581 }
2582 return false;
2583 };
2584
2585 auto _set_tokenid_attr = [&] (const llama_token id, llama_token_attr attr, bool value) {
2586 uint32_t current = id_to_token.at(id).attr;
2587 current = value ? (current | attr) : (current & ~attr);
2588 id_to_token[id].attr = (llama_token_attr) current;
2589 };
2590
2591 auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
2592 _set_tokenid_attr(token_to_id.at(token), attr, value);
2593 };
2594
2595 std::string model_name;
2596 std::string tokenizer_pre;
2597 std::string general_arch;
2598
2599 ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
2600 ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
2601 ml.get_key(LLM_KV_GENERAL_ARCHITECTURE, general_arch, false);
2602
2603 // model name to lowercase
2604 std::transform(model_name.begin(), model_name.end(), model_name.begin(),
2605 [] (const std::string::value_type x) {
2606 return std::tolower(x);
2607 }
2608 );
2609
2610 // set attributes by model/tokenizer/architecture name
2611 if (false
2612 || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
2613 || _contains_any(general_arch, {"nomic-bert-moe", "jina-bert-v3"})
2614 ) {
2615 if (token_to_id.count("<mask>") == 0) {
2616 LLAMA_LOG_WARN("%s: Mask token is missing in vocab, please reconvert model!\n", __func__);
2617 } else {
2618 _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
2619 }
2620 } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
2621 for (auto id : cache_special_tokens) {
2622 _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
2623 }
2624 for (const auto * token : {"</s>"}) {
2625 _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
2626 }
2627 for (const auto * token : {"<unk>", "<s>", "<|endoftext|>"}) {
2628 _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
2629 }
2630 } else if (_contains_any(model_name, {"modern-bert"})) {
2631 if (token_to_id.count("[MASK]") == 0 ) {
2632 LLAMA_LOG_WARN("%s: Mask token missing in vocab!\n", __func__);
2633 }
2634 else {
2635 _set_token_attr("[MASK]", LLAMA_TOKEN_ATTR_LSTRIP, true);
2636 }
2637 }
2638 }
2639}
2640
// Returns the vocabulary type (SPM, BPE, WPM, UGM, RWKV, PLaMo2, ...) that was
// determined while loading the tokenizer metadata from the GGUF file.
enum llama_vocab_type llama_vocab::impl::get_type() const {
    return type;
}
2644
2645std::string llama_vocab::impl::type_name() const{
2646 switch (type) {
2647 case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
2648 case LLAMA_VOCAB_TYPE_SPM: return "SPM";
2649 case LLAMA_VOCAB_TYPE_BPE: return "BPE";
2650 case LLAMA_VOCAB_TYPE_WPM: return "WPM";
2651 case LLAMA_VOCAB_TYPE_UGM: return "UGM";
2652 case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
2653 case LLAMA_VOCAB_TYPE_PLAMO2: return "PLaMo2";
2654 default: return "unknown";
2655 }
2656}
2657
2658bool llama_vocab::impl::is_normal(llama_token id) const {
2659 GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2660 return id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
2661}
2662
2663bool llama_vocab::impl::is_unknown(llama_token id) const {
2664 GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2665 return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
2666}
2667
2668bool llama_vocab::impl::is_control(llama_token id) const {
2669 GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2670 return id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
2671}
2672
2673bool llama_vocab::impl::is_byte(llama_token id) const {
2674 GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2675 return id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
2676}
2677
2678bool llama_vocab::impl::is_user_defined(llama_token id) const {
2679 GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2680 return id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
2681}
2682
2683bool llama_vocab::impl::is_unused(llama_token id) const {
2684 GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2685 return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNUSED;
2686}
2687
2688bool llama_vocab::impl::is_eog(llama_token id) const {
2689 return id != LLAMA_TOKEN_NULL && special_eog_ids.count(id) > 0;
2690}
2691
2692uint8_t llama_vocab::impl::token_to_byte(llama_token id) const {
2693 GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE);
2694 GGML_ASSERT(is_byte(id));
2695 const auto & token_data = id_to_token.at(id);
2696 switch (get_type()) {
2697 case LLAMA_VOCAB_TYPE_SPM:
2698 case LLAMA_VOCAB_TYPE_UGM: {
2699 auto buf = token_data.text.substr(3, 2);
2700 return strtol(buf.c_str(), NULL, 16);
2701 }
2702 case LLAMA_VOCAB_TYPE_BPE: {
2703 GGML_ABORT("fatal error");
2704 }
2705 case LLAMA_VOCAB_TYPE_WPM: {
2706 GGML_ABORT("fatal error");
2707 }
2708 default:
2709 GGML_ABORT("fatal error");
2710 }
2711}
2712
// Returns the attribute bitmask (control/byte/user-defined/...) of the given
// token; out-of-range ids throw via std::vector::at.
llama_token_attr llama_vocab::impl::token_get_attr(llama_token id) const {
    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
    return id_to_token.at(id).attr;
}
2717
// Instantiate the tokenizer implementation matching the vocab type and store
// it in `tokenizer`. Aborts on unsupported/none vocab types. Must be called
// before tokenize()/detokenize().
void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) {
    LLAMA_LOG_DEBUG("%s: initializing tokenizer for type %d\n", __func__, type);

    switch (type) {
        case LLAMA_VOCAB_TYPE_SPM:
            tokenizer = std::make_unique<llm_tokenizer_spm>(vocab);
            break;
        case LLAMA_VOCAB_TYPE_BPE:
            tokenizer = std::make_unique<llm_tokenizer_bpe>(vocab);
            break;
        case LLAMA_VOCAB_TYPE_WPM:
            tokenizer = std::make_unique<llm_tokenizer_wpm>(vocab);
            break;
        case LLAMA_VOCAB_TYPE_UGM:
            // UGM additionally needs the precompiled charsmap (normalization data)
            tokenizer = std::make_unique<llm_tokenizer_ugm>(vocab, precompiled_charsmap);
            break;
        case LLAMA_VOCAB_TYPE_RWKV:
            tokenizer = std::make_unique<llm_tokenizer_rwkv>(vocab);
            break;
        case LLAMA_VOCAB_TYPE_PLAMO2:
            tokenizer = std::make_unique<llm_tokenizer_plamo2>(vocab);
            break;
        default:
            GGML_ABORT("unsupported vocab type");
    }
}
2744
2745//
2746// (de-) tokenize
2747//
2748
2749// #define PRETOKENIZERDEBUG
2750
// Partition the fragment buffer around special tokens.
//
// For every cached special token, each RAW_TEXT fragment is scanned for
// occurrences of the token's text and split in place: optional left text,
// a TOKEN fragment for the match, optional right text. The right remainder
// is then re-scanned for further matches of the same special token.
// When parse_special == false, CONTROL/UNKNOWN special tokens are skipped.
void llama_vocab::impl::tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const {
    // for each special token
    for (const llama_token special_id : cache_special_tokens) {
        const auto & data = vocab.get_token_data(special_id);
        const auto & text = data.text;

        if (!parse_special && (data.attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_UNKNOWN))) {
            // Ignore control and unknown tokens when parse_special == false
            continue;
            // User-defined tokens are still pre-tokenized before everything else
            // ref: https://github.com/huggingface/tokenizers/blob/fdd26ba9a3f0c133427aab0423888cbde91362d7/tokenizers/src/tokenizer/mod.rs#L726
            // This is mostly relevant for neox-style tokenizers (mpt, olmo, stablelm, etc.)
        }

        // for each text fragment
        std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
        while (it != buffer.end()) {
            auto & fragment = (*it);

            // if a fragment is text ( not yet processed )
            if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                const auto & raw_text = fragment.raw_text;

                auto raw_text_base_offset = fragment.offset;
                auto raw_text_base_length = fragment.length;

                // loop over the text
                while (true) {
                    // find the first occurrence of a given special token in this fragment
                    // passing offset argument only limit the "search area" but match coordinates
                    // are still relative to the source full raw_text
                    // string_view begins at pos 0 for the same reason
                    auto match = std::string_view(raw_text.data(), raw_text_base_offset + raw_text_base_length).find(text, raw_text_base_offset);

                    // no occurrences found, stop processing this fragment for a given special token
                    if (match == std::string::npos) break;

#ifdef PRETOKENIZERDEBUG
                    LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
#endif
                    // index of the fragment being split — kept as an index because
                    // forward_list can only erase through the predecessor iterator
                    auto source = std::distance(buffer.begin(), it);

                    // if match is further than base offset
                    // then we have some text to the left of it
                    if (match > raw_text_base_offset) {
                        // left
                        const int64_t left_reminder_offset = raw_text_base_offset + 0;
                        int64_t left_reminder_length = match - raw_text_base_offset;

                        // LSTRIP: whitespace preceding the special token is absorbed by it
                        if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
                            while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
                                left_reminder_length--;
                            }
                        }

                        if (left_reminder_length > 0) {
                            buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
                            it++;
                        }

#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
#endif
                    }

                    // special token
                    buffer.emplace_after(it, special_id);
                    it++;

                    // right
                    if (match + text.length() < raw_text_base_offset + raw_text_base_length) {
                        int64_t right_reminder_offset = match + text.length();
                        int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + text.length());

                        // RSTRIP: whitespace following the special token is absorbed by it
                        if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
                            while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
                                right_reminder_offset++;
                                right_reminder_length--;
                            }
                        }

                        if (right_reminder_length > 0) {
                            buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
                            it++;
                        }

#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
#endif

                        // drop the original (now fully split) fragment
                        if (source == 0) {
                            buffer.erase_after(buffer.before_begin());
                        } else {
                            buffer.erase_after(std::next(buffer.begin(), (source - 1)));
                        }

                        // repeat for the right side
                        raw_text_base_offset = right_reminder_offset;
                        raw_text_base_length = right_reminder_length;

#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
#endif
                    } else {
                        // no right remainder — drop the original fragment and move on
                        if (source == 0) {
                            buffer.erase_after(buffer.before_begin());
                        } else {
                            buffer.erase_after(std::next(buffer.begin(), (source - 1)));
                        }
                        break;
                    }
                }
            }
            it++;
        }
    }
}
2868
2869// NOTE: avoid ever using this except for building the token_to_piece caches
2870std::string llama_vocab::impl::token_to_piece_for_cache(llama_token token, bool special) const {
2871 std::string piece;
2872 piece.resize(piece.capacity()); // using string internal cache
2873 const int n_chars = vocab.token_to_piece(token, &piece[0], piece.size(), 0, special);
2874 if (n_chars < 0) {
2875 piece.resize(-n_chars);
2876 int check = vocab.token_to_piece(token, &piece[0], piece.size(), 0, special);
2877 GGML_ASSERT(check == -n_chars);
2878 }
2879 else {
2880 piece.resize(n_chars);
2881 }
2882
2883 return piece;
2884}
2885
// SPM marks spaces with U+2581 LOWER ONE EIGHTH BLOCK ("▁", UTF-8 e2 96 81)
static void llama_escape_whitespace(std::string & text) {
    replace_all(text, " ", "\xe2\x96\x81");
}

// inverse of llama_escape_whitespace
static void llama_unescape_whitespace(std::string & word) {
    replace_all(word, "\xe2\x96\x81", " ");
}
2893
// Map a BPE token's text back to raw bytes: each unicode codepoint is looked
// up in the utf8->byte table; codepoints with no byte mapping are rendered as
// a "[UNK_BYTE_0x<hex>...]" marker instead.
// NOTE(review): the marker appends the *entire* input `text` after the hex
// digits (not just the offending codepoint) — presumably intentional debug
// output; confirm before changing.
static std::string llama_decode_text(const std::string & text) {
    std::string decoded_text;

    const auto cpts = unicode_cpts_from_utf8(text);
    for (const auto cpt : cpts) {
        const auto utf8 = unicode_cpt_to_utf8(cpt);
        try {
            decoded_text += unicode_utf8_to_byte(utf8);
        } catch (const std::out_of_range & /*e*/) {
            decoded_text += "[UNK_BYTE_0x";
            for (const auto c : utf8) {
                decoded_text += format("%02x", (uint8_t) c);
            }
            decoded_text += text + "]";
        }
    }

    return decoded_text;
}
2913
// Tokenize `raw_text` according to the vocab type.
//
// The text is first partitioned around special tokens (tokenizer_st_partition),
// then every remaining RAW_TEXT fragment is fed to a per-call tokenizer
// session while TOKEN fragments are emitted directly. BOS/EOS/SEP framing is
// added when `add_special` is set and the model enables it.
std::vector<llama_token> llama_vocab::impl::tokenize(
        const std::string & raw_text,
        bool add_special,
        bool parse_special) const {
    GGML_ASSERT(tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");

    std::vector<llama_token> output;
    std::forward_list<fragment_buffer_variant> fragment_buffer;

    if (!raw_text.empty()) {
        fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
        tokenizer_st_partition(fragment_buffer, parse_special);
    }

    switch (get_type()) {
        case LLAMA_VOCAB_TYPE_SPM:
            {
                // OG tokenizer behavior:
                //
                // tokenizer.encode('', add_special_tokens=True) returns [1]
                // tokenizer.encode('', add_special_tokens=False) returns []

                bool is_prev_special = true; // prefix with space if first token

                if (add_special && add_bos) {
                    GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
                    output.push_back(special_bos_id);
                    is_prev_special = true;
                }

                for (const auto & fragment : fragment_buffer) {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                        std::string text;

                        // prefix with space if previous is special
                        if (add_space_prefix && is_prev_special) {
                            text = ' ';
                        }

                        text += fragment.raw_text.substr(fragment.offset, fragment.length);

#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif
                        // SPM encodes spaces as U+2581 before tokenizing
                        llama_escape_whitespace(text);
                        llm_tokenizer_spm_session session(vocab);
                        session.tokenize(text, output);
                        is_prev_special = false;
                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                        output.push_back(fragment.token);
                        is_prev_special = true;
                    }
                }

                // warn when the prompt itself already began with BOS
                if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) {
                    LLAMA_LOG_WARN(
                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
                        "Are you sure this is what you want?\n", __FUNCTION__);
                }

                if (add_special && add_eos) {
                    GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL);
                    output.push_back(special_eos_id);
                }
            } break;
        case LLAMA_VOCAB_TYPE_BPE:
            {
                llm_tokenizer_bpe_session session(vocab, *static_cast<const llm_tokenizer_bpe *>(tokenizer.get()));
                // it calls some other methods that are not exist in llm_tokenizer,
                // here just cast it to bpe tokenizer object
                if (add_special) {
                    session.append_bos(output);
                }
                for (const auto & fragment : fragment_buffer) {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);

#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif
                        session.tokenize(text, output);
                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                        session.append(fragment.token, output);
                    }
                }

                if (add_special) {
                    session.append_eos(output);
                    session.check_double_bos_eos(output);
                }
            } break;
        case LLAMA_VOCAB_TYPE_WPM:
            {
                // WPM (BERT-style) framing: [BOS] ... [SEP]
                if (add_special) {
                    GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
                    output.push_back(special_bos_id);
                }

                llm_tokenizer_wpm_session session(vocab);

                for (const auto & fragment : fragment_buffer) {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);

#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif
                        session.tokenize(text, output);
                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                        output.push_back(fragment.token);
                    }
                }

                if (add_special) {
                    GGML_ASSERT(special_sep_id != LLAMA_TOKEN_NULL);
                    output.push_back(special_sep_id);
                }
            } break;
        case LLAMA_VOCAB_TYPE_UGM:
            {
                if (add_special && add_bos) {
                    GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
                    output.push_back(special_bos_id);
                }
                llm_tokenizer_ugm_session session(vocab, *static_cast<const llm_tokenizer_ugm *>(tokenizer.get()));

                for (const auto & fragment : fragment_buffer) {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif
                        session.tokenize(text, output);
                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                        output.push_back(fragment.token);
                    }
                }

                // warn when the prompt itself already began with BOS
                if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) {
                    LLAMA_LOG_WARN(
                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
                        "Are you sure this is what you want?\n", __FUNCTION__);
                }

                if (add_special && add_eos) {
                    GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL);
                    output.push_back(special_eos_id);
                }
            } break;
        case LLAMA_VOCAB_TYPE_RWKV:
            {
                // RWKV adds no special framing tokens
                llm_tokenizer_rwkv_session session(vocab, *static_cast<const llm_tokenizer_rwkv *>(tokenizer.get()));
                for (const auto & fragment : fragment_buffer) {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);

#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif

                        session.tokenize(text, output);
                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                        output.push_back(fragment.token);
                    }
                }
            } break;
        case LLAMA_VOCAB_TYPE_PLAMO2:
            {
                // PLaMo-2 adds no special framing tokens either
                llm_tokenizer_plamo2_session session(*static_cast<const llm_tokenizer_plamo2 *>(tokenizer.get()));
                for (const auto & fragment : fragment_buffer) {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);

#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif

                        session.tokenize(text, output);
                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                        output.push_back(fragment.token);
                    }
                }
            } break;
        case LLAMA_VOCAB_TYPE_NONE:
            GGML_ABORT("fatal error");
    }

    return output;
}
3105
// Render a single token into `buf` (capacity `length`, no NUL terminator).
//
// Returns the number of bytes written, -(needed size) when the buffer is too
// small, or 0 for suppressed special tokens / out-of-range ids. Up to
// `lstrip` leading spaces of the piece are skipped before copying.
int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const {
    // ref: https://github.com/ggml-org/llama.cpp/pull/7587#discussion_r1620983843
    static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
    const llama_token_attr attr = token_get_attr(token);
    if (!special && (attr & attr_special)) {
        // special tokens are suppressed unless explicitly requested
        return 0;
    }

    // copy piece chars to output text buffer
    // skip up to 'lstrip' leading spaces before copying
    auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
        if (size >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
            GGML_ABORT("invalid token size: %zu exceeds int32_t limit", size);
        }

        for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
            token++;
            size--;
        }
        if (length < (int32_t)size) {
            // not enough room: report the required size as a negative count
            return -(int32_t) size;
        }
        memcpy(buf, token, size);
        return (int32_t) size;
    };

    // if we have a cache - use it
    {
        const auto & cache = cache_token_to_piece;

        if (!cache.empty()) {
            const auto & result = cache.at(token);
            return _try_copy(result.data(), result.size());
        }
    }

    // slow path (cache not built yet): decode from the raw token text
    if (0 <= token && token < (int32_t) id_to_token.size()) {
        const std::string & token_text = id_to_token[token].text;
        switch (get_type()) {
            case LLAMA_VOCAB_TYPE_WPM:
            case LLAMA_VOCAB_TYPE_SPM:
            case LLAMA_VOCAB_TYPE_UGM: {
                // NOTE: we accept all unsupported token types,
                // suppressing them like CONTROL tokens.
                if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
                    return _try_copy(token_text.data(), token_text.size());
                }
                if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
                    // undo the U+2581 space escaping used by SPM-style vocabs
                    std::string result = token_text;
                    llama_unescape_whitespace(result);
                    return _try_copy(result.data(), result.size());
                }
                if (attr & LLAMA_TOKEN_ATTR_BYTE) {
                    char byte = (char) token_to_byte(token);
                    return _try_copy((char*) &byte, 1);
                }
                break;
            }
            case LLAMA_VOCAB_TYPE_BPE: {
                // NOTE: we accept all unsupported token types,
                // suppressing them like CONTROL tokens.
                if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
                    return _try_copy(token_text.data(), token_text.size());
                }
                if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
                    // BPE stores bytes remapped to printable unicode; decode back
                    std::string result = llama_decode_text(token_text);
                    return _try_copy(result.data(), result.size());
                }
                break;
            }
            case LLAMA_VOCAB_TYPE_RWKV: {
                std::vector<uint8_t> result = llama_unescape_rwkv_token(token_text);

                // If we don't have enough space, return an error
                if (result.size() > (size_t)length) {
                    return -(int)result.size();
                }

                memcpy(buf, result.data(), result.size());
                return (int)result.size();
            }
            case LLAMA_VOCAB_TYPE_PLAMO2: {
                // PLaMo-2 uses similar token handling as BPE/SPM
                if (vocab.is_byte(token)) {
                    // Handle byte tokens like <0xXX>
                    if (token_text.length() == 6 && token_text.substr(0, 3) == "<0x" && token_text.back() == '>') {
                        int hex_val = std::stoi(token_text.substr(3, 2), nullptr, 16);
                        if (length < 1) {
                            return -1;
                        }
                        buf[0] = static_cast<char>(hex_val);
                        return 1;
                    }
                }

                // Normal token - just copy the text
                std::string result = token_text;
                return _try_copy(result.data(), result.size());
            }
            default:
                GGML_ABORT("fatal error");
        }
    }

    return 0;
}
3212
// Cached variant: returns a reference into cache_token_to_piece
// (throws std::out_of_range for unknown ids via .at()).
const std::string & llama_vocab::impl::token_to_piece(llama_token token) const {
    return cache_token_to_piece.at(token);
}
3216
// Detokenize `n_tokens` tokens into `text` (capacity `text_len_max`).
//
// Returns the number of characters written, or -(required size) when the
// buffer is too small. Optionally strips the leading BOS / trailing EOS and,
// for vocabs with clean_spaces, compacts tokenizer-inserted spaces around
// punctuation and apostrophe contractions in place.
int32_t llama_vocab::impl::detokenize(
        const llama_token * tokens,
        int32_t n_tokens,
        char * text,
        int32_t text_len_max,
        bool remove_special,
        bool unparse_special) const {
    if (type == LLAMA_VOCAB_TYPE_NONE) {
        return 0;
    }

    GGML_ASSERT(tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");

    int32_t avail = text_len_max;
    int32_t total = 0;

    // remove the leading space
    bool remove_space = add_space_prefix;

    if (remove_special && add_bos) {
        if (n_tokens > 0 && tokens[0] == special_bos_id) {
            remove_space = false;
            n_tokens--;
            tokens++;
        }
    }

    if (remove_special && add_eos) {
        if (n_tokens > 0 && tokens[n_tokens - 1] == special_eos_id) {
            n_tokens--;
        }
    }

    // render each token; once the buffer is exhausted keep accumulating the
    // required size in `total` (a negative n_chars is the needed byte count)
    for (int32_t i = 0; i < n_tokens; ++i) {
        GGML_ASSERT(avail >= 0);
        int32_t n_chars = token_to_piece(tokens[i], text, avail, remove_space, unparse_special);
        remove_space = false;
        if (n_chars < 0) {
            avail = 0;
            total -= n_chars;
        } else if (n_chars > 0) {
            avail -= n_chars;
            text += n_chars;
            total += n_chars;
        }
    }

    if (total > text_len_max) {
        return -total;
    }

    // in-place compaction: each pass reads at index i and writes at index
    // total <= i, so the passes are safe without extra storage
    if (clean_spaces) {
        text -= total; // restart text

        // first pass: characters ?!., //TODO: where do these characters come from?
        const int32_t total1 = total;
        total = total ? 1 : 0;
        for (int32_t i = 1; i < total1; ++i) {
            const char x = text[i];
            if (text[i - 1] == ' ') {
                if (x == '?' || x == '!' || x == '.' || x == ',') { // " ?", " !", " .", " ,"
                    total--; // remove space
                }
            }
            text[total++] = x;
        }

        // second pass: strip single apostrophe between spaces
        // NOTE(review): the '\0' written at text[++i] lands on a position the
        // loop then skips — looks dead; confirm before touching
        const int32_t total2 = total;
        total = total ? 1 : 0;
        for (int32_t i = 1; i < total2; ++i) {
            const char x = text[i];
            if (x == '\'' && i + 1 < total2 && text[i - 1] == ' ' && text[i + 1] == ' ') { // " ' "
                total--; // remove prev space
                text[++i] = '\0'; // remove next space
            }
            text[total++] = x;
        }

        // third pass: apostrophe contractions //NOTE: this makes sense?
        const int32_t total3 = total;
        total = total ? 1 : 0;
        for (int32_t i = 1; i < total3; ++i) {
            const char x = text[i];
            if (text[i - 1] == ' ') {
                if (x == '\'' && i + 1 < total3) {
                    const char x1 = text[i + 1];
                    if (x1 == 't' || x1 == 'd') { // " 't", " 'd"
                        //total--; // remove space
                    } else if (x1 == 's' || x1 == 'm') { // " 's", " 'm"
                        total--; // remove space
                    } else if (i + 2 < total3) {
                        const char x2 = text[i + 2];
                        if ((x1 == 'l' && x2 == 'l')) { // " 'll"
                            //total--; // remove space
                        } else if ((x1 == 'r' && x2 == 'e') || (x1 == 'v' && x2 == 'e')) { // " 're", " 've"
                            total--; // remove space
                        } else {
                            //total--; // remove space
                        }
                    } else {
                        //total--; // remove space
                    }
                }
            }
            text[total++] = x;
        }
    }

    return total <= text_len_max ? total : -total;
}
3328
// Log a human-readable summary of the vocab: type, sizes, and every special
// token that is configured (BOS/EOS/FIM/EOG/etc.).
void llama_vocab::impl::print_info() const {
    LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str());
    LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, vocab.n_tokens());
    LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size());

    // special tokens
    if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token.at(special_bos_id).text.c_str() ); }
    if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token.at(special_eos_id).text.c_str() ); }
    if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token.at(special_eot_id).text.c_str() ); }
    if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token.at(special_eom_id).text.c_str() ); }
    if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token.at(special_unk_id).text.c_str() ); }
    if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token.at(special_sep_id).text.c_str() ); }
    if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token.at(special_pad_id).text.c_str() ); }
    if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token.at(special_mask_id).text.c_str() ); }

    if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token.at(linefeed_id).text.c_str() ); }

    // fill-in-the-middle tokens
    if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); }
    if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); }
    if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); }
    if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); }
    if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); }
    if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); }

    // all end-of-generation tokens
    for (const auto & id : special_eog_ids) {
        LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() );
    }

    LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
}
3359
// pimpl idiom: all vocab state lives in llama_vocab::impl
llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
}

// defined out-of-line so impl is a complete type at the point of destruction
llama_vocab::~llama_vocab() = default;

// load vocab data (tokens, merges, special ids, flags) from the model file
void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
    pimpl->load(ml, kv);
}
3368
// read-only accessors forwarding to the impl

std::string llama_vocab::get_tokenizer_model() const {
    return pimpl->tokenizer_model;
}

std::string llama_vocab::get_tokenizer_pre() const {
    return pimpl->tokenizer_pre;
}

enum llama_vocab_type llama_vocab::get_type() const {
    return pimpl->type;
}

enum llama_vocab_pre_type llama_vocab::get_pre_type() const {
    return pimpl->pre_type;
}

uint32_t llama_vocab::n_tokens() const {
    return (uint32_t) pimpl->id_to_token.size();
}

uint32_t llama_vocab::n_token_types() const {
    return (uint32_t) pimpl->n_token_types;
}

std::string llama_vocab::type_name() const{
    return pimpl->type_name();
}
3396
// token classification wrappers — attribute checks are implemented in impl

bool llama_vocab::is_normal(llama_token id) const {
    return pimpl->is_normal(id);
}

bool llama_vocab::is_unknown(llama_token id) const {
    return pimpl->is_unknown(id);
}

bool llama_vocab::is_control(llama_token id) const {
    return pimpl->is_control(id);
}

bool llama_vocab::is_byte(llama_token id) const {
    return pimpl->is_byte(id);
}

bool llama_vocab::is_user_defined(llama_token id) const {
    return pimpl->is_user_defined(id);
}

bool llama_vocab::is_unused(llama_token id) const {
    return pimpl->is_unused(id);
}

bool llama_vocab::is_eog(llama_token id) const {
    return pimpl->is_eog(id);
}

uint8_t llama_vocab::token_to_byte(llama_token id) const {
    return pimpl->token_to_byte(id);
}
3428
3429llama_token llama_vocab::byte_to_token(uint8_t ch) const {
3430 GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE);
3431 static const char * hex = "0123456789ABCDEF";
3432 switch (get_type()) {
3433 case LLAMA_VOCAB_TYPE_SPM:
3434 case LLAMA_VOCAB_TYPE_UGM: {
3435 const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
3436 auto token = pimpl->token_to_id.find(buf);
3437 if (token != pimpl->token_to_id.end()) {
3438 return (*token).second;
3439 }
3440 // Try to fall back to just the byte as a string
3441 const char buf2[2] = { (char)ch, 0 };
3442 return pimpl->token_to_id.at(buf2);
3443 }
3444 case LLAMA_VOCAB_TYPE_WPM:
3445 case LLAMA_VOCAB_TYPE_BPE: {
3446 return pimpl->token_to_id.at(unicode_byte_to_utf8(ch));
3447 }
3448 case LLAMA_VOCAB_TYPE_PLAMO2: {
3449 // PLaMo-2 uses byte tokens in format <0xXX>
3450 char hex_str[8];
3451 snprintf(hex_str, sizeof(hex_str), "<0x%02X>", ch);
3452 return pimpl->token_to_id.at(hex_str);
3453 }
3454 default:
3455 GGML_ABORT("fatal error");
3456 }
3457}
3458
3459llama_token llama_vocab::text_to_token(const std::string & text) const {
3460 GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
3461 auto it = pimpl->token_to_id.find(text);
3462 if (it != pimpl->token_to_id.end()) {
3463 return (*it).second;
3464 }
3465 return LLAMA_TOKEN_NULL;
3466}
3467
// per-token data accessors (bounds-checked via .at())

const llama_vocab::token_data & llama_vocab::get_token_data(llama_token id) const {
    GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
    return pimpl->id_to_token.at(id);
}

const char * llama_vocab::token_get_text(llama_token id) const {
    GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
    return pimpl->id_to_token.at(id).text.c_str();
}

float llama_vocab::token_get_score(llama_token id) const {
    GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
    return pimpl->id_to_token.at(id).score;
}

llama_token_attr llama_vocab::token_get_attr(llama_token id) const {
    return pimpl->token_get_attr(id);
}
3486
// special-token id getters; each returns LLAMA_TOKEN_NULL when the model
// does not define the corresponding token

llama_token llama_vocab::token_bos() const {
    return pimpl->special_bos_id;
}

llama_token llama_vocab::token_eos() const {
    return pimpl->special_eos_id;
}

llama_token llama_vocab::token_eot() const {
    return pimpl->special_eot_id;
}

llama_token llama_vocab::token_eom() const {
    return pimpl->special_eom_id;
}

llama_token llama_vocab::token_unk() const {
    return pimpl->special_unk_id;
}

llama_token llama_vocab::token_sep() const {
    return pimpl->special_sep_id;
}

llama_token llama_vocab::token_nl() const {
    return pimpl->linefeed_id;
}

llama_token llama_vocab::token_pad() const {
    return pimpl->special_pad_id;
}

// legacy names: token_prefix/middle/suffix alias the FIM pre/mid/suf ids

llama_token llama_vocab::token_prefix() const {
    return pimpl->special_fim_pre_id;
}

llama_token llama_vocab::token_middle() const {
    return pimpl->special_fim_mid_id;
}

llama_token llama_vocab::token_suffix() const {
    return pimpl->special_fim_suf_id;
}

llama_token llama_vocab::token_fim_pre() const {
    return pimpl->special_fim_pre_id;
}

llama_token llama_vocab::token_fim_suf() const {
    return pimpl->special_fim_suf_id;
}

llama_token llama_vocab::token_fim_mid() const {
    return pimpl->special_fim_mid_id;
}

llama_token llama_vocab::token_fim_pad() const {
    return pimpl->special_fim_pad_id;
}

llama_token llama_vocab::token_fim_rep() const {
    return pimpl->special_fim_rep_id;
}

llama_token llama_vocab::token_fim_sep() const {
    return pimpl->special_fim_sep_id;
}

llama_token llama_vocab::token_mask() const {
    return pimpl->special_mask_id;
}
3558
// tokenizer behavior flags loaded from the model metadata

bool llama_vocab::get_add_space_prefix() const {
    return pimpl->add_space_prefix;
}

bool llama_vocab::get_add_bos() const {
    return pimpl->add_bos;
}

bool llama_vocab::get_add_eos() const {
    return pimpl->add_eos;
}

bool llama_vocab::get_add_sep() const {
    return pimpl->add_sep;
}

bool llama_vocab::get_ignore_merges() const {
    return pimpl->ignore_merges;
}

bool llama_vocab::get_clean_spaces() const {
    return pimpl->clean_spaces;
}

bool llama_vocab::get_remove_extra_whitespaces() const {
    return pimpl->remove_extra_whitespaces;
}

bool llama_vocab::get_escape_whitespaces() const {
    return pimpl->escape_whitespaces;
}

bool llama_vocab::get_treat_whitespace_as_suffix() const {
    return pimpl->treat_whitespace_as_suffix;
}

// longest token text in bytes — used to size tokenizer work buffers
int llama_vocab::max_token_len() const {
    return pimpl->max_token_len;
}
3598
3599int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
3600 GGML_ASSERT(token_left.find(' ') == std::string::npos);
3601 GGML_ASSERT(token_left.find('\n') == std::string::npos);
3602 GGML_ASSERT(token_right.find(' ') == std::string::npos);
3603 GGML_ASSERT(token_right.find('\n') == std::string::npos);
3604
3605 auto it = pimpl->bpe_ranks.find(std::make_pair(token_left, token_right));
3606 if (it == pimpl->bpe_ranks.end()) {
3607 return -1;
3608 }
3609
3610 return it->second;
3611}
3612
3613std::vector<std::string> llama_vocab::get_bpe_merges() const {
3614 std::vector<std::string> result(pimpl->bpe_ranks.size());
3615
3616 for (const auto & pair : pimpl->bpe_ranks) {
3617 result[pair.second] = pair.first.first + " " + pair.first.second;
3618 }
3619
3620 return result;
3621}
3622
// returns a copy of the precompiled charsmap blob (UGM normalization data)
std::vector<char> llama_vocab::get_precompiled_charsmap() const {
    return pimpl->precompiled_charsmap;
}
3626
3627int32_t llama_vocab::tokenize(
3628 const char * text,
3629 int32_t text_len,
3630 llama_token * tokens,
3631 int32_t n_tokens_max,
3632 bool add_special,
3633 bool parse_special) const {
3634 auto res = tokenize(std::string(text, text_len), add_special, parse_special);
3635 if (res.size() >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
3636 LLAMA_LOG_ERROR("%s: tokenization result size %zu exceeds int32_t limit\n", __func__, res.size());
3637 return std::numeric_limits<int32_t>::min();
3638 }
3639
3640 if (n_tokens_max < (int) res.size()) {
3641 // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
3642 return -((int) res.size());
3643 }
3644
3645 for (size_t i = 0; i < res.size(); i++) {
3646 tokens[i] = res[i];
3647 }
3648
3649 return res.size();
3650}
3651
// C++ tokenizer entry point; forwards to the pimpl implementation.
std::vector<llama_token> llama_vocab::tokenize(
        const std::string & raw_text,
                     bool   add_special,
                     bool   parse_special) const {
    return pimpl->tokenize(raw_text, add_special, parse_special);
}
3658
// Cached text piece for a token; the returned reference is owned by the vocab.
const std::string & llama_vocab::token_to_piece(llama_token token) const {
    return pimpl->token_to_piece(token);
}
3662
// C-style piece rendering into a caller-provided buffer; forwards to pimpl.
int32_t llama_vocab::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const {
    return pimpl->token_to_piece(token, buf, length, lstrip, special);
}
3666
// C-style detokenization into a caller-provided buffer; forwards to pimpl.
int32_t llama_vocab::detokenize(
        const llama_token * tokens,
                  int32_t   n_tokens,
                     char * text,
                  int32_t   text_len_max,
                     bool   remove_special,
                     bool   unparse_special) const {
    return pimpl->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
}
3676
// Detokenize into a std::string, growing the buffer on demand (two-pass).
std::string llama_vocab::detokenize(const std::vector<llama_token> & tokens, bool special) const {
    std::string text;
    // first guess: whatever capacity the string already has, but at least one
    // byte per token
    text.resize(std::max(text.capacity(), tokens.size()));
    int32_t n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
    if (n_chars < 0) {
        // buffer was too small: the negative return value encodes the required
        // size, so resize and detokenize again
        text.resize(-n_chars);
        n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
        GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
    }

    // shrink to the actual number of characters produced
    text.resize(n_chars);

    // NOTE: the original tokenizer decodes bytes after collecting the pieces.
    return text;
}
3692
// Log a summary of the vocabulary; forwards to the pimpl implementation.
void llama_vocab::print_info() const {
    pimpl->print_info();
}
3696
3697//
3698// interface implementation
3699//
3700
// Number of tokens in the vocabulary.
int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab) {
    return vocab->n_tokens();
}

// deprecated: use llama_vocab_n_tokens instead
int32_t llama_n_vocab(const struct llama_vocab * vocab) {
    return llama_vocab_n_tokens(vocab);
}

// Tokenizer family of this vocabulary.
enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab) {
    return vocab->get_type();
}

// Stored text of a token.
const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token) {
    return vocab->token_get_text(token);
}

// Score associated with a token.
float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token) {
    return vocab->token_get_score(token);
}

// Attribute flags of a token.
enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token) {
    return vocab->token_get_attr(token);
}

// Whether the vocab considers this token end-of-generation.
bool llama_vocab_is_eog(const struct llama_vocab * vocab, llama_token token) {
    return vocab->is_eog(token);
}

// Whether the vocab considers this token a control token.
bool llama_vocab_is_control(const struct llama_vocab * vocab, llama_token token) {
    return vocab->is_control(token);
}
3733
// Special-token accessors: each forwards to the corresponding llama_vocab
// accessor and returns the token id as loaded from the model.

llama_token llama_vocab_bos(const struct llama_vocab * vocab) {
    return vocab->token_bos();
}

llama_token llama_vocab_eos(const struct llama_vocab * vocab) {
    return vocab->token_eos();
}

llama_token llama_vocab_eot(const struct llama_vocab * vocab) {
    return vocab->token_eot();
}

// deprecated: CLS is mapped to BOS
llama_token llama_vocab_cls(const struct llama_vocab * vocab) {
    return vocab->token_bos();
}

llama_token llama_vocab_sep(const struct llama_vocab * vocab) {
    return vocab->token_sep();
}

llama_token llama_vocab_nl (const struct llama_vocab * vocab) {
    return vocab->token_nl();
}

llama_token llama_vocab_pad(const struct llama_vocab * vocab) {
    return vocab->token_pad();
}

// Whether a BOS token should be added automatically during tokenization.
bool llama_vocab_get_add_bos(const struct llama_vocab * vocab) {
    return vocab->get_add_bos();
}

// Whether an EOS token should be added automatically during tokenization.
bool llama_vocab_get_add_eos(const struct llama_vocab * vocab) {
    return vocab->get_add_eos();
}

// Whether a SEP token should be added automatically during tokenization.
bool llama_vocab_get_add_sep(const struct llama_vocab * vocab) {
    return vocab->get_add_sep();
}
3774
// Fill-in-the-middle (FIM) special-token accessors; each forwards to the
// corresponding llama_vocab accessor.

llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab) {
    return vocab->token_fim_pre();
}

llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab) {
    return vocab->token_fim_suf();
}

llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab) {
    return vocab->token_fim_mid();
}

llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab) {
    return vocab->token_fim_pad();
}

llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab) {
    return vocab->token_fim_rep();
}

llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab) {
    return vocab->token_fim_sep();
}

// Mask token accessor.
llama_token llama_vocab_mask(const struct llama_vocab* vocab) {
    return vocab->token_mask();
}
3802
// deprecated: use llama_vocab_get_text instead
const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token) {
    return llama_vocab_get_text(vocab, token);
}

// deprecated: use llama_vocab_get_score instead
float llama_token_get_score(const struct llama_vocab * vocab, llama_token token) {
    return llama_vocab_get_score(vocab, token);
}

// deprecated: use llama_vocab_get_attr instead
enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token) {
    return llama_vocab_get_attr(vocab, token);
}

// deprecated: use llama_vocab_is_eog instead
bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token) {
    return llama_vocab_is_eog(vocab, token);
}

// deprecated: use llama_vocab_is_control instead
bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token) {
    return llama_vocab_is_control(vocab, token);
}

// deprecated: use llama_vocab_bos instead
llama_token llama_token_bos(const struct llama_vocab * vocab) {
    return llama_vocab_bos(vocab);
}

// deprecated: use llama_vocab_eos instead
llama_token llama_token_eos(const struct llama_vocab * vocab) {
    return llama_vocab_eos(vocab);
}

// deprecated: use llama_vocab_eot instead
llama_token llama_token_eot(const struct llama_vocab * vocab) {
    return llama_vocab_eot(vocab);
}

// deprecated: CLS is mapped to BOS
llama_token llama_token_cls(const struct llama_vocab * vocab) {
    //return llama_vocab_cls(vocab);
    return llama_vocab_bos(vocab); // avoid deprecation warning
}

// deprecated: use llama_vocab_sep instead
llama_token llama_token_sep(const struct llama_vocab * vocab) {
    return llama_vocab_sep(vocab);
}

// deprecated: use llama_vocab_nl instead
llama_token llama_token_nl (const struct llama_vocab * vocab) {
    return llama_vocab_nl(vocab);
}

// deprecated: use llama_vocab_pad instead
llama_token llama_token_pad(const struct llama_vocab * vocab) {
    return llama_vocab_pad(vocab);
}

// deprecated: use llama_vocab_get_add_bos instead
bool llama_add_bos_token(const struct llama_vocab * vocab) {
    return llama_vocab_get_add_bos(vocab);
}

// deprecated: use llama_vocab_get_add_eos instead
bool llama_add_eos_token(const struct llama_vocab * vocab) {
    return llama_vocab_get_add_eos(vocab);
}

// deprecated: use llama_vocab_fim_pre instead
llama_token llama_token_fim_pre(const struct llama_vocab * vocab) {
    return llama_vocab_fim_pre(vocab);
}

// deprecated: use llama_vocab_fim_suf instead
llama_token llama_token_fim_suf(const struct llama_vocab * vocab) {
    return llama_vocab_fim_suf(vocab);
}

// deprecated: use llama_vocab_fim_mid instead
llama_token llama_token_fim_mid(const struct llama_vocab * vocab) {
    return llama_vocab_fim_mid(vocab);
}

// deprecated: use llama_vocab_fim_pad instead
llama_token llama_token_fim_pad(const struct llama_vocab * vocab) {
    return llama_vocab_fim_pad(vocab);
}

// deprecated: use llama_vocab_fim_rep instead
llama_token llama_token_fim_rep(const struct llama_vocab * vocab) {
    return llama_vocab_fim_rep(vocab);
}

// deprecated: use llama_vocab_fim_sep instead
llama_token llama_token_fim_sep(const struct llama_vocab * vocab) {
    return llama_vocab_fim_sep(vocab);
}
3903
3904//
3905// tokenization
3906//
3907
// C API tokenization entry point; forwards to llama_vocab::tokenize.
// See that overload for the meaning of the return value (negative on error).
int32_t llama_tokenize(
    const struct llama_vocab * vocab,
                  const char * text,
                     int32_t   text_len,
                 llama_token * tokens,
                     int32_t   n_tokens_max,
                        bool   add_special,
                        bool   parse_special) {
    return vocab->tokenize(text, text_len, tokens, n_tokens_max, add_special, parse_special);
}
3918
// C API piece rendering; forwards to llama_vocab::token_to_piece.
int32_t llama_token_to_piece(
    const struct llama_vocab * vocab,
                 llama_token   token,
                        char * buf,
                     int32_t   length,
                     int32_t   lstrip,
                        bool   special) {
    return vocab->token_to_piece(token, buf, length, lstrip, special);
}
3928
// C API detokenization; forwards to llama_vocab::detokenize.
int32_t llama_detokenize(
    const struct llama_vocab * vocab,
           const llama_token * tokens,
                     int32_t   n_tokens,
                        char * text,
                     int32_t   text_len_max,
                        bool   remove_special,
                        bool   unparse_special) {
    return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
}