1#include "llama-vocab.h"
2
3#include "ggml.h"
4#include "gguf.h"
5#include "llama-impl.h"
6#include "llama-model-loader.h"
7
8#include "unicode.h"
9
10#include <algorithm>
11#include <cassert>
12#include <cctype>
13#include <cfloat>
14#include <cmath>
15#include <cstdarg>
16#include <cstring>
17#include <forward_list>
18#include <limits>
19#include <map>
20#include <queue>
21#include <set>
22#include <unordered_map>
23
24//
25// helpers
26//
27
28struct naive_trie {
29 naive_trie() : has_value(false), value(0) {
30 }
31 void insert(const char * key, size_t len, int32_t value = 0) {
32 if (len == 0) {
33 this->has_value = true;
34 this->value = value;
35 return;
36 }
37 char c = key[0];
38 auto res = children.find(c);
39 if (res != children.end()) {
40 res->second.insert(key + 1, len - 1, value);
41 } else {
42 auto res = children.insert(std::make_pair(c, naive_trie()));
43 res.first->second.insert(key + 1, len - 1, value);
44 }
45 }
46 std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) const {
47 if (len == 0 || offset == len) {
48 return std::make_pair(key, offset);
49 }
50 char c = key[offset];
51 auto res = children.find(c);
52 if (res != children.end()) {
53 return res->second.get_longest_prefix(key, len, offset + 1);
54 }
55
56 return std::make_pair(key, offset);
57 }
58 const struct naive_trie * traverse(const char c) const {
59 auto res = children.find(c);
60 if (res != children.end()) {
61 return &res->second;
62 }
63
64 return NULL;
65 }
66 std::map<char, struct naive_trie> children;
67 bool has_value;
68 llama_token value;
69};
70
71//
72// tokenizers
73//
74
// Abstract base for per-vocab tokenizer models. Holds no state of its own;
// the virtual destructor allows derived tokenizers to be deleted through a
// base pointer.
struct llm_tokenizer {
    llm_tokenizer() = default;
    virtual ~llm_tokenizer() = default;
};
79
// A slice of the text being tokenized. Symbols form an intrusive doubly
// linked list (prev/next are indices into a std::vector<llm_symbol>) so
// adjacent entries can be merged in O(1) during SPM/BPE merging.
struct llm_symbol {
    using index = int;
    index prev;        // index of the previous symbol, or -1 at the front
    index next;        // index of the next symbol, or -1 at the end
    const char * text; // non-owning pointer into the source string
    size_t n;          // length in bytes; set to 0 once merged into a neighbor
};

static_assert(std::is_trivially_copyable<llm_symbol>::value, "llm_symbol is not trivially copyable");
89
90//
91// SPM tokenizer
92// original implementation:
93// https://github.com/ggml-org/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
94//
95
96struct llm_bigram_spm {
97 struct comparator {
98 bool operator()(llm_bigram_spm & l, llm_bigram_spm & r) {
99 return (l.score < r.score) || (l.score == r.score && l.left > r.left);
100 }
101 };
102 using queue_storage = std::vector<llm_bigram_spm>;
103 using queue = std::priority_queue<llm_bigram_spm, queue_storage, comparator>;
104 llm_symbol::index left;
105 llm_symbol::index right;
106 float score;
107 size_t size;
108};
109
// SPM tokenizer "model": no precomputed per-vocab state is needed; all
// per-call state lives in llm_tokenizer_spm_session.
struct llm_tokenizer_spm : llm_tokenizer {
    llm_tokenizer_spm(const llama_vocab & /*vocab*/) {}
};
113
// Per-call state for SPM (SentencePiece-style) tokenization. Starts from
// individual UTF-8 characters and greedily merges the adjacent pair whose
// concatenation is the highest-scoring token in the vocab.
struct llm_tokenizer_spm_session {
    llm_tokenizer_spm_session(const llama_vocab & vocab) : vocab(vocab) {}

    // Tokenize `text` and append the resulting token ids to `output`.
    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        // split string into utf8 chars
        int index = 0;
        size_t offs = 0;
        while (offs < text.size()) {
            llm_symbol sym;
            size_t len = unicode_len_utf8(text[offs]);
            sym.text = text.c_str() + offs;
            // clamp to the remaining bytes in case of a truncated UTF-8 sequence
            sym.n = std::min(len, text.size() - offs);
            offs += sym.n;
            sym.prev = index - 1;
            sym.next = offs == text.size() ? -1 : index + 1;
            index++;
            symbols.emplace_back(sym);
        }

        // seed the work queue with all possible 2-character tokens.
        for (int i = 1; i < (int) symbols.size(); ++i) {
            try_add_bigram(i - 1, i);
        }

        // keep substituting the highest frequency pairs for as long as we can.
        while (!work_queue.empty()) {
            auto bigram = work_queue.top();
            work_queue.pop();

            auto & left_sym = symbols[bigram.left];
            auto & right_sym = symbols[bigram.right];

            // if one of the symbols already got merged, skip it.
            // (the size check rejects stale queue entries whose symbols have
            // since grown through other merges)
            if (left_sym.n == 0 || right_sym.n == 0 ||
                left_sym.n + right_sym.n != bigram.size) {
                continue;
            }

            // merge the right sym into the left one
            left_sym.n += right_sym.n;
            right_sym.n = 0;

            //LLAMA_LOG_INFO("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);

            // remove the right sym from the chain
            left_sym.next = right_sym.next;
            if (right_sym.next >= 0) {
                symbols[right_sym.next].prev = bigram.left;
            }

            // find more substitutions
            try_add_bigram(left_sym.prev, bigram.left);
            try_add_bigram(bigram.left, left_sym.next);
        }

        // walk the surviving linked list and emit tokens
        for (int i = 0; i != -1; i = symbols[i].next) {
            auto & symbol = symbols[i];
            resegment(symbol, output);
        }
    }

private:
    // Emit token ids for `symbol`: either directly (if its text is a vocab
    // token), by recursively splitting a recorded merge back into its two
    // halves, or - as a last resort - as individual byte tokens.
    void resegment(llm_symbol & symbol, std::vector<llama_token> & output) {
        auto text = std::string(symbol.text, symbol.n);
        auto token = vocab.text_to_token(text);

        // Do we need to support is_unused?
        if (token != LLAMA_TOKEN_NULL) {
            output.push_back(token);
            return;
        }

        const auto p = rev_merge.find(text);

        if (p == rev_merge.end()) {
            // output any symbols that did not form tokens as bytes.
            output.reserve(output.size() + symbol.n);
            for (int j = 0; j < (int)symbol.n; ++j) {
                llama_token id = vocab.byte_to_token(symbol.text[j]);
                output.push_back(id);
            }
            return;
        }

        resegment(symbols[p->second.first], output);
        resegment(symbols[p->second.second], output);
    }

    // Queue the pair (left, right) as a merge candidate if the concatenated
    // text is a valid vocab token. Records the pair in rev_merge so that
    // resegment() can later undo the merge if needed.
    void try_add_bigram(int left, int right) {
        if (left == -1 || right == -1) {
            return;
        }
        // the two symbols are adjacent slices of the same backing string, so a
        // single string spanning both can be built from the left pointer
        const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n);
        auto token = vocab.text_to_token(text);

        if (token == LLAMA_TOKEN_NULL) {
            return;
        }

        if (static_cast<uint32_t>(token) >= vocab.n_tokens()) {
            return;
        }

        const auto & tok_data = vocab.get_token_data(token);

        llm_bigram_spm bigram;
        bigram.left = left;
        bigram.right = right;
        bigram.score = tok_data.score;
        bigram.size = text.size();

        work_queue.push(bigram);

        // Do we need to support is_unused?
        rev_merge[text] = std::make_pair(left, right);
    }

    const llama_vocab & vocab;
    // currently unused
    // const llm_tokenizer_spm * spm_tokenizer;

    std::vector<llm_symbol> symbols;            // working symbol list / linked list storage
    llm_bigram_spm::queue work_queue;           // pending merge candidates, best score first
    std::map<std::string, std::pair<int, int>> rev_merge; // merged text -> the symbol pair it came from
};
239
240//
241// BPE tokenizer
242// adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License]
243// tried to simplify unicode stuff, so most likely does not work 100% correctly!
244//
245
246// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused
247
// Drop-in std::priority_queue extension that adds move-aware extraction.
// std::priority_queue::top() only exposes a const reference, so popping an
// element by value normally costs a copy; pop_move() moves it out instead.
template<typename T, typename Container = std::vector<T>, typename Compare = std::less<typename Container::value_type>>
class llama_priority_queue : public std::priority_queue<T, Container, Compare> {
public:
    using std::priority_queue<T, Container, Compare>::priority_queue;

    // Remove and return the highest-priority element without copying it.
    // Moving from the front before std::pop_heap is safe: the moved-from
    // element remains valid for the swap/reheapify that pop_heap performs.
    // [[nodiscard]]: discarding the result would silently lose the element.
    [[nodiscard]] T pop_move() {
        T item = std::move(this->c.front());
        std::pop_heap(this->c.begin(), this->c.end(), this->comp);
        this->c.pop_back();
        return item;
    }

    // plain pop() is deleted so callers must use pop_move() and cannot
    // accidentally drop an element on the floor
    void pop() = delete;
};
262
// A candidate merge of two adjacent symbols for the BPE tokenizer.
// Candidates are ordered by merge rank (lower rank pops first); ties are
// broken by the leftmost position to keep tokenization deterministic.
struct llm_bigram_bpe {
    struct comparator {
        bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
            return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
        }
    };

    using queue_storage = std::vector<llm_bigram_bpe>;
    using queue = llama_priority_queue<llm_bigram_bpe, queue_storage, comparator>;
    llm_symbol::index left;  // index of the left symbol of the pair
    llm_symbol::index right; // index of the right symbol of the pair
    std::string text;        // concatenated text at queue time; used to detect stale entries
    int rank;                // BPE merge rank from the vocab (lower merges earlier)
    size_t size;             // total size in bytes of the merged text
};
278
// BPE tokenizer "model": selects the set of pre-tokenization regexes for the
// vocabulary's pre-tokenizer type. unicode_regex_split() applies them in
// order before the BPE merge loop runs on each resulting fragment.
struct llm_tokenizer_bpe : llm_tokenizer {
    llm_tokenizer_bpe(const llama_vocab & vocab) {
        GGML_ASSERT(vocab.get_type() == LLAMA_VOCAB_TYPE_BPE);
        switch (vocab.get_pre_type()) {
            case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
                regex_exprs = {
                    // original regex from tokenizer.json
                    //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",

                    // adapted: https://github.com/ggml-org/llama.cpp/pull/6920#issuecomment-2080233989
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_DBRX:
            case LLAMA_VOCAB_PRE_TYPE_SMAUG:
                regex_exprs = {
                    // same as llama3
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
                regex_exprs = {
                    "[\r\n]",
                    "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
                    "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
                    "\\s+$",
                    "[一-龥ࠀ-一가-]+",
                    "\\p{N}+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
            case LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE:
                regex_exprs = {
                    "\\p{N}{1,3}",
                    "[一-龥-ゟ゠-ヿ]+",
                    "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_YOUTU:
                regex_exprs = {
                    "[가-힣ㄱ-ㆎ]+|[!…“”‘’—:;,、-〿︰-﹏]+|[ㄅ-ㄯ]+|[一-龥-ゟ゠-ヿ]+",
                    "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
                regex_exprs = {
                    "[\r\n]",
                    "\\s?\\p{L}+",
                    "\\s?\\p{P}+",
                    "[一-龥ࠀ-一가-]+",
                    "\\p{N}",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_FALCON:
                regex_exprs = {
                    "[\\p{P}\\$\\+<=>\\^~\\|`]+",
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                    "[0-9][0-9][0-9]",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_STARCODER:
            case LLAMA_VOCAB_PRE_TYPE_REFACT:
            case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
            case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
            case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
            case LLAMA_VOCAB_PRE_TYPE_EXAONE:
            case LLAMA_VOCAB_PRE_TYPE_MINERVA:
                regex_exprs = {
                    "\\p{N}",
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_GPT2:
            case LLAMA_VOCAB_PRE_TYPE_MPT:
            case LLAMA_VOCAB_PRE_TYPE_OLMO:
            case LLAMA_VOCAB_PRE_TYPE_JAIS:
            case LLAMA_VOCAB_PRE_TYPE_TRILLION:
            case LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING:
                regex_exprs = {
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
            case LLAMA_VOCAB_PRE_TYPE_QWEN2:
            case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
            case LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_QWEN35:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_PORO:
            case LLAMA_VOCAB_PRE_TYPE_BLOOM:
            case LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH:
                regex_exprs = {
                    " ?[^(\\s|.,!?…。,、।۔،)]+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_CHATGLM4:
                regex_exprs = {
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_VIKING:
                regex_exprs = {
                    " ?[^(\\s|.,!?…。,、।۔،)]+",
                    "\\p{N}",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
                // original regex from tokenizer.json
                // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
                regex_exprs = {
                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_CHAMELEON:
                // Note: in theory, the special token (sentinel and image token) regex_exprs below
                // are unnecessary, as they are split in `tokenizer_st_partition` anyway.
                // However, since the upstream pre-tokenizer uses them, they are also
                // included here (see https://huggingface.co/facebook/chameleon-7b).
                regex_exprs = {
                    "<sentinel:[0-9]+>", // Sentinel tokens
                    "(IMGIMG)((A|B|C|D|E|F|G|H|I){1,4})Z", // Image tokens
                    "([\\t\\n]| | )", // directly from tokenizer.json
                    "\\p{N}", // Individual digits
                    "[\\p{P}!-/:-@\\[-`{-~]", // Punctuation, Isolated
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_GPT4O:
            case LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_KIMI_K2:
                regex_exprs = {
                    // K2 trigger pattern - this will activate the custom K2 handler in unicode.cpp
                    // The custom handler implements all K2 patterns with proper Han character exclusion
                    "\\p{Han}+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_SUPERBPE:
                regex_exprs = {
                    "\\p{N}+",
                    "(?=(\\d{3})+(?!\\d))",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_BAILINGMOE:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
                    // FIXME? Changed possessive quantifiers (?+ and ++) to greedy to avoid errors and imatrix hanging (tried atomic grouping but it's not supported?)
                    "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_SEED_CODER:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\r\n]+|\\s*[\r\n]+|\\s+(?!\\S)|\\s+"
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_GROK_2:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_AFMOE:
                regex_exprs = {
                    // Digit handling - uses custom implementation in unicode.cpp
                    // Groups digits with leading 1-2 based on total length modulo 3
                    "\\p{AFMoE_digits}",
                    // CJK and Asian scripts (using direct Unicode literals)
                    "[一-鿿㐀-䶿豈--ゟ゠-ヿ・-゚⼀-เ--ក-က-႟ꩠ-ꩿꧠ-가-ᄀ-ᇿ]+",
                    // Main BPE pattern
                    "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\\r\\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
                };
                break;
            default:
                // default regex for BPE tokenization pre-processing
                regex_exprs = {
                    "[\\p{P}\\$\\+<=>\\^~\\|]+",
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                    "\\p{N}+",
                    "[0-9][0-9][0-9]",
                };
                break;
        }
    }

    // pre-tokenization regexes, applied in order by unicode_regex_split()
    std::vector<std::string> regex_exprs;
};
492
// Per-call state for BPE tokenization. A session first splits the text with
// the tokenizer's pre-tokenization regexes, then BPE-merges each fragment
// using the vocab's merge ranks (lowest rank merges first).
struct llm_tokenizer_bpe_session {
    llm_tokenizer_bpe_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}

    // Append a single token id to `output`.
    static void append(const llama_token token_id, std::vector<llama_token> & output) {
        output.push_back(token_id);
    }

    // Append BOS if the vocab requests it; returns true if a token was added.
    bool append_bos(std::vector<llama_token> & output) const {
        if (vocab.get_add_bos()) {
            GGML_ASSERT(vocab.token_bos() != LLAMA_TOKEN_NULL);
            output.push_back(vocab.token_bos());
            return true;
        }
        return false;
    }

    // Append EOS if the vocab requests it; returns true if a token was added.
    bool append_eos(std::vector<llama_token> & output) const {
        if (vocab.get_add_eos()) {
            GGML_ASSERT(vocab.token_eos() != LLAMA_TOKEN_NULL);
            output.push_back(vocab.token_eos());
            return true;
        }
        return false;
    }

    // Warn when an automatically added BOS/EOS duplicates one already present
    // in the prompt text itself.
    void check_double_bos_eos(const std::vector<llama_token> & output) const {
        if (vocab.get_add_bos() && output.size() >= 2 && output[1] == vocab.token_bos()) {
            LLAMA_LOG_WARN(
                "%s: Added a BOS token to the prompt as specified by the model but the prompt "
                "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
                "Are you sure this is what you want?\n", __FUNCTION__);
        }
        if (vocab.get_add_eos() && output.size() >= 2 && *(output.end()-2) == vocab.token_eos()) {
            LLAMA_LOG_WARN(
                "%s: Added a EOS token to the prompt as specified by the model but the prompt "
                "also ends with a EOS token. So now the final prompt ends with 2 EOS tokens. "
                "Are you sure this is what you want?\n", __FUNCTION__);
        }
    }

    // Tokenize `text` and append the resulting token ids to `output`.
    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        int final_prev_index = -1;
        // pre-tokenize: split the text into word fragments with the regexes
        const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);

        symbols_final.clear();

        for (const auto & word : word_collection) {
            work_queue = llm_bigram_bpe::queue();
            symbols.clear();

            int index = 0;
            size_t offset = 0;

            //if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
            // if the whole fragment is already a token, emit it as one symbol
            if (vocab.get_ignore_merges() && vocab.text_to_token(word) != LLAMA_TOKEN_NULL) {
                symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
                offset = word.size();
            }

            // otherwise start from individual UTF-8 characters
            while (offset < word.size()) {
                llm_symbol sym;
                size_t char_len = std::min(word.size() - offset, (size_t) unicode_len_utf8(word[offset]));
                sym.text = word.c_str() + offset;
                sym.n = char_len;
                offset += sym.n;
                sym.prev = index - 1;
                sym.next = offset == word.size() ? -1 : index + 1;
                index++;
                symbols.emplace_back(sym);
            }
            for (int i = 1; i < (int) symbols.size(); ++i) {
                add_new_bigram(i - 1, i);
            }

            // build token(s)
            while (!work_queue.empty()) {
                auto bigram = work_queue.pop_move();

                auto & left_symbol = symbols[bigram.left];
                auto & right_symbol = symbols[bigram.right];

                if (left_symbol.n == 0 || right_symbol.n == 0) {
                    continue;
                }
                std::string left_token = std::string(left_symbol.text, left_symbol.n);
                std::string right_token = std::string(right_symbol.text, right_symbol.n);
                if (left_token + right_token != bigram.text) {
                    continue; // Skip this bigram if it's outdated
                }

                // merge the right sym into the left one
                left_symbol.n += right_symbol.n;
                right_symbol.n = 0;

                // remove the right sym from the chain
                left_symbol.next = right_symbol.next;
                if (right_symbol.next >= 0) {
                    symbols[right_symbol.next].prev = bigram.left;
                }

                add_new_bigram(left_symbol.prev, bigram.left); // left side of current symbol
                add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol
            }

            // add the finished tokens to the final list keeping correct order for next and prev
            for (auto & sym : symbols) {
                if (sym.n > 0) {
                    sym.prev = final_prev_index;
                    sym.next = -1;
                    if (final_prev_index != -1) {
                        symbols_final[final_prev_index].next = symbols_final.size();
                    }
                    symbols_final.emplace_back(sym);
                    final_prev_index = symbols_final.size() - 1;
                }
            }
        }

        symbols = symbols_final;

        if (!symbols.empty()) {
            for (int i = 0; i != -1; i = symbols[i].next) {
                auto & symbol = symbols[i];
                if (symbol.n == 0) {
                    continue;
                }

                const std::string str = std::string(symbol.text, symbol.n);
                const auto token = vocab.text_to_token(str);

                if (token == LLAMA_TOKEN_NULL) {
                    // unknown fragment: fall back to emitting its individual bytes
                    for (auto j = str.begin(); j != str.end(); ++j) {
                        std::string byte_str(1, *j);
                        auto token_multibyte = vocab.text_to_token(byte_str);
                        if (token_multibyte != LLAMA_TOKEN_NULL) {
                            output.push_back(token_multibyte);
                        }
                    }
                } else {
                    output.push_back(token);
                }
            }
        }
    }

private:
    // Queue the pair (left, right) as a merge candidate if the concatenated
    // text has a BPE merge rank in the vocab.
    void add_new_bigram(int left, int right) {
        if (left == -1 || right == -1) {
            return;
        }
        std::string left_token = std::string(symbols[left].text, symbols[left].n);
        std::string right_token = std::string(symbols[right].text, symbols[right].n);

        int rank_found = -1;

        rank_found = vocab.find_bpe_rank(left_token, right_token);

        if (rank_found < 0) {
            return;
        }

        llm_bigram_bpe bigram;

        bigram.left = left;
        bigram.right = right;
        bigram.text = left_token + right_token;
        bigram.size = left_token.size() + right_token.size();
        bigram.rank = rank_found;

        work_queue.push(bigram);
    }

    const llama_vocab & vocab;
    const llm_tokenizer_bpe & tokenizer;

    std::vector<llm_symbol> symbols;       // working symbols for the current fragment
    std::vector<llm_symbol> symbols_final; // accumulated symbols across all fragments
    llm_bigram_bpe::queue work_queue;      // pending merge candidates, best rank first
};
672
673//
674// WPM tokenizer
675//
676
// WPM (WordPiece) tokenizer "model": no precomputed per-vocab state is
// needed; all per-call state lives in llm_tokenizer_wpm_session.
struct llm_tokenizer_wpm : llm_tokenizer {
    llm_tokenizer_wpm(const llama_vocab & /*vocab*/) {}
};
680
// Per-call state for WPM (WordPiece) tokenization: normalize the text,
// split it into words, then greedily match the longest vocab token at each
// position within a word; a word with no full match becomes a single UNK.
struct llm_tokenizer_wpm_session {
    llm_tokenizer_wpm_session(const llama_vocab & vocab) : vocab(vocab) {}

    // Tokenize `text` and append the resulting token ids to `output`.
    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        // normalize and split by whitespace
        std::vector<std::string> words = preprocess(text);
        // bos token prepended already

        // find the longest tokens that form the words
        for (const std::string & word : words) {
            // skip empty words
            if (word.size() == 0) {
                continue;
            }

            // prepend phantom space
            const std::string word1 = "\xe2\x96\x81" + word;
            const int n = word1.size();

            // remember where this word's tokens start so they can be discarded
            // wholesale if the word cannot be fully matched
            const size_t current_tokens = output.size();

            // we're at the start of a new word
            // move through character position in word
            for (int i = 0; i < n; ++i) {
                // loop through possible match length
                // (capped at the longest token in the vocab)
                bool match = false;
                for (int j = std::min(n, i + vocab.max_token_len() + 1); j > i; j--) {
                    auto id = vocab.text_to_token(word1.substr(i, j - i));
                    if (id != LLAMA_TOKEN_NULL) {
                        output.push_back(id);
                        match = true;
                        i = j - 1;
                        break;
                    }
                }

                if (!match) { // discard all
                    output.resize(current_tokens);
                    break; // and discard next tokens
                }
            }

            // we didn't find any matches for this word
            if (current_tokens == output.size()) {
                output.push_back(vocab.token_unk());
            }
        }
    }

    // TODO: reduce string copies by using cpts_offs array
    // Normalize `text` (NFD decomposition, lowercasing, control-char removal)
    // and split it into words; punctuation, ASCII symbols and CJK characters
    // become single-character words of their own.
    static std::vector<std::string> preprocess(const std::string & text) {
        const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
        std::vector<std::string> words(1, "");

        for (const uint32_t cpt : cpts_nfd) {
            const auto flags = unicode_cpt_flags_from_cpt(cpt);

            if (flags.is_whitespace) {
                if (words.back().size()) { // finish previous word if any
                    words.emplace_back();
                }
                continue;
            }

            assert (!flags.is_separator);
            if (cpt == 0 || cpt == 0xFFFD || flags.is_control) {
                continue;
            }

            const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
            if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) {
                if (words.back().size()) { // finish previous word if any
                    words.emplace_back();
                }
                words.back() = s; // single char word
                words.emplace_back(); // start a new word
            } else {
                words.back() += s; // append char to word
            }
        }

        // drop a trailing empty word left by the splitting above
        if (!words.back().size()) {
            words.pop_back();
        }

        return words;
    }

    // True when the code point falls in one of the CJK blocks listed below
    // (mirrors the ranges used by the HF BERT tokenizer implementation).
    static bool is_chinese_char(uint32_t cpt) {
        return
            (cpt >= 0x04E00 && cpt <= 0x09FFF) ||
            (cpt >= 0x03400 && cpt <= 0x04DBF) ||
            (cpt >= 0x20000 && cpt <= 0x2A6DF) ||
            (cpt >= 0x2A700 && cpt <= 0x2B73F) ||
            (cpt >= 0x2B740 && cpt <= 0x2B81F) ||
            (cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
            (cpt >= 0x0F900 && cpt <= 0x0FAFF) ||
            (cpt >= 0x2F800 && cpt <= 0x2FA1F);
            //(cpt >= 0x3000 && cpt <= 0x303F) ||
            //(cpt >= 0xFF00 && cpt <= 0xFFEF);
    }

private:
    const llama_vocab & vocab;
    // currently unused
    // const llm_tokenizer_wpm * wpm_tokenizer;
};
788
789//
790// UGM tokenizer
791//
792
// UGM (unigram) tokenizer model, following SentencePiece. Parses the optional
// precompiled character map used for input normalization and builds tries
// over the vocab tokens for the Viterbi search in llm_tokenizer_ugm_session.
struct llm_tokenizer_ugm : llm_tokenizer {
    llm_tokenizer_ugm(const llama_vocab & vocab, const std::vector<char> & precompiled_charsmap) {
        if (precompiled_charsmap.size() > 0) {
            size_t charsmap_offset = 0;

            // First four bytes of precompiled_charsmap contains length of binary
            // blob containing XOR-compressed compact double array (XCDA) entries
            // NOTE(review): unaligned pointer-cast read; presumably assumes a
            // little-endian host matching the serialized format - confirm if porting
            uint32_t xcda_blob_size = *(const uint32_t *) &precompiled_charsmap[0];
            charsmap_offset += sizeof(xcda_blob_size);
            if (xcda_blob_size + charsmap_offset >= precompiled_charsmap.size()) {
                throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
            }

            // Next xcda_blob_size bytes contain entries of XOR-compressed compact
            // double array (XCDA). Each entry is bit-packed into a 32-bit integer.
            xcda_array = (const uint32_t *) &precompiled_charsmap[charsmap_offset];
            xcda_array_size = xcda_blob_size / sizeof(uint32_t);
            charsmap_offset += xcda_blob_size;

            // Remaining bytes of precompiled charsmap contain null-terminated
            // replacement strings for prefixes matched by the XCDA.
            prefix_replacements = &precompiled_charsmap[charsmap_offset];
            prefix_replacements_size = precompiled_charsmap.size() - charsmap_offset;
        }

        for (uint32_t id = 0; id < vocab.n_tokens(); ++id) {
            const auto & token_data = vocab.get_token_data(id);

            // track the score range of normal tokens; min_score later derives
            // the unknown-token score
            if (vocab.is_normal(id)) {
                min_score = std::min<float>(min_score, token_data.score);
                max_score = std::max<float>(max_score, token_data.score);
            }

            if (vocab.is_normal(id) ||
                vocab.is_user_defined(id) ||
                vocab.is_unused(id)) {
                token_matcher.insert(token_data.text.data(), token_data.text.size(), id);
            }

            if (vocab.is_user_defined(id)) {
                user_defined_token_matcher.insert(token_data.text.data(), token_data.text.size());
            }
        }

        unknown_token_score = min_score - unknown_token_score_penalty;
    }

    // escaped space symbol - U+2581 (Lower One Eighth Block)
    const std::string escaped_space = "\xE2\x96\x81";

    // Views into the precompiled charsmap (non-owning; assumes the
    // precompiled_charsmap buffer outlives this tokenizer - TODO confirm)
    const char * prefix_replacements = NULL;
    size_t prefix_replacements_size = 0;

    const uint32_t * xcda_array = NULL;
    size_t xcda_array_size = 0;

    // trie of user-defined tokens only
    struct naive_trie user_defined_token_matcher;

    // score range over normal tokens
    float min_score = FLT_MAX;
    float max_score = -FLT_MAX;

    float unknown_token_score_penalty = 10.0;
    float unknown_token_score;

    // trie of all matchable tokens (normal, user-defined, unused), keyed by
    // their UTF-8 text and storing the token id at the terminal node
    struct naive_trie token_matcher;
};
859
860struct llm_tokenizer_ugm_session {
861 llm_tokenizer_ugm_session(const llama_vocab & vocab, const llm_tokenizer_ugm & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
862
863 /* This implementation is based on SentencePiece optimized Viterbi algorithm for
864 * unigram language models. The general idea is to:
865 * - move along the input sequence in steps of one UTF code point,
866 * - at each step find all possible tokenizations of the prefix by
867 * traversing the tokens trie,
868 * - for each tokenization store the best one so far (by higher score)
869 * - use the position in sequence after given token as an index to store
870 * results
871 * - if there was no valid tokenization of the current UTF code point
872 * then use unknown token with additional score penalty
873 * After processing the whole sequence we backtrack from the end to get
874 * the best tokenization.
875 */
    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        // get current size of output (only this call's tokens are reversed at the end)
        size_t output_size = output.size();

        // normalize the input first
        std::string normalized;
        normalize(text, &normalized);
        size_t input_len = normalized.size();
        if (input_len == 0) {
            return;
        }

        // initialize score_sum to -DBL_MAX so it will be always lower than sums of token scores
        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -DBL_MAX});
        // at the beginning tokenization score is zero
        tokenization_results[0] = { vocab.token_unk(), 0, 0 };

        for (size_t input_offset = 0; input_offset < input_len;) {
            size_t prefix_offset = input_offset;
            // calculate how many code units are in the currently processed UTF code point
            size_t n_utf8_code_units = std::min<size_t>(unicode_len_utf8(normalized[input_offset]), input_len - input_offset);

            // traverse the token matcher trie to find a matching token
            bool single_codepoint_token_found = false;
            const struct best_tokenization & current_best = tokenization_results[input_offset];
            const struct naive_trie * node = tokenizer.token_matcher.traverse(normalized[prefix_offset++]);

            while (prefix_offset <= input_len && node != NULL) {
                // check if we found valid token in prefix
                if (node->has_value) {
                    // check if it corresponds to the whole UTF code point
                    if (prefix_offset - input_offset == n_utf8_code_units) {
                        single_codepoint_token_found = true;
                    }
                    llama_token token_id = node->value;
                    const auto & token_data = vocab.get_token_data(token_id);

                    // we set the user-defined token scores to 0 to make them more likely to be selected
                    // (normal token scores are log probabilities, so they are negative)
                    // score type is double here to make tokenization results exactly
                    // the same as in the HF tokenizer using SentencePiece
                    const double token_score = vocab.is_user_defined(token_id) ? 0.0 : token_data.score;
                    const double challenger_score = current_best.score_sum + token_score;
                    struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                    if (challenger_score > current_champ.score_sum) {
                        struct best_tokenization challenger = { token_id, input_offset, challenger_score };
                        current_champ = challenger;
                    }
                }
                // when prefix_offset == input_len this reads normalized[input_len], which is
                // the terminating '\0' (well-defined for std::string); the
                // `prefix_offset <= input_len` bound then ends the loop on the next iteration
                node = node->traverse(normalized[prefix_offset++]);
            }

            // if we didn't find a valid token corresponding to the whole UTF code point
            // then use unknown token as the tokenization of this UTF code point
            if (!single_codepoint_token_found) {
                const double challenger_score = current_best.score_sum + tokenizer.unknown_token_score;
                prefix_offset = input_offset + n_utf8_code_units;
                struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                if (challenger_score > current_champ.score_sum) {
                    struct best_tokenization challenger = { vocab.token_unk(), input_offset, challenger_score };
                    current_champ = challenger;
                }
            }

            // move to the next UTF code point
            input_offset += n_utf8_code_units;
        }

        // now backtrack from the end to gather token ids of the best tokenization
        // merge sequences of consecutive unknown tokens into single unknown tokens
        // note: `tokenization` is a reference bound to tokenization_results[input_len];
        // the loop "increment" copies the predecessor entry into that slot, which is
        // safe because only the referenced slot is read afterwards
        bool is_prev_unknown = false;
        for (struct best_tokenization & tokenization = tokenization_results[input_len]; ; tokenization = tokenization_results[tokenization.input_offset]) {
            bool is_unknown = tokenization.token_id == vocab.token_unk();
            if (!(is_prev_unknown && is_unknown)) {
                output.push_back(tokenization.token_id);
            }
            if (tokenization.input_offset == 0) {
                break;
            }
            is_prev_unknown = is_unknown;
        }

        // reverse the output since we added tokens starting from the end of the input
        std::reverse(output.begin() + output_size, output.end());
    }
961
962private:
963
964 // helper structure for returning normalization results
965 struct normalization_result {
966 const char * normalized;
967 size_t normalized_len;
968 size_t consumed_input;
969 };
970
    // Apply SentencePiece-style normalization to `input` and write the result to
    // `normalized`: repeatedly normalizes a prefix via normalize_prefix(), then
    // rewrites whitespace according to the vocab flags (escaped space symbol,
    // optional prepended/appended space, optional merging of space runs).
    // normalize_prefix() always consumes at least one byte here (input_offset is
    // strictly below input.size()), so the outer loop terminates.
    void normalize(const std::string& input, std::string * normalized) {
        normalized->clear();
        normalized->reserve(input.size() * 3);

        // the string every space is replaced with (escaped space symbol or plain " ")
        const std::string space = vocab.get_escape_whitespaces() ? tokenizer.escaped_space : " ";

        const bool shall_prepend_space = !vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix();
        const bool shall_append_space = vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix();
        const bool shall_merge_spaces = vocab.get_remove_extra_whitespaces();

        bool is_space_prepended = false;
        bool processing_non_ws = false;  // true while inside a run of non-space characters

        size_t input_len = input.size();

        for (size_t input_offset = 0; input_offset < input_len; ) {
            auto norm_res = normalize_prefix(input, input_offset);
            for (size_t i = 0; i < norm_res.normalized_len; i++) {
                char c = norm_res.normalized[i];
                if (c != ' ') {
                    if (!processing_non_ws) {
                        processing_non_ws = true;
                        // when merging spaces, a run of spaces is emitted lazily as a
                        // single `space` right before the next non-space character
                        if ((shall_prepend_space && !is_space_prepended) || shall_merge_spaces) {
                            normalized->append(space);
                            is_space_prepended = true;
                        }
                    }
                    normalized->push_back(c);
                } else {
                    if (processing_non_ws) {
                        processing_non_ws = false;
                    }
                    if (!shall_merge_spaces) {
                        normalized->append(space);
                    }
                }
            }

            input_offset += norm_res.consumed_input;
        }

        if (shall_append_space) {
            normalized->append(space);
        }
    }
1016
1017 /*
1018 * This structure is a view wrapper for XOR-compressed double array (XCDA)
1019 * See Shunsuke Kanda (2018). Space- and Time-Efficient String Dictionaries.
1020 * Each bit-packed entry contains:
1021 * - BASE array value in bits 10-30
1022 * - LCHECK array value in bits 0-7
1023 * - LEAF array value in bit 9
1024 * Entries containing indexes of replacement sequences have set bit 31
1025 */
1026 struct xcda_array_view {
1027 public:
1028 xcda_array_view(const uint32_t * xcda_array, size_t xcda_array_size) : xcda_array(xcda_array), xcda_array_size(xcda_array_size) {
1029 }
1030 uint32_t get_base(size_t index) {
1031 uint32_t packed_node = get_node(index);
1032 return (packed_node >> 10) << ((packed_node & (1U << 9)) >> 6);
1033 }
1034 uint32_t get_lcheck(size_t index) {
1035 uint32_t packed_node = get_node(index);
1036 return packed_node & ((1U << 31) | 0xff);
1037 }
1038 bool get_leaf(size_t index) {
1039 uint32_t packed_node = get_node(index);
1040 return (packed_node >> 8) & 1;
1041 }
1042 uint32_t get_value(size_t index) {
1043 uint32_t packed_node = get_node(index);
1044 return packed_node & ((1U << 31) - 1);
1045 }
1046 private:
1047 uint32_t get_node(size_t index) {
1048 if (index >= xcda_array_size) {
1049 throw std::runtime_error("Index out of array bounds in XCDA array!");
1050 }
1051 return xcda_array[index];
1052 }
1053 const uint32_t * xcda_array;
1054 size_t xcda_array_size;
1055 };
1056
    // this structure stores the best tokenization so far at input_offset
    struct best_tokenization {
        llama_token token_id; // last token of the best tokenization ending at this offset
        size_t input_offset;  // start offset of that token (i.e. the previous DP entry)
        double score_sum;     // total score of the best tokenization ending at this offset
    };
1063
    // Normalize a single prefix of `input` starting at `input_offset`:
    //   1. if the prefix matches a user-defined token, return it unmodified;
    //   2. otherwise find the longest prefix present in the precompiled charsmap
    //      (walked as an XCDA trie) and return its replacement sequence;
    //   3. otherwise return the next UTF-8 code point unmodified, or U+FFFD while
    //      consuming one byte if the input is not valid UTF-8.
    // Returns a view of the normalized bytes and how much input was consumed.
    struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {
        if (input_offset == input.size()) {
            return { &input[input_offset], 0, 0 };
        }

        // if input prefix matches some user-defined token return this token as normalization result
        auto user_defined_token_match =
           tokenizer.user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
        if (user_defined_token_match.second > 0) {
            return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second };
        }

        size_t longest_prefix_length = 0;
        size_t longest_prefix_offset = 0;

        if (tokenizer.xcda_array_size > 0) {
            struct xcda_array_view xcda_view(tokenizer.xcda_array, tokenizer.xcda_array_size);

            // Find the longest normalized sequence matching the input prefix by walking
            // the XOR-compressed compact double array (XCDA) starting from the root node
            // We find the index of the next node by calculating BASE[s] ^ c where s is
            // the index of the previous node and c is a numerical character value
            uint32_t node_index = 0;
            // get BASE of the root node
            node_index = xcda_view.get_base(node_index);
            for (size_t prefix_offset = input_offset; prefix_offset < input.size(); prefix_offset++) {
                unsigned char c = input[prefix_offset];
                if (c == 0) {
                    break;
                }
                node_index ^= c;
                // if value of LCHECK is not c it means that this is not a child of
                // the previous node, so we stop matching
                if (xcda_view.get_lcheck(node_index) != c) {
                    break;
                }
                bool is_leaf = xcda_view.get_leaf(node_index);
                // get BASE of the current node
                node_index ^= xcda_view.get_base(node_index);
                // if LEAF of the current node is true, it means that its BASE points to the node
                // containing index of replacement sequence for currently matched input prefix
                if (is_leaf)
                {
                    longest_prefix_length = prefix_offset - input_offset + 1;
                    // get index of replacement sequence for currently matched input prefix
                    longest_prefix_offset = xcda_view.get_value(node_index);
                }
            }
        }

        if (longest_prefix_length > 0) {
            // we have a match, so return the replacement sequence
            if (longest_prefix_offset >= tokenizer.prefix_replacements_size) {
                throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
            }
            // replacement sequences are NUL-terminated strings inside prefix_replacements
            const char * prefix_replacement = &(tokenizer.prefix_replacements)[longest_prefix_offset];
            return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length };
        }

        // check if the input prefix contains a valid sequence of UTF-8 code units
        try {
            // if yes, return this sequence unmodified
            size_t prefix_offset = input_offset;
            unicode_cpt_from_utf8(input, prefix_offset);
            return { &input[input_offset], prefix_offset - input_offset, prefix_offset - input_offset };
        } catch (std::invalid_argument & /*ex*/) {
            // if no, consume 1 byte and return U+FFFD - REPLACEMENT CHARACTER
            return { "\xEF\xBF\xBD", 3, 1 };
        }
    }
1134
1135 const llama_vocab & vocab;
1136 const llm_tokenizer_ugm & tokenizer;
1137};
1138
1139//
1140// RWKV tokenizer
1141//
1142
// Decode an escaped RWKV vocab entry into its raw byte sequence.
// Supported escapes: \t, \n, \r, \xNN (two hex digits, upper- or lowercase),
// and \<any> which yields <any> literally (e.g. \\ -> backslash).
// All other characters are copied through unchanged.
static std::vector<uint8_t> llama_unescape_rwkv_token(const std::string & escaped) {
    std::vector<uint8_t> output;
    output.reserve(escaped.size());

    // Parser state
    bool escaping = false;     // previous character was an unconsumed backslash
    uint8_t hex_remaining = 0; // number of hex digits still expected after "\x"
    uint8_t hex_acc = 0;       // accumulator for the byte encoded by "\xNN"

    // Step through characters, performing parsing
    for (const char & c : escaped) {
        // If we're parsing a hex code, interpret the next character
        if (hex_remaining != 0) {
            // accept both lowercase and uppercase hex digits
            // (the previous version mapped 'A'-'F' through (c - '0'), producing wrong values)
            uint8_t value;
            if (c >= 'a' && c <= 'f') {
                value = c - 'a' + 10;
            } else if (c >= 'A' && c <= 'F') {
                value = c - 'A' + 10;
            } else {
                value = c - '0';
            }
            hex_acc = (hex_acc << 4) + value;

            hex_remaining -= 1;
            if (hex_remaining == 0) {
                output.push_back(hex_acc);
                hex_acc = 0;
            }

            continue;
        }

        // If we got an escape character, interpret it
        if (escaping) {
            if (c == 't') {
                output.push_back('\t');
            } else if (c == 'n') {
                output.push_back('\n');
            } else if (c == 'r') {
                output.push_back('\r');
            } else if (c == 'x') {
                hex_remaining = 2;
            } else {
                // unknown escape: emit the character literally (covers "\\")
                output.push_back(c);
            }

            escaping = false;
            continue;
        }

        if (c == '\\') {
            escaping = true;
            continue;
        }

        output.push_back(c);
    }

    return output;
}
1196
1197struct llm_tokenizer_rwkv : llm_tokenizer {
1198 llm_tokenizer_rwkv(const llama_vocab & vocab) {
1199 // RWKV supports arbitrary byte tokens, but the vocab struct only supports string tokens.
1200 // For now, we decode the vocab here into the lookup we'll use for tokenization.
1201
1202 // build trie
1203 for (uint32_t id = 0; id < vocab.n_tokens(); ++id) {
1204 const auto & data = vocab.get_token_data(id);
1205 const auto text = llama_unescape_rwkv_token(data.text);
1206 token_matcher.insert((const char *) text.data(), text.size(), id);
1207 }
1208 }
1209
1210 struct naive_trie token_matcher;
1211};
1212
1213struct llm_tokenizer_rwkv_session {
1214 llm_tokenizer_rwkv_session(const llama_vocab & vocab, const llm_tokenizer_rwkv & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
1215
1216 void tokenize(const std::string & text, std::vector<llama_token> & output) {
1217 uint32_t position = 0;
1218 while (position < text.size()) {
1219 const struct naive_trie * node = tokenizer.token_matcher.traverse(text[position]);
1220 if (node == NULL) {
1221 // no matching token found, add unknown token
1222 output.push_back(vocab.token_unk());
1223 position += 1;
1224 continue;
1225 }
1226
1227 // traverse the trie to find the longest matching token
1228 uint32_t token_id = 0;
1229 uint32_t token_length = 0;
1230 while (node != NULL) {
1231 if (node->has_value) {
1232 token_id = node->value;
1233 token_length = position + 1;
1234 }
1235 node = node->traverse(text[++position]);
1236 }
1237
1238 // add the longest matching token
1239 output.push_back(token_id);
1240 position = token_length;
1241 }
1242 }
1243
1244private:
1245 const llama_vocab & vocab;
1246 const llm_tokenizer_rwkv & tokenizer;
1247};
1248
// PLaMo-2 tokenizer: build() flattens the vocabulary into a suffix/piece table,
// then encode() runs a Viterbi-style dynamic program over that table (minimizing
// negated quantized scores) to find the best-scoring tokenization, falling back
// to byte tokens for unknown code points.
struct llm_tokenizer_plamo2 : llm_tokenizer {
    llm_tokenizer_plamo2(const llama_vocab & vocab) {
        build(vocab);
    }

    // Construct the lookup structures (tokens_, bytes_, to_suffix_id_, table_)
    // from the vocabulary. Throws if any of the 256 byte-fallback tokens is missing.
    void build(const llama_vocab & vocab) {
        // Reset internal structures
        tokens_.clear();
        bytes_.assign(256, 0);
        to_suffix_id_.clear();
        table_.clear();

        // Build token list and byte mapping
        std::unordered_map<std::string, float> suffix_to_score;
        std::unordered_map<std::string, llama_token> token_to_id;

        for (size_t token_id = 0; token_id < vocab.n_tokens(); ++token_id) {
            const auto & entry = vocab.get_token_data(token_id);
            tokens_.push_back(entry.text);
            token_to_id[entry.text] = static_cast<llama_token>(token_id);

            // Handle byte tokens (stored as "<0xNN>")
            if (vocab.is_byte(token_id)) {
                if (entry.text.length() == 6 && entry.text.substr(0, 3) == "<0x" && entry.text.back() == '>') {
                    std::string hex_str = entry.text.substr(3, 2);
                    int byte_val = std::stoi(hex_str, nullptr, 16);
                    bytes_[byte_val] = static_cast<llama_token>(token_id);
                }
                continue;
            }

            // Add token and all its suffixes to suffix_to_score
            suffix_to_score[entry.text] = entry.score;

            // Extract suffixes character by character (UTF-8 aware)
            std::vector<uint32_t> cpts = unicode_cpts_from_utf8(entry.text);
            for (size_t i = 1; i < cpts.size(); ++i) {
                std::string suffix;
                for (size_t j = i; j < cpts.size(); ++j) {
                    suffix += unicode_cpt_to_utf8(cpts[j]);
                }
                if (suffix_to_score.find(suffix) == suffix_to_score.end()) {
                    // NaN marks suffixes that are not tokens themselves; they get
                    // INVALID_SCORE in the flattened table below
                    suffix_to_score[suffix] = std::numeric_limits<float>::quiet_NaN();
                }
            }
        }

        // Check that all byte tokens are set
        // NOTE(review): this uses 0 as the "unset" sentinel in bytes_, which assumes
        // token id 0 is never itself a byte token — confirm against PLaMo-2 vocabs
        for (int i = 0; i < 256; ++i) {
            if (bytes_[i] == 0) {
                throw std::runtime_error("Byte token for <0x" + std::to_string(i) + "> is not set");
            }
        }

        // Build suffix list in lexicographical order of reversed strings
        std::vector<std::string> suffixes;
        suffixes.reserve(suffix_to_score.size() + 1);
        for (const auto & pair : suffix_to_score) {
            suffixes.push_back(pair.first);
        }
        suffixes.push_back(""); // Empty suffix

        std::sort(suffixes.begin(), suffixes.end(), [](const std::string & a, const std::string & b) {
            std::string rev_a(a.rbegin(), a.rend());
            std::string rev_b(b.rbegin(), b.rend());
            return rev_a < rev_b;
        });

        // Build suffix_to_id and to_suffix_id_
        std::unordered_map<std::string, int32_t> suffix_to_id;
        int32_t num_pieces = 0;

        for (const auto & suffix : suffixes) {
            suffix_to_id[suffix] = num_pieces;
            if (!suffix.empty()) {
                std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);

                std::string remaining;
                for (size_t i = 1; i < cpts.size(); ++i) {
                    remaining += unicode_cpt_to_utf8(cpts[i]);
                }

                // piece code packs (first code point, suffix id of the remainder);
                // to_suffix_id_ maps it to the table row of the longer suffix
                int64_t piece_code = (static_cast<int64_t>(cpts[0]) << 32) | suffix_to_id[remaining];
                to_suffix_id_[piece_code] = num_pieces;

                // Count number of pieces for this suffix
                int32_t pieces_for_suffix = 1; // sentinel row
                for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
                    std::string piece;
                    for (int32_t i = 0; i < piece_length; ++i) {
                        piece += unicode_cpt_to_utf8(cpts[i]);
                    }
                    if (suffix_to_score.find(piece) != suffix_to_score.end()) {
                        pieces_for_suffix++;
                    }
                }
                num_pieces += pieces_for_suffix;
            } else {
                num_pieces++; // Empty suffix contributes one piece (sentinel row)
            }
        }

        // Build flattened table
        table_.resize(num_pieces, std::vector<int32_t>(4, 0));
        int32_t table_idx = 0;

        for (const auto & suffix : suffixes) {
            // Add all prefixes of the suffix to the table (in decreasing order of length)
            std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);
            for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
                std::string piece;
                for (int32_t i = 0; i < piece_length; ++i) {
                    piece += unicode_cpt_to_utf8(cpts[i]);
                }

                auto score_it = suffix_to_score.find(piece);
                if (score_it == suffix_to_score.end()) {
                    continue;
                }

                table_[table_idx][TABLE_PIECE_LENGTH] = piece_length;
                auto token_it = token_to_id.find(piece);
                table_[table_idx][TABLE_TOKEN_ID] = (token_it != token_to_id.end()) ? token_it->second : -1;

                // scores are quantized to fixed point with 4 decimal places;
                // NaN (suffix that is not a token) becomes INVALID_SCORE
                float score = score_it->second;
                table_[table_idx][TABLE_SCORE] = std::isfinite(score) ?
                    static_cast<int32_t>(std::round(score * 1e4)) : INVALID_SCORE;
                table_[table_idx][TABLE_PIECE_ID] = suffix_to_id[piece];

                table_idx++;
            }

            // Add sentinel row
            table_[table_idx][TABLE_PIECE_LENGTH] = 1;
            table_[table_idx][TABLE_TOKEN_ID] = -1;
            table_[table_idx][TABLE_SCORE] = UNKNOWN_SCORE;
            table_idx++;
        }
    }

    // Tokenize `text` into token ids using a backwards dynamic program:
    // scores[i] holds the minimal accumulated (negated) score for the input
    // suffix starting at code point i; path[] records the choices for decoding.
    std::vector<llama_token> encode(const std::string & text) const {
        std::vector<uint32_t> unicode_data = unicode_cpts_from_utf8(text);
        // Skip the first code point if it is a BOM (Byte Order Mark)
        if (!unicode_data.empty() && unicode_data[0] == 0xFEFF) {
            unicode_data.erase(unicode_data.begin());
        }

        if (unicode_data.empty()) {
            return {};
        }

        const size_t data_len = unicode_data.size();

        // Initialize scores array (dynamic programming); 1<<60 acts as +infinity
        std::vector<int64_t> scores(data_len + 1, static_cast<int64_t>(1) << 60);
        scores[data_len] = 0;

        // Path array to track best tokenization
        std::vector<std::vector<int32_t>> path(data_len + 1, std::vector<int32_t>(3, 0));

        int32_t suffix_id = 0;

        // Process from end to beginning
        for (int i = static_cast<int>(data_len) - 1; i >= 0; --i) {
            uint32_t c = unicode_data[i];

            // Find next suffix ID
            for (size_t p = suffix_id; p < table_.size(); ++p) {
                int64_t piece_code = (static_cast<int64_t>(c) << 32) | table_[p][TABLE_PIECE_ID];
                auto it = to_suffix_id_.find(piece_code);
                suffix_id = (it != to_suffix_id_.end()) ? it->second : 0;

                if (suffix_id > 0 || table_[p][TABLE_SCORE] == UNKNOWN_SCORE) {
                    break;
                }
            }

            // Update best path over all pieces starting at suffix_id
            // (each suffix's rows end with an UNKNOWN_SCORE sentinel row)
            for (size_t p = suffix_id; p < table_.size(); ++p) {
                int32_t score = table_[p][TABLE_SCORE];
                if (score > INVALID_SCORE) {
                    int32_t piece_length = table_[p][TABLE_PIECE_LENGTH];
                    int64_t s = scores[i + piece_length] - score;

                    if (s < scores[i]) {
                        scores[i] = s;
                        path[i][PATH_TOKEN_LENGTH] = piece_length;
                        path[i][PATH_TOKEN_ID] = table_[p][TABLE_TOKEN_ID];
                        path[i][PATH_NUM_TOKENS] = path[i + piece_length][PATH_NUM_TOKENS] + 1;

                        if (score == UNKNOWN_SCORE) {
                            // Add UTF-8 byte count
                            path[i][PATH_NUM_TOKENS] += (c >= 0x80) + (c >= 0x800) + (c >= 0x10000);
                        }
                    }
                }

                if (score == UNKNOWN_SCORE) {
                    break;
                }
            }
        }

        // Decode the best path
        std::vector<llama_token> token_ids;
        token_ids.reserve(path[0][PATH_NUM_TOKENS]);

        int pos = 0;
        while (pos < static_cast<int>(data_len)) {
            if (path[pos][PATH_TOKEN_ID] >= 0) {
                token_ids.push_back(path[pos][PATH_TOKEN_ID]);
            } else {
                // Fall back to byte tokens: emit the UTF-8 encoding of the code point
                uint32_t c = unicode_data[pos];
                int s = 1 + (c >= 0x80) + (c >= 0x800) + (c >= 0x10000); // UTF-8 length

                for (int i = 0; i < s; ++i) {
                    uint8_t b;
                    if (s == 1) {
                        b = c;
                    } else {
                        if (i == 0) {
                            // leading-byte marker: 0xC0/0xE0/0xF0 for s = 2/3/4
                            b = (0xF00 >> s) & 0xFF;
                        } else {
                            b = 0x80; // continuation-byte marker
                        }
                    }
                    token_ids.push_back(bytes_[b | ((c >> ((s - i - 1) * 6)) & 0x3F)]);
                }
            }

            assert(path[pos][PATH_TOKEN_LENGTH] > 0);
            pos += path[pos][PATH_TOKEN_LENGTH];
        }

        return token_ids;
    }
private:
    // Constants for table structure
    static constexpr int32_t TABLE_PIECE_LENGTH = 0;
    static constexpr int32_t TABLE_TOKEN_ID = 1;
    static constexpr int32_t TABLE_SCORE = 2;
    static constexpr int32_t TABLE_PIECE_ID = 3;

    // Constants for path array
    static constexpr int32_t PATH_TOKEN_LENGTH = 0;
    static constexpr int32_t PATH_TOKEN_ID = 1;
    static constexpr int32_t PATH_NUM_TOKENS = 2;

    // Score constants (quantized fixed-point, 4 decimal places)
    static constexpr int32_t INVALID_SCORE = -20000000; // suffix that is not a token
    static constexpr int32_t UNKNOWN_SCORE = -10000000; // sentinel row / byte fallback

    // List of tokens in the vocabulary
    std::vector<std::string> tokens_;

    // Mapping from byte code point to token ID (for byte fallback)
    std::vector<llama_token> bytes_;

    // Mapping from piece code to suffix ID
    std::unordered_map<int64_t, int32_t> to_suffix_id_;

    // Flattened table representing the Trie structure
    // Each row contains: [piece_length, token_id, score, piece_id]
    std::vector<std::vector<int32_t>> table_;
};
1515
1516struct llm_tokenizer_plamo2_session {
1517 llm_tokenizer_plamo2_session(const llm_tokenizer_plamo2 & tokenizer) : tokenizer(tokenizer) {}
1518
1519 void tokenize(const std::string & text, std::vector<llama_token> & output) {
1520 std::vector<llama_token> tokens = tokenizer.encode(text);
1521 output.insert(output.end(), tokens.begin(), tokens.end());
1522 }
1523
1524private:
1525 const llm_tokenizer_plamo2 & tokenizer;
1526};
1527
1528//
1529// impl
1530//
1531
// tag for fragment_buffer_variant: either an already-resolved special token id
// or a span of raw text that still needs tokenization
typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
    FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
    FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
} FRAGMENT_BUFFER_VARIANT_TYPE;
1536
// one fragment of the tokenizer input: either a resolved special token id, or a
// [offset, offset + length) span of raw text that still needs to be tokenized
struct fragment_buffer_variant {
    // construct a TOKEN fragment; raw_text is bound to the empty _dummy member
    fragment_buffer_variant(llama_token _token)
    :
        type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
        token(_token),
        raw_text(_dummy),
        offset(0),
        length(0) {}

    // construct a RAW_TEXT fragment referencing [_offset, _offset + _length) of _raw_text
    // note: only a reference is stored — _raw_text must outlive this fragment
    fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
    :
        type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
        token((llama_token) - 1),
        raw_text(_raw_text),
        offset(_offset),
        length(_length){
            GGML_ASSERT(_offset >= 0);
            GGML_ASSERT(_length >= 1);
            GGML_ASSERT(offset + length <= raw_text.length());
        }

    const FRAGMENT_BUFFER_VARIANT_TYPE type;
    const llama_token token;      // valid for TOKEN fragments; -1 for RAW_TEXT
    const std::string _dummy;     // empty string for raw_text to reference in TOKEN fragments
    const std::string & raw_text; // meaningful only for RAW_TEXT fragments
    const uint64_t offset;        // start of the span within raw_text
    const uint64_t length;        // length of the span
};
1565
// private implementation of llama_vocab (pimpl): tokenizer model configuration,
// special token ids, token tables, caches, and the tokenizer instance itself
struct llama_vocab::impl {
    uint32_t n_token_types = 0; // for BERT-style token types

    std::string tokenizer_model; // value of LLM_KV_TOKENIZER_MODEL (e.g. "llama", "gpt2")
    std::string tokenizer_pre;   // value of LLM_KV_TOKENIZER_PRE (pre-tokenizer variant)

    enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
    enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;

    int max_token_len = 0; // used for optimizing longest token search

    // default LLaMA special tokens
    // TODO: should we set all of these to LLAMA_TOKEN_NULL?
    llama_token special_bos_id = 1;
    llama_token special_eos_id = 2;
    llama_token special_eot_id = LLAMA_TOKEN_NULL;
    llama_token special_eom_id = LLAMA_TOKEN_NULL;
    llama_token special_unk_id = 0;
    llama_token special_sep_id = LLAMA_TOKEN_NULL;
    llama_token special_pad_id = LLAMA_TOKEN_NULL;
    llama_token special_mask_id = LLAMA_TOKEN_NULL;

    llama_token linefeed_id = 13;

    // fim (fill-in-the-middle) tokens
    llama_token special_fim_pre_id = LLAMA_TOKEN_NULL;
    llama_token special_fim_suf_id = LLAMA_TOKEN_NULL;
    llama_token special_fim_mid_id = LLAMA_TOKEN_NULL;
    llama_token special_fim_pad_id = LLAMA_TOKEN_NULL;
    llama_token special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
    llama_token special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator

    // tokenizer flags
    bool add_space_prefix = false;
    bool add_bos = false;
    bool add_eos = false;
    bool add_sep = false;
    bool ignore_merges = false;
    bool clean_spaces = false;  // clean_up_tokenization_spaces
    bool remove_extra_whitespaces = false;
    bool escape_whitespaces = true;
    bool treat_whitespace_as_suffix = false;

    // token lookup in both directions
    std::unordered_map<std::string, llama_token> token_to_id;
    std::vector<token_data> id_to_token;

    std::vector<llama_token> cache_special_tokens;
    std::vector<std::string> cache_token_to_piece; // llama_token_to_piece(special = true);
    // hash for a (left, right) merge pair; combines the two string hashes,
    // shifting the second so that swapped pairs hash differently
    struct pair_hash {
        size_t operator()(const std::pair<std::string, std::string> & p) const {
            return std::hash<std::string>{}(p.first) ^
                   (std::hash<std::string>{}(p.second) << 1);
        }
    };
    // BPE merge ranks: (left, right) -> merge priority (lower = applied earlier)
    std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> bpe_ranks;

    // set of all tokens that cause "end of generation"
    std::set<llama_token> special_eog_ids;

    std::unique_ptr<llm_tokenizer> tokenizer;

    // raw precompiled charsmap blob for the UGM tokenizer (may be empty)
    std::vector<char> precompiled_charsmap;

    impl(const llama_vocab & vocab) : vocab(vocab) {
    }

    ~impl() = default;

    void load(llama_model_loader & ml, const LLM_KV & kv);

    enum llama_vocab_type get_type() const;

    std::string type_name() const;

    bool is_normal      (llama_token id) const;
    bool is_unknown     (llama_token id) const;
    bool is_control     (llama_token id) const;
    bool is_byte        (llama_token id) const;
    bool is_user_defined(llama_token id) const;
    bool is_unused      (llama_token id) const;
    bool is_eog         (llama_token id) const;

    uint8_t token_to_byte(llama_token id) const;

    llama_token_attr token_get_attr(llama_token id) const;

    void init_tokenizer(enum llama_vocab_type type);

    // split raw-text fragments around occurrences of special tokens
    void tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const;

    std::string token_to_piece_for_cache(
                  llama_token   token,
                         bool   special) const;


    std::vector<llama_token> tokenize(
            const std::string & raw_text,
                         bool   add_special,
                         bool   parse_special = false) const;

    int32_t tokenize(
                   const char * text,
                      int32_t   text_len,
                  llama_token * tokens,
                      int32_t   n_tokens_max,
                         bool   add_special,
                         bool   parse_special) const;

    // does not write null-terminator to buf
    int32_t token_to_piece(
                  llama_token   token,
                         char * buf,
                      int32_t   length,
                      int32_t   lstrip,
                         bool   special) const;

    // use cached data
    const std::string & token_to_piece(llama_token token) const;

    int32_t detokenize(
            const llama_token * tokens,
                      int32_t   n_tokens,
                         char * text,
                      int32_t   text_len_max,
                         bool   remove_special,
                         bool   unparse_special) const;

    std::string detokenize(
            const std::vector<llama_token> & tokens,
                                      bool   special) const;

    void print_info() const;

private:
    const llama_vocab & vocab;
};
1702
1703void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1704 struct gguf_context * ctx = ml.meta.get();
1705
1706 // determine vocab type
1707 {
1708 ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
1709 ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
1710
1711 ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, n_token_types, false);
1712
1713 if (tokenizer_model == "no_vocab" || tokenizer_model == "none") {
1714 type = LLAMA_VOCAB_TYPE_NONE;
1715
1716 // default special tokens
1717 special_bos_id = LLAMA_TOKEN_NULL;
1718 special_eos_id = LLAMA_TOKEN_NULL;
1719 special_unk_id = LLAMA_TOKEN_NULL;
1720 special_sep_id = LLAMA_TOKEN_NULL;
1721 special_pad_id = LLAMA_TOKEN_NULL;
1722 special_mask_id = LLAMA_TOKEN_NULL;
1723 linefeed_id = LLAMA_TOKEN_NULL;
1724
1725 // read vocab size from metadata
1726 uint32_t n_tokens = 0;
1727 if (ml.get_key(LLM_KV_VOCAB_SIZE, n_tokens, false)) {
1728 LLAMA_LOG_WARN("%s: adding %u dummy tokens\n", __func__, n_tokens);
1729 id_to_token.resize(n_tokens);
1730 }
1731
1732 return;
1733 }
1734
1735 if (tokenizer_model == "llama") {
1736 type = LLAMA_VOCAB_TYPE_SPM;
1737
1738 // default special tokens
1739 special_bos_id = 1;
1740 special_eos_id = 2;
1741 special_unk_id = 0;
1742 special_sep_id = LLAMA_TOKEN_NULL;
1743 special_pad_id = LLAMA_TOKEN_NULL;
1744 special_mask_id = LLAMA_TOKEN_NULL;
1745 } else if (tokenizer_model == "bert") {
1746 type = LLAMA_VOCAB_TYPE_WPM;
1747
1748 // default special tokens
1749 special_bos_id = 101;
1750 special_eos_id = LLAMA_TOKEN_NULL;
1751 special_unk_id = 100;
1752 special_sep_id = 102;
1753 special_pad_id = 0;
1754 special_mask_id = 103;
1755
1756 add_sep = true;
1757 } else if (tokenizer_model == "gpt2") {
1758 type = LLAMA_VOCAB_TYPE_BPE;
1759
1760 // read bpe merges and populate bpe ranks
1761 const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
1762 // Kimi-K2 uses custom tokenization without traditional BPE merges
1763 const bool is_kimi_k2 = (tokenizer_pre == "kimi-k2");
1764
1765 if (merges_keyidx == -1) {
1766 if (!is_kimi_k2) {
1767 throw std::runtime_error("cannot find tokenizer merges in model file\n");
1768 }
1769 // Kimi-K2 doesn't need merges, skip
1770 LLAMA_LOG_INFO("%s: Kimi-K2 tokenizer detected, skipping BPE merges\n", __func__);
1771 } else {
1772 const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
1773 for (int i = 0; i < n_merges; i++) {
1774 const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
1775 //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
1776
1777 std::string first;
1778 std::string second;
1779
1780 const size_t pos = word.find(' ', 1);
1781
1782 if (pos != std::string::npos) {
1783 first = word.substr(0, pos);
1784 second = word.substr(pos + 1);
1785 }
1786
1787 bpe_ranks.emplace(std::make_pair(first, second), i);
1788 }
1789 }
1790
1791 // default special tokens
1792 special_bos_id = 11;
1793 special_eos_id = 11;
1794 special_unk_id = LLAMA_TOKEN_NULL;
1795 special_sep_id = LLAMA_TOKEN_NULL;
1796 special_pad_id = LLAMA_TOKEN_NULL;
1797 special_mask_id = LLAMA_TOKEN_NULL;
1798 } else if (tokenizer_model == "t5") {
1799 type = LLAMA_VOCAB_TYPE_UGM;
1800
1801 // default special tokens
1802 special_bos_id = LLAMA_TOKEN_NULL;
1803 special_eos_id = 1;
1804 special_unk_id = 2;
1805 special_sep_id = LLAMA_TOKEN_NULL;
1806 special_pad_id = 0;
1807 special_mask_id = LLAMA_TOKEN_NULL;
1808
1809 const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
1810 if (precompiled_charsmap_keyidx != -1) {
1811 const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
1812 GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
1813
1814 const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
1815 const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
1816 precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
1817#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
1818 // correct endiannes of data in precompiled_charsmap binary blob
1819 uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
1820 *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
1821 assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
1822 size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
1823 uint32_t * xcda_array = (uint32_t *) &precompiled_charsmap[sizeof(uint32_t)];
1824 for (size_t i = 0; i < xcda_array_size; ++i) {
1825 xcda_array[i] = __builtin_bswap32(xcda_array[i]);
1826 }
1827#endif
1828 }
1829 } else if (tokenizer_model == "rwkv") {
1830 type = LLAMA_VOCAB_TYPE_RWKV;
1831
1832 // default special tokens
1833 special_bos_id = LLAMA_TOKEN_NULL;
1834 special_eos_id = LLAMA_TOKEN_NULL;
1835 special_unk_id = LLAMA_TOKEN_NULL;
1836 special_sep_id = LLAMA_TOKEN_NULL;
1837 special_pad_id = LLAMA_TOKEN_NULL;
1838 } else if (tokenizer_model == "plamo2") {
1839 type = LLAMA_VOCAB_TYPE_PLAMO2;
1840
1841 // PLaMo-2 default special tokens (these will be overridden by model config)
1842 special_bos_id = 1; // <|plamo:bos|>
1843 special_eos_id = 2; // <|plamo:eos|>
1844 special_unk_id = 0; // <|plamo:unk|>
1845 special_sep_id = LLAMA_TOKEN_NULL;
1846 special_pad_id = 3; // <|plamo:pad|>
1847 special_mask_id = LLAMA_TOKEN_NULL;
1848 } else {
1849 throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
1850 }
1851
1852 // for now, only BPE models have pre-tokenizers
1853 if (type == LLAMA_VOCAB_TYPE_BPE) {
1854 add_space_prefix = false;
1855 clean_spaces = true;
1856 if (tokenizer_pre.empty()) {
1857 LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
1858 LLAMA_LOG_WARN("%s: \n", __func__);
1859 LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
1860 LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
1861 LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
1862 LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
1863 LLAMA_LOG_WARN("%s: \n", __func__);
1864 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1865 } else if (tokenizer_pre == "default") {
1866 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1867 } else if (
1868 tokenizer_pre == "llama3" ||
1869 tokenizer_pre == "llama-v3" ||
1870 tokenizer_pre == "llama-bpe"||
1871 tokenizer_pre == "falcon3" ||
1872 tokenizer_pre == "falcon-h1" ||
1873 tokenizer_pre == "pixtral" ||
1874 tokenizer_pre == "midm-2.0" ||
1875 tokenizer_pre == "lfm2") {
1876 pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
1877 ignore_merges = true;
1878 add_bos = true;
1879 } else if (
1880 tokenizer_pre == "deepseek-llm") {
1881 pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
1882 clean_spaces = false;
1883 } else if (
1884 tokenizer_pre == "deepseek-coder") {
1885 pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
1886 clean_spaces = false;
1887 } else if (
1888 tokenizer_pre == "deepseek-v3") {
1889 pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
1890 clean_spaces = false;
1891 } else if (
1892 tokenizer_pre == "youtu") {
1893 pre_type = LLAMA_VOCAB_PRE_TYPE_YOUTU;
1894 clean_spaces = false;
1895 ignore_merges = true;
1896 } else if (
1897 tokenizer_pre == "falcon") {
1898 pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON;
1899 } else if (
1900 tokenizer_pre == "mpt") {
1901 pre_type = LLAMA_VOCAB_PRE_TYPE_MPT;
1902 } else if (
1903 tokenizer_pre == "starcoder") {
1904 pre_type = LLAMA_VOCAB_PRE_TYPE_STARCODER;
1905 } else if (
1906 tokenizer_pre == "gpt-2" ||
1907 tokenizer_pre == "phi-2" ||
1908 tokenizer_pre == "jina-es" ||
1909 tokenizer_pre == "jina-de" ||
1910 tokenizer_pre == "gigachat" ||
1911 tokenizer_pre == "jina-v2-es" ||
1912 tokenizer_pre == "jina-v2-de" ||
1913 tokenizer_pre == "a.x-4.0" ||
1914 tokenizer_pre == "mellum" ||
1915 tokenizer_pre == "modern-bert" ) {
1916 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1917 } else if (
1918 tokenizer_pre == "jina-v1-en" ||
1919 tokenizer_pre == "jina-v2-code" ||
1920 tokenizer_pre == "roberta-bpe") {
1921 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1922 add_sep = true;
1923 } else if (
1924 tokenizer_pre == "refact") {
1925 pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
1926 } else if (
1927 tokenizer_pre == "command-r") {
1928 pre_type = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
1929 clean_spaces = false;
1930 } else if (
1931 tokenizer_pre == "qwen2" ||
1932 tokenizer_pre == "deepseek-r1-qwen" ||
1933 tokenizer_pre == "kormo") {
1934 pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
1935 clean_spaces = false;
1936 } else if (
1937 tokenizer_pre == "qwen35") {
1938 pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN35;
1939 clean_spaces = false;
1940 } else if (
1941 tokenizer_pre == "stablelm2") {
1942 pre_type = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
1943 } else if (
1944 tokenizer_pre == "olmo") {
1945 pre_type = LLAMA_VOCAB_PRE_TYPE_OLMO;
1946 } else if (
1947 tokenizer_pre == "dbrx") {
1948 pre_type = LLAMA_VOCAB_PRE_TYPE_DBRX;
1949 } else if (
1950 tokenizer_pre == "smaug-bpe") {
1951 pre_type = LLAMA_VOCAB_PRE_TYPE_SMAUG;
1952 } else if (
1953 tokenizer_pre == "poro-chat") {
1954 pre_type = LLAMA_VOCAB_PRE_TYPE_PORO;
1955 clean_spaces = false;
1956 } else if (
1957 tokenizer_pre == "glm4" ||
1958 tokenizer_pre == "chatglm-bpe") {
1959 pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
1960 special_bos_id = LLAMA_TOKEN_NULL;
1961 } else if (
1962 tokenizer_pre == "viking") {
1963 pre_type = LLAMA_VOCAB_PRE_TYPE_VIKING;
1964 clean_spaces = false;
1965 } else if (
1966 tokenizer_pre == "jais") {
1967 pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS;
1968 } else if (
1969 tokenizer_pre == "tekken") {
1970 pre_type = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
1971 clean_spaces = false;
1972 ignore_merges = true;
1973 add_bos = true;
1974 } else if (
1975 tokenizer_pre == "smollm") {
1976 pre_type = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
1977 clean_spaces = false;
1978 } else if (
1979 tokenizer_pre == "codeshell") {
1980 pre_type = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
1981 } else if (
1982 tokenizer_pre == "bloom") {
1983 pre_type = LLAMA_VOCAB_PRE_TYPE_BLOOM;
1984 } else if (
1985 tokenizer_pre == "gpt3-finnish") {
1986 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
1987 } else if (
1988 tokenizer_pre == "exaone") {
1989 pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE;
1990 } else if (
1991 tokenizer_pre == "exaone4") {
1992 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1993 } else if (
1994 tokenizer_pre == "exaone-moe") {
1995 pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE_MOE;
1996 } else if (
1997 tokenizer_pre == "chameleon") {
1998 pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
1999 add_bos = true;
2000 clean_spaces = false;
2001 } else if (
2002 tokenizer_pre == "minerva-7b") {
2003 pre_type = LLAMA_VOCAB_PRE_TYPE_MINERVA;
2004 } else if (
2005 tokenizer_pre == "megrez") {
2006 pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
2007 } else if (
2008 tokenizer_pre == "gpt-4o" ||
2009 tokenizer_pre == "llama4") {
2010 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
2011 clean_spaces = false;
2012 } else if (
2013 tokenizer_pre == "superbpe") {
2014 pre_type = LLAMA_VOCAB_PRE_TYPE_SUPERBPE;
2015 clean_spaces = false;
2016 } else if (
2017 tokenizer_pre == "trillion") {
2018 pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
2019 clean_spaces = false;
2020 } else if (
2021 tokenizer_pre == "granite-docling") {
2022 pre_type = LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING;
2023 clean_spaces = false;
2024 } else if (
2025 tokenizer_pre == "bailingmoe" ||
2026 tokenizer_pre == "bailingmoe2" ||
2027 tokenizer_pre == "llada-moe") {
2028 pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
2029 clean_spaces = false;
2030 } else if (
2031 tokenizer_pre == "seed-coder") {
2032 pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
2033 clean_spaces = false;
2034 } else if (
2035 tokenizer_pre == "hunyuan") {
2036 pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
2037 clean_spaces = false;
2038 } else if (
2039 tokenizer_pre == "hunyuan-dense") {
2040 pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE;
2041 clean_spaces = false;
2042 } else if (
2043 tokenizer_pre == "kimi-k2") {
2044 pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
2045 clean_spaces = false;
2046 } else if (
2047 tokenizer_pre == "grok-2") {
2048 pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2;
2049 clean_spaces = false;
2050 } else if (
2051 tokenizer_pre == "afmoe") {
2052 pre_type = LLAMA_VOCAB_PRE_TYPE_AFMOE;
2053 clean_spaces = false;
2054 } else if (
2055 tokenizer_pre == "minimax-m2") {
2056 pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2;
2057 clean_spaces = false;
2058 } else if (
2059 tokenizer_pre == "solar-open") {
2060 pre_type = LLAMA_VOCAB_PRE_TYPE_SOLAR_OPEN;
2061 clean_spaces = false;
2062 } else {
2063 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
2064 }
2065 } else if (type == LLAMA_VOCAB_TYPE_SPM) {
2066 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2067 add_space_prefix = true;
2068 clean_spaces = false;
2069 add_bos = true;
2070 add_eos = false;
2071 } else if (type == LLAMA_VOCAB_TYPE_WPM) {
2072 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2073 add_space_prefix = false;
2074 clean_spaces = true;
2075 add_bos = true;
2076 add_eos = false;
2077 add_sep = true;
2078 } else if (type == LLAMA_VOCAB_TYPE_UGM) {
2079 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2080 add_bos = false;
2081 add_eos = true;
2082 } else if (type == LLAMA_VOCAB_TYPE_RWKV) {
2083 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2084 add_space_prefix = false;
2085 clean_spaces = false;
2086 add_bos = false;
2087 add_eos = false;
2088 } else {
2089 pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2090 }
2091
2092 ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, add_space_prefix, false);
2093 ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
2094 }
2095
2096 const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
2097 if (token_idx == -1) {
2098 throw std::runtime_error("cannot find tokenizer vocab in model file\n");
2099 }
2100
2101 const float * scores = nullptr;
2102 const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
2103 if (score_idx != -1) {
2104 scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
2105 }
2106
2107 const int * toktypes = nullptr;
2108 const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
2109 if (toktype_idx != -1) {
2110 toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
2111 }
2112
2113 uint32_t n_tokens = gguf_get_arr_n(ctx, token_idx);
2114 id_to_token.resize(n_tokens);
2115
2116 for (uint32_t i = 0; i < n_tokens; i++) {
2117 std::string word = gguf_get_arr_str(ctx, token_idx, i);
2118 if (word.empty()) {
2119 LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i);
2120 word = "[EMPTY_" + std::to_string(i) + "]";
2121 }
2122
2123 token_to_id[word] = i;
2124 max_token_len = std::max(max_token_len, (int) word.size());
2125
2126 auto & token_data = id_to_token[i];
2127 token_data.text = std::move(word);
2128 token_data.score = scores ? scores[i] : 0.0f;
2129 token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
2130
2131 if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
2132 switch(toktypes[i]) {
2133 case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
2134 case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
2135 case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
2136 case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
2137 case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
2138 case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
2139 case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
2140 default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
2141 }
2142 }
2143 }
2144 GGML_ASSERT(id_to_token.size() == token_to_id.size());
2145
2146 init_tokenizer(type);
2147
2148 // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
2149 if (type == LLAMA_VOCAB_TYPE_SPM) {
2150 try {
2151 linefeed_id = vocab.byte_to_token('\n');
2152 } catch (const std::exception & e) {
2153 LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
2154 linefeed_id = special_pad_id;
2155 }
2156 } else if (type == LLAMA_VOCAB_TYPE_WPM) {
2157 linefeed_id = special_pad_id;
2158 } else if (type == LLAMA_VOCAB_TYPE_RWKV) {
2159 const std::vector<int> ids = tokenize("\n", false);
2160 GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
2161 linefeed_id = ids[0];
2162 } else {
2163 const std::vector<int> ids = tokenize("\n", false);
2164
2165 //GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
2166 if (ids.empty()) {
2167 LLAMA_LOG_WARN("%s: model vocab missing newline token, using special_pad_id instead\n", __func__);
2168 linefeed_id = special_pad_id;
2169 } else {
2170 linefeed_id = ids[0];
2171 }
2172 }
2173
2174 // special tokens
2175 {
2176 const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
2177 { LLM_KV_TOKENIZER_BOS_ID, special_bos_id },
2178 { LLM_KV_TOKENIZER_EOS_ID, special_eos_id },
2179 { LLM_KV_TOKENIZER_EOT_ID, special_eot_id },
2180 { LLM_KV_TOKENIZER_EOM_ID, special_eom_id },
2181 { LLM_KV_TOKENIZER_UNK_ID, special_unk_id },
2182 { LLM_KV_TOKENIZER_SEP_ID, special_sep_id },
2183 { LLM_KV_TOKENIZER_PAD_ID, special_pad_id },
2184 { LLM_KV_TOKENIZER_MASK_ID, special_mask_id },
2185 { LLM_KV_TOKENIZER_FIM_PRE_ID, special_fim_pre_id },
2186 { LLM_KV_TOKENIZER_FIM_SUF_ID, special_fim_suf_id },
2187 { LLM_KV_TOKENIZER_FIM_MID_ID, special_fim_mid_id },
2188 { LLM_KV_TOKENIZER_FIM_PAD_ID, special_fim_pad_id },
2189 { LLM_KV_TOKENIZER_FIM_REP_ID, special_fim_rep_id },
2190 { LLM_KV_TOKENIZER_FIM_SEP_ID, special_fim_sep_id },
2191
2192 // deprecated
2193 { LLM_KV_TOKENIZER_PREFIX_ID, special_fim_pre_id },
2194 { LLM_KV_TOKENIZER_SUFFIX_ID, special_fim_suf_id },
2195 { LLM_KV_TOKENIZER_MIDDLE_ID, special_fim_mid_id },
2196 };
2197
2198 for (const auto & it : special_token_types) {
2199 const std::string & key = kv(std::get<0>(it));
2200 int32_t & id = std::get<1>(it);
2201
2202 uint32_t new_id;
2203 if (!ml.get_key(std::get<0>(it), new_id, false)) {
2204 continue;
2205 }
2206 if (new_id >= id_to_token.size()) {
2207 LLAMA_LOG_WARN("%s: bad special token: '%s' = %u, using default id %d\n",
2208 __func__, key.c_str(), new_id, id);
2209 } else {
2210 id = new_id;
2211 }
2212 }
2213
2214 // Handle add_bos, add_eos and add_sep
2215 {
2216 bool temp = true;
2217
2218 if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
2219 add_bos = temp;
2220 }
2221 if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
2222 add_eos = temp;
2223 }
2224 if (ml.get_key(LLM_KV_TOKENIZER_ADD_SEP, temp, false)) {
2225 add_sep = temp;
2226 }
2227 }
2228
2229 // auto-detect special tokens by text
2230 // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
2231 // for now, we apply this workaround to find the tokens based on their text
2232
2233 for (const auto & t : token_to_id) {
2234 auto & attr = id_to_token[t.second].attr;
2235
2236 // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
2237 if (special_eot_id == LLAMA_TOKEN_NULL) {
2238 if (false
2239 || t.first == "<|eot_id|>"
2240 || t.first == "<|im_end|>"
2241 || t.first == "<|end|>"
2242 || t.first == "<end_of_turn>"
2243 || t.first == "<|endoftext|>"
2244 || t.first == "<|end_of_text|>" // granite
2245 || t.first == "<EOT>"
2246 || t.first == "_<EOT>"
2247 || t.first == "[EOT]" // Kimi-K2
2248 || t.first == "<|end▁of▁sentence|>" // DeepSeek
2249 || t.first == "<end_of_utterance>" // smoldocling
2250 ) {
2251 special_eot_id = t.second;
2252 if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2253 LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2254 __func__, t.second, t.first.c_str());
2255 attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
2256 }
2257 }
2258 }
2259
2260 // find EOM token: "<|eom_id|>"
2261 if (special_eom_id == LLAMA_TOKEN_NULL) {
2262 if (false
2263 || t.first == "<|eom_id|>"
2264 ) {
2265 special_eom_id = t.second;
2266 if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2267 LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2268 __func__, t.second, t.first.c_str());
2269 attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
2270 }
2271 }
2272 }
2273
2274 // find FIM_PRE token: "<|fim_prefix|>", "<fim-prefix>", "<PRE>", etc.
2275 if (special_fim_pre_id == LLAMA_TOKEN_NULL) {
2276 if (false
2277 || t.first == "<|fim_prefix|>" // Qwen
2278 || t.first == "<fim-prefix>"
2279 || t.first == "<fim_prefix>" // Granite
2280 || t.first == "<|fim▁begin|>" // DeepSeek
2281 || t.first == "<PRE>"
2282 || t.first == "▁<PRE>" // CodeLlama
2283 || t.first == "<|code_prefix|>" // GLM-4.5
2284 || t.first == "<|prefix|>" // Falcon-H1-Tiny-Coder
2285 ) {
2286 special_fim_pre_id = t.second;
2287 if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2288 LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2289 __func__, t.second, t.first.c_str());
2290 attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
2291 }
2292 }
2293 }
2294
2295 // find FIM_SUF token: "<|fim_suffix|>", "<fim-suffix>", "<SUF>", etc.
2296 if (special_fim_suf_id == LLAMA_TOKEN_NULL) {
2297 if (false
2298 || t.first == "<|fim_suffix|>" // Qwen
2299 || t.first == "<fim-suffix>"
2300 || t.first == "<fim_suffix>" // Granite
2301 || t.first == "<|fim▁hole|>" // DeepSeek
2302 || t.first == "<SUF>"
2303 || t.first == "▁<SUF>" // CodeLlama
2304 || t.first == "<|code_suffix|>" // GLM-4.5
2305 || t.first == "<|suffix|>" // Falcon-H1-Tiny-Coder
2306 ) {
2307 special_fim_suf_id = t.second;
2308 if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2309 LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2310 __func__, t.second, t.first.c_str());
2311 attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
2312 }
2313 }
2314 }
2315
2316 // find FIM_MID token: "<|fim_middle|>", "<fim-middle>", "<MID>", etc.
2317 if (special_fim_mid_id == LLAMA_TOKEN_NULL) {
2318 if (false
2319 || t.first == "<|fim_middle|>" // Qwen
2320 || t.first == "<fim-middle>"
2321 || t.first == "<fim_middle>" // Granite
2322 || t.first == "<|fim▁end|>" // DeepSeek
2323 || t.first == "<MID>"
2324 || t.first == "▁<MID>" // CodeLlama
2325 || t.first == "<|code_middle|>" // GLM-4.5
2326 || t.first == "<|middle|>" // Falcon-H1-Tiny-Coder
2327 ) {
2328 special_fim_mid_id = t.second;
2329 if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2330 LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2331 __func__, t.second, t.first.c_str());
2332 attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
2333 }
2334 }
2335 }
2336
2337 // find FIM_PAD token: "<|fim_pad|>", "<fim-pad>", "<PAD>", etc.
2338 if (special_fim_pad_id == LLAMA_TOKEN_NULL) {
2339 if (false
2340 || t.first == "<|fim_pad|>" // Qwen
2341 || t.first == "<fim-pad>"
2342 || t.first == "<fim_pad>" // Granite
2343 || t.first == "<PAD>"
2344 || t.first == "[PAD]" // Kimi-K2
2345 ) {
2346 special_fim_pad_id = t.second;
2347 if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2348 LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2349 __func__, t.second, t.first.c_str());
2350 attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
2351 }
2352 }
2353 }
2354
2355 // find FIM_REP token: "<|fim_repo|>", "<fim-repo>", "<REP>", etc.
2356 if (special_fim_rep_id == LLAMA_TOKEN_NULL) {
2357 if (false
2358 || t.first == "<|fim_repo|>" // Qwen
2359 || t.first == "<|repo_name|>"
2360 || t.first == "<fim-repo>"
2361 || t.first == "<REPO>"
2362 || t.first == "<reponame>" // Granite
2363 ) {
2364 special_fim_rep_id = t.second;
2365 if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2366 LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2367 __func__, t.second, t.first.c_str());
2368 attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
2369 }
2370 }
2371 }
2372
2373 // find FIM_SEP token: "<|file_sep|>"
2374 if (special_fim_sep_id == LLAMA_TOKEN_NULL) {
2375 if (false
2376 || t.first == "<|file_sep|>" // Qwen
2377 ) {
2378 special_fim_sep_id = t.second;
2379 if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2380 LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2381 __func__, t.second, t.first.c_str());
2382 attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
2383 }
2384 }
2385 }
2386 }
2387
2388 // auto-detect unused tokens: e.g. control tokens with the word "unused"
2389 // ideally, these tokens should be marked as unused during conversion
2390 {
2391 uint32_t n_unused = 0;
2392
2393 for (const auto & t : token_to_id) {
2394 auto & attr = id_to_token[t.second].attr;
2395
2396 if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2397 continue;
2398 }
2399
2400 if ((attr & LLAMA_TOKEN_ATTR_UNUSED) == 0) {
2401 if (strstr(t.first.c_str(), "unused") != NULL) {
2402 attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_UNUSED);
2403 }
2404 }
2405
2406 if (attr & LLAMA_TOKEN_ATTR_UNUSED) {
2407 n_unused++;
2408 }
2409 }
2410
2411 LLAMA_LOG_INFO("%s: %u unused tokens\n", __func__, n_unused);
2412 }
2413
2414 // maintain a list of tokens that cause end-of-generation
2415 // this is currently determined based on the token text, which is obviously not ideal
2416 // ref: https://github.com/ggml-org/llama.cpp/issues/9606
2417 special_eog_ids.clear();
2418
2419 if (special_fim_pad_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_pad_id) == 0) {
2420 special_eog_ids.insert(special_fim_pad_id);
2421 }
2422
2423 if (special_fim_rep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_rep_id) == 0) {
2424 special_eog_ids.insert(special_fim_rep_id);
2425 }
2426
2427 if (special_fim_sep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_sep_id) == 0) {
2428 special_eog_ids.insert(special_fim_sep_id);
2429 }
2430
2431 for (const auto & t : token_to_id) {
2432 auto & attr = id_to_token[t.second].attr;
2433
2434 if (false
2435 || t.first == "<|eot_id|>"
2436 || t.first == "<|im_end|>"
2437 || t.first == "<|end|>"
2438 || t.first == "<|return|>" // o200k_harmony
2439 || t.first == "<|call|>" // o200k_harmony
2440 || t.first == "<|flush|>" // solar-open
2441 || t.first == "<|calls|>" // solar-open
2442 || t.first == "<end_of_turn>"
2443 || t.first == "<|endoftext|>"
2444 || t.first == "<|eom_id|>"
2445 || t.first == "<EOT>"
2446 || t.first == "_<EOT>"
2447 || t.first == "[EOT]" // Kimi-K2
2448 || t.first == "[EOS]" // Kimi-K2
2449 || t.first == "<|end_of_text|>"
2450 || t.first == "<end_of_utterance>" // smoldocling
2451 ) {
2452 special_eog_ids.insert(t.second);
2453 if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2454 LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2455 __func__, t.second, t.first.c_str());
2456 attr = (llama_token_attr) (attr | LLAMA_TOKEN_ATTR_CONTROL);
2457 }
2458 } else {
2459 if (attr & LLAMA_TOKEN_ATTR_CONTROL && !(attr & LLAMA_TOKEN_ATTR_UNUSED)) {
2460 // token is control, but not marked as EOG -> print a debug log
2461 if (special_eog_ids.count(t.second) == 0) {
2462 LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
2463 __func__, t.second, t.first.c_str());
2464 }
2465 }
2466 }
2467 }
2468
2469 // @ngxson : quick hack for gpt-oss, always render these tokens
2470 for (const auto & t : token_to_id) {
2471 auto & attr = id_to_token[t.second].attr;
2472
2473 if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
2474 LLAMA_LOG_WARN("%s: setting token '%s' (%d) attribute to USER_DEFINED (%u), old attributes: %u\n",
2475 __func__, t.first.c_str(), t.second, LLAMA_TOKEN_ATTR_USER_DEFINED, attr);
2476
2477 attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
2478 }
2479 }
2480
2481 // sanity checks
2482 if (special_eos_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eos_id) == 0) {
2483 special_eog_ids.insert(special_eos_id);
2484 LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
2485 }
2486
2487 if (special_eot_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eot_id) == 0) {
2488 special_eog_ids.insert(special_eot_id);
2489 LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
2490 }
2491
2492 if (special_eom_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eom_id) == 0) {
2493 special_eog_ids.insert(special_eom_id);
2494 LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
2495 }
2496
2497 // TODO: workaround for o200k_harmony and solar-open tokenizer: the "<|end|>" token should not be EOG
2498 // we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens ("<|calls|>" and "<|flush|>" for solar-open),
2499 // we remove the "<|end|>" token from the EOG list
2500 {
2501 bool has_return = false;
2502 bool has_call = false;
2503 bool has_end = false;
2504 bool has_flush = false;
2505
2506 llama_token end_id = LLAMA_TOKEN_NULL;
2507
2508 LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__);
2509 for (auto tid : special_eog_ids) {
2510 auto & text = id_to_token[tid].text;
2511
2512 LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, text.c_str());
2513
2514 if (text == "<|return|>") {
2515 has_return = true;
2516 } else if (text == "<|call|>" || text == "<|calls|>") {
2517 has_call = true;
2518 } else if (text == "<|flush|>") {
2519 has_flush = true;
2520 } else if (text == "<|end|>") {
2521 has_end = true;
2522 end_id = tid;
2523 }
2524 }
2525
2526 if ((has_return && has_call && has_end) || (has_call && has_flush && has_end)) {
2527 special_eog_ids.erase(end_id);
2528
2529 auto & attr = id_to_token[end_id].attr;
2530 attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
2531
2532 LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>', or '<|calls|>' and '<|flush|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
2533 }
2534 }
2535 }
2536
2537 // build special tokens cache
2538 {
2539 for (llama_token id = 0; id < (llama_token) n_tokens; ++id) {
2540 if (id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) {
2541 cache_special_tokens.push_back(id);
2542 }
2543 }
2544
2545 std::sort(cache_special_tokens.begin(), cache_special_tokens.end(),
2546 [&] (const llama_token a, const llama_token b) {
2547 return id_to_token[a].text.size() > id_to_token[b].text.size();
2548 }
2549 );
2550
2551 LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t) cache_special_tokens.size());
2552 }
2553
2554 // build token to piece cache
2555 {
2556 size_t size_cache = 0;
2557
2558 std::vector<std::string> cache(n_tokens);
2559
2560 for (uint32_t id = 0; id < n_tokens; ++id) {
2561 cache[id] = token_to_piece_for_cache(id, true);
2562
2563 size_cache += cache[id].size();
2564 }
2565
2566 std::swap(cache_token_to_piece, cache);
2567
2568 LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
2569 }
2570
2571 // Handle per token attributes
2572 //NOTE: Each model customizes per token attributes.
2573 //NOTE: Per token attributes are missing from the GGUF file.
2574 //TODO: Extract attributes from GGUF file.
2575 {
2576 auto _contains_any = [] (const std::string & str, const std::vector<std::string_view> & substrs) -> bool {
2577 for (const auto & substr : substrs) {
2578 if (str.find(substr) != std::string::npos) {
2579 return true;
2580 }
2581 }
2582 return false;
2583 };
2584
2585 auto _set_tokenid_attr = [&] (const llama_token id, llama_token_attr attr, bool value) {
2586 uint32_t current = id_to_token.at(id).attr;
2587 current = value ? (current | attr) : (current & ~attr);
2588 id_to_token[id].attr = (llama_token_attr) current;
2589 };
2590
2591 auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
2592 _set_tokenid_attr(token_to_id.at(token), attr, value);
2593 };
2594
2595 std::string model_name;
2596 std::string tokenizer_pre;
2597 std::string general_arch;
2598
2599 ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
2600 ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
2601 ml.get_key(LLM_KV_GENERAL_ARCHITECTURE, general_arch, false);
2602
2603 // model name to lowercase
2604 std::transform(model_name.begin(), model_name.end(), model_name.begin(),
2605 [] (const std::string::value_type x) {
2606 return std::tolower(x);
2607 }
2608 );
2609
2610 // set attributes by model/tokenizer/architecture name
2611 if (false
2612 || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
2613 || _contains_any(general_arch, {"nomic-bert-moe", "jina-bert-v3"})
2614 ) {
2615 if (token_to_id.count("<mask>") == 0) {
2616 LLAMA_LOG_WARN("%s: Mask token is missing in vocab, please reconvert model!\n", __func__);
2617 } else {
2618 _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
2619 }
2620 } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
2621 for (auto id : cache_special_tokens) {
2622 _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
2623 }
2624 for (const auto * token : {"</s>"}) {
2625 _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
2626 }
2627 for (const auto * token : {"<unk>", "<s>", "<|endoftext|>"}) {
2628 _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
2629 }
2630 } else if (_contains_any(model_name, {"modern-bert"})) {
2631 if (token_to_id.count("[MASK]") == 0 ) {
2632 LLAMA_LOG_WARN("%s: Mask token missing in vocab!\n", __func__);
2633 }
2634 else {
2635 _set_token_attr("[MASK]", LLAMA_TOKEN_ATTR_LSTRIP, true);
2636 }
2637 }
2638 }
2639}
2640
// Returns the vocabulary type (SPM, BPE, WPM, UGM, RWKV, PLaMo2, ...) that was
// determined while loading the tokenizer metadata from the GGUF file.
enum llama_vocab_type llama_vocab::impl::get_type() const {
    return type;
}
2644
2645std::string llama_vocab::impl::type_name() const{
2646 switch (type) {
2647 case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
2648 case LLAMA_VOCAB_TYPE_SPM: return "SPM";
2649 case LLAMA_VOCAB_TYPE_BPE: return "BPE";
2650 case LLAMA_VOCAB_TYPE_WPM: return "WPM";
2651 case LLAMA_VOCAB_TYPE_UGM: return "UGM";
2652 case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
2653 case LLAMA_VOCAB_TYPE_PLAMO2: return "PLaMo2";
2654 default: return "unknown";
2655 }
2656}
2657
2658bool llama_vocab::impl::is_normal(llama_token id) const {
2659 GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2660 return id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
2661}
2662
2663bool llama_vocab::impl::is_unknown(llama_token id) const {
2664 GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2665 return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
2666}
2667
2668bool llama_vocab::impl::is_control(llama_token id) const {
2669 GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2670 return id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
2671}
2672
2673bool llama_vocab::impl::is_byte(llama_token id) const {
2674 GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2675 return id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
2676}
2677
2678bool llama_vocab::impl::is_user_defined(llama_token id) const {
2679 GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2680 return id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
2681}
2682
2683bool llama_vocab::impl::is_unused(llama_token id) const {
2684 GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2685 return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNUSED;
2686}
2687
2688bool llama_vocab::impl::is_eog(llama_token id) const {
2689 return id != LLAMA_TOKEN_NULL && special_eog_ids.count(id) > 0;
2690}
2691
2692uint8_t llama_vocab::impl::token_to_byte(llama_token id) const {
2693 GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE);
2694 GGML_ASSERT(is_byte(id));
2695 const auto & token_data = id_to_token.at(id);
2696 switch (get_type()) {
2697 case LLAMA_VOCAB_TYPE_SPM:
2698 case LLAMA_VOCAB_TYPE_UGM: {
2699 auto buf = token_data.text.substr(3, 2);
2700 return strtol(buf.c_str(), NULL, 16);
2701 }
2702 case LLAMA_VOCAB_TYPE_BPE: {
2703 GGML_ABORT("fatal error");
2704 }
2705 case LLAMA_VOCAB_TYPE_WPM: {
2706 GGML_ABORT("fatal error");
2707 }
2708 default:
2709 GGML_ABORT("fatal error");
2710 }
2711}
2712
// Returns the attribute bitmask (control/byte/user-defined/...) of the given
// token; out-of-range ids throw via std::vector::at.
llama_token_attr llama_vocab::impl::token_get_attr(llama_token id) const {
    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
    return id_to_token.at(id).attr;
}
2717
// Instantiate the tokenizer implementation matching the vocab type and store
// it in `tokenizer`. Aborts on unsupported/none vocab types. Must be called
// before tokenize()/detokenize().
void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) {
    LLAMA_LOG_DEBUG("%s: initializing tokenizer for type %d\n", __func__, type);

    switch (type) {
        case LLAMA_VOCAB_TYPE_SPM:
            tokenizer = std::make_unique<llm_tokenizer_spm>(vocab);
            break;
        case LLAMA_VOCAB_TYPE_BPE:
            tokenizer = std::make_unique<llm_tokenizer_bpe>(vocab);
            break;
        case LLAMA_VOCAB_TYPE_WPM:
            tokenizer = std::make_unique<llm_tokenizer_wpm>(vocab);
            break;
        case LLAMA_VOCAB_TYPE_UGM:
            // UGM additionally needs the precompiled charsmap (normalization data)
            tokenizer = std::make_unique<llm_tokenizer_ugm>(vocab, precompiled_charsmap);
            break;
        case LLAMA_VOCAB_TYPE_RWKV:
            tokenizer = std::make_unique<llm_tokenizer_rwkv>(vocab);
            break;
        case LLAMA_VOCAB_TYPE_PLAMO2:
            tokenizer = std::make_unique<llm_tokenizer_plamo2>(vocab);
            break;
        default:
            GGML_ABORT("unsupported vocab type");
    }
}
2744
2745//
2746// (de-) tokenize
2747//
2748
2749// #define PRETOKENIZERDEBUG
2750
// Partition the fragment buffer around special tokens.
//
// For every cached special token, each RAW_TEXT fragment is scanned for
// occurrences of the token's text and split in place: optional left text,
// a TOKEN fragment for the match, optional right text. The right remainder
// is then re-scanned for further matches of the same special token.
// When parse_special == false, CONTROL/UNKNOWN special tokens are skipped.
void llama_vocab::impl::tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const {
    // for each special token
    for (const llama_token special_id : cache_special_tokens) {
        const auto & data = vocab.get_token_data(special_id);
        const auto & text = data.text;

        if (!parse_special && (data.attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_UNKNOWN))) {
            // Ignore control and unknown tokens when parse_special == false
            continue;
            // User-defined tokens are still pre-tokenized before everything else
            // ref: https://github.com/huggingface/tokenizers/blob/fdd26ba9a3f0c133427aab0423888cbde91362d7/tokenizers/src/tokenizer/mod.rs#L726
            // This is mostly relevant for neox-style tokenizers (mpt, olmo, stablelm, etc.)
        }

        // for each text fragment
        std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
        while (it != buffer.end()) {
            auto & fragment = (*it);

            // if a fragment is text ( not yet processed )
            if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                const auto & raw_text = fragment.raw_text;

                auto raw_text_base_offset = fragment.offset;
                auto raw_text_base_length = fragment.length;

                // loop over the text
                while (true) {
                    // find the first occurrence of a given special token in this fragment
                    // passing offset argument only limit the "search area" but match coordinates
                    // are still relative to the source full raw_text
                    // string_view begins at pos 0 for the same reason
                    auto match = std::string_view(raw_text.data(), raw_text_base_offset + raw_text_base_length).find(text, raw_text_base_offset);

                    // no occurrences found, stop processing this fragment for a given special token
                    if (match == std::string::npos) break;

#ifdef PRETOKENIZERDEBUG
                    LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
#endif
                    // index of the fragment being split — kept as an index because
                    // forward_list can only erase through the predecessor iterator
                    auto source = std::distance(buffer.begin(), it);

                    // if match is further than base offset
                    // then we have some text to the left of it
                    if (match > raw_text_base_offset) {
                        // left
                        const int64_t left_reminder_offset = raw_text_base_offset + 0;
                        int64_t left_reminder_length = match - raw_text_base_offset;

                        // LSTRIP: whitespace preceding the special token is absorbed by it
                        if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
                            while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
                                left_reminder_length--;
                            }
                        }

                        if (left_reminder_length > 0) {
                            buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
                            it++;
                        }

#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
#endif
                    }

                    // special token
                    buffer.emplace_after(it, special_id);
                    it++;

                    // right
                    if (match + text.length() < raw_text_base_offset + raw_text_base_length) {
                        int64_t right_reminder_offset = match + text.length();
                        int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + text.length());

                        // RSTRIP: whitespace following the special token is absorbed by it
                        if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
                            while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
                                right_reminder_offset++;
                                right_reminder_length--;
                            }
                        }

                        if (right_reminder_length > 0) {
                            buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
                            it++;
                        }

#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
#endif

                        // drop the original (now fully split) fragment
                        if (source == 0) {
                            buffer.erase_after(buffer.before_begin());
                        } else {
                            buffer.erase_after(std::next(buffer.begin(), (source - 1)));
                        }

                        // repeat for the right side
                        raw_text_base_offset = right_reminder_offset;
                        raw_text_base_length = right_reminder_length;

#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
#endif
                    } else {
                        // no right remainder — drop the original fragment and move on
                        if (source == 0) {
                            buffer.erase_after(buffer.before_begin());
                        } else {
                            buffer.erase_after(std::next(buffer.begin(), (source - 1)));
                        }
                        break;
                    }
                }
            }
            it++;
        }
    }
}
2868
2869// NOTE: avoid ever using this except for building the token_to_piece caches
2870std::string llama_vocab::impl::token_to_piece_for_cache(llama_token token, bool special) const {
2871 std::string piece;
2872 piece.resize(piece.capacity()); // using string internal cache
2873 const int n_chars = vocab.token_to_piece(token, &piece[0], piece.size(), 0, special);
2874 if (n_chars < 0) {
2875 piece.resize(-n_chars);
2876 int check = vocab.token_to_piece(token, &piece[0], piece.size(), 0, special);
2877 GGML_ASSERT(check == -n_chars);
2878 }
2879 else {
2880 piece.resize(n_chars);
2881 }
2882
2883 return piece;
2884}
2885
// SPM marks spaces with U+2581 LOWER ONE EIGHTH BLOCK ("▁", UTF-8 e2 96 81)
static void llama_escape_whitespace(std::string & text) {
    replace_all(text, " ", "\xe2\x96\x81");
}

// inverse of llama_escape_whitespace
static void llama_unescape_whitespace(std::string & word) {
    replace_all(word, "\xe2\x96\x81", " ");
}
2893
// Map a BPE token's text back to raw bytes: each unicode codepoint is looked
// up in the utf8->byte table; codepoints with no byte mapping are rendered as
// a "[UNK_BYTE_0x<hex>...]" marker instead.
// NOTE(review): the marker appends the *entire* input `text` after the hex
// digits (not just the offending codepoint) — presumably intentional debug
// output; confirm before changing.
static std::string llama_decode_text(const std::string & text) {
    std::string decoded_text;

    const auto cpts = unicode_cpts_from_utf8(text);
    for (const auto cpt : cpts) {
        const auto utf8 = unicode_cpt_to_utf8(cpt);
        try {
            decoded_text += unicode_utf8_to_byte(utf8);
        } catch (const std::out_of_range & /*e*/) {
            decoded_text += "[UNK_BYTE_0x";
            for (const auto c : utf8) {
                decoded_text += format("%02x", (uint8_t) c);
            }
            decoded_text += text + "]";
        }
    }

    return decoded_text;
}
2913
// Tokenize `raw_text` according to the vocab type.
//
// The text is first partitioned around special tokens (tokenizer_st_partition),
// then every remaining RAW_TEXT fragment is fed to a per-call tokenizer
// session while TOKEN fragments are emitted directly. BOS/EOS/SEP framing is
// added when `add_special` is set and the model enables it.
std::vector<llama_token> llama_vocab::impl::tokenize(
        const std::string & raw_text,
        bool add_special,
        bool parse_special) const {
    GGML_ASSERT(tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");

    std::vector<llama_token> output;
    std::forward_list<fragment_buffer_variant> fragment_buffer;

    if (!raw_text.empty()) {
        fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
        tokenizer_st_partition(fragment_buffer, parse_special);
    }

    switch (get_type()) {
        case LLAMA_VOCAB_TYPE_SPM:
            {
                // OG tokenizer behavior:
                //
                // tokenizer.encode('', add_special_tokens=True) returns [1]
                // tokenizer.encode('', add_special_tokens=False) returns []

                bool is_prev_special = true; // prefix with space if first token

                if (add_special && add_bos) {
                    GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
                    output.push_back(special_bos_id);
                    is_prev_special = true;
                }

                for (const auto & fragment : fragment_buffer) {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                        std::string text;

                        // prefix with space if previous is special
                        if (add_space_prefix && is_prev_special) {
                            text = ' ';
                        }

                        text += fragment.raw_text.substr(fragment.offset, fragment.length);

#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif
                        // SPM encodes spaces as U+2581 before tokenizing
                        llama_escape_whitespace(text);
                        llm_tokenizer_spm_session session(vocab);
                        session.tokenize(text, output);
                        is_prev_special = false;
                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                        output.push_back(fragment.token);
                        is_prev_special = true;
                    }
                }

                // warn when the prompt itself already began with BOS
                if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) {
                    LLAMA_LOG_WARN(
                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
                        "Are you sure this is what you want?\n", __FUNCTION__);
                }

                if (add_special && add_eos) {
                    GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL);
                    output.push_back(special_eos_id);
                }
            } break;
        case LLAMA_VOCAB_TYPE_BPE:
            {
                llm_tokenizer_bpe_session session(vocab, *static_cast<const llm_tokenizer_bpe *>(tokenizer.get()));
                // it calls some other methods that are not exist in llm_tokenizer,
                // here just cast it to bpe tokenizer object
                if (add_special) {
                    session.append_bos(output);
                }
                for (const auto & fragment : fragment_buffer) {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);

#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif
                        session.tokenize(text, output);
                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                        session.append(fragment.token, output);
                    }
                }

                if (add_special) {
                    session.append_eos(output);
                    session.check_double_bos_eos(output);
                }
            } break;
        case LLAMA_VOCAB_TYPE_WPM:
            {
                // WPM (BERT-style) framing: [BOS] ... [SEP]
                if (add_special) {
                    GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
                    output.push_back(special_bos_id);
                }

                llm_tokenizer_wpm_session session(vocab);

                for (const auto & fragment : fragment_buffer) {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);

#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif
                        session.tokenize(text, output);
                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                        output.push_back(fragment.token);
                    }
                }

                if (add_special) {
                    GGML_ASSERT(special_sep_id != LLAMA_TOKEN_NULL);
                    output.push_back(special_sep_id);
                }
            } break;
        case LLAMA_VOCAB_TYPE_UGM:
            {
                if (add_special && add_bos) {
                    GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
                    output.push_back(special_bos_id);
                }
                llm_tokenizer_ugm_session session(vocab, *static_cast<const llm_tokenizer_ugm *>(tokenizer.get()));

                for (const auto & fragment : fragment_buffer) {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif
                        session.tokenize(text, output);
                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                        output.push_back(fragment.token);
                    }
                }

                // warn when the prompt itself already began with BOS
                if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) {
                    LLAMA_LOG_WARN(
                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
                        "Are you sure this is what you want?\n", __FUNCTION__);
                }

                if (add_special && add_eos) {
                    GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL);
                    output.push_back(special_eos_id);
                }
            } break;
        case LLAMA_VOCAB_TYPE_RWKV:
            {
                // RWKV adds no special framing tokens
                llm_tokenizer_rwkv_session session(vocab, *static_cast<const llm_tokenizer_rwkv *>(tokenizer.get()));
                for (const auto & fragment : fragment_buffer) {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);

#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif

                        session.tokenize(text, output);
                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                        output.push_back(fragment.token);
                    }
                }
            } break;
        case LLAMA_VOCAB_TYPE_PLAMO2:
            {
                // PLaMo-2 adds no special framing tokens either
                llm_tokenizer_plamo2_session session(*static_cast<const llm_tokenizer_plamo2 *>(tokenizer.get()));
                for (const auto & fragment : fragment_buffer) {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);

#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif

                        session.tokenize(text, output);
                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                        output.push_back(fragment.token);
                    }
                }
            } break;
        case LLAMA_VOCAB_TYPE_NONE:
            GGML_ABORT("fatal error");
    }

    return output;
}
3105
// Render a single token into `buf` (capacity `length`, no NUL terminator).
//
// Returns the number of bytes written, -(needed size) when the buffer is too
// small, or 0 for suppressed special tokens / out-of-range ids. Up to
// `lstrip` leading spaces of the piece are skipped before copying.
int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const {
    // ref: https://github.com/ggml-org/llama.cpp/pull/7587#discussion_r1620983843
    static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
    const llama_token_attr attr = token_get_attr(token);
    if (!special && (attr & attr_special)) {
        // special tokens are suppressed unless explicitly requested
        return 0;
    }

    // copy piece chars to output text buffer
    // skip up to 'lstrip' leading spaces before copying
    auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
        if (size >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
            GGML_ABORT("invalid token size: %zu exceeds int32_t limit", size);
        }

        for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
            token++;
            size--;
        }
        if (length < (int32_t)size) {
            // not enough room: report the required size as a negative count
            return -(int32_t) size;
        }
        memcpy(buf, token, size);
        return (int32_t) size;
    };

    // if we have a cache - use it
    {
        const auto & cache = cache_token_to_piece;

        if (!cache.empty()) {
            const auto & result = cache.at(token);
            return _try_copy(result.data(), result.size());
        }
    }

    // slow path (cache not built yet): decode from the raw token text
    if (0 <= token && token < (int32_t) id_to_token.size()) {
        const std::string & token_text = id_to_token[token].text;
        switch (get_type()) {
            case LLAMA_VOCAB_TYPE_WPM:
            case LLAMA_VOCAB_TYPE_SPM:
            case LLAMA_VOCAB_TYPE_UGM: {
                // NOTE: we accept all unsupported token types,
                // suppressing them like CONTROL tokens.
                if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
                    return _try_copy(token_text.data(), token_text.size());
                }
                if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
                    // undo the U+2581 space escaping used by SPM-style vocabs
                    std::string result = token_text;
                    llama_unescape_whitespace(result);
                    return _try_copy(result.data(), result.size());
                }
                if (attr & LLAMA_TOKEN_ATTR_BYTE) {
                    char byte = (char) token_to_byte(token);
                    return _try_copy((char*) &byte, 1);
                }
                break;
            }
            case LLAMA_VOCAB_TYPE_BPE: {
                // NOTE: we accept all unsupported token types,
                // suppressing them like CONTROL tokens.
                if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
                    return _try_copy(token_text.data(), token_text.size());
                }
                if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
                    // BPE stores bytes remapped to printable unicode; decode back
                    std::string result = llama_decode_text(token_text);
                    return _try_copy(result.data(), result.size());
                }
                break;
            }
            case LLAMA_VOCAB_TYPE_RWKV: {
                std::vector<uint8_t> result = llama_unescape_rwkv_token(token_text);

                // If we don't have enough space, return an error
                if (result.size() > (size_t)length) {
                    return -(int)result.size();
                }

                memcpy(buf, result.data(), result.size());
                return (int)result.size();
            }
            case LLAMA_VOCAB_TYPE_PLAMO2: {
                // PLaMo-2 uses similar token handling as BPE/SPM
                if (vocab.is_byte(token)) {
                    // Handle byte tokens like <0xXX>
                    if (token_text.length() == 6 && token_text.substr(0, 3) == "<0x" && token_text.back() == '>') {
                        int hex_val = std::stoi(token_text.substr(3, 2), nullptr, 16);
                        if (length < 1) {
                            return -1;
                        }
                        buf[0] = static_cast<char>(hex_val);
                        return 1;
                    }
                }

                // Normal token - just copy the text
                std::string result = token_text;
                return _try_copy(result.data(), result.size());
            }
            default:
                GGML_ABORT("fatal error");
        }
    }

    return 0;
}
3212
// Cached variant: returns a reference into cache_token_to_piece
// (throws std::out_of_range for unknown ids via .at()).
const std::string & llama_vocab::impl::token_to_piece(llama_token token) const {
    return cache_token_to_piece.at(token);
}
3216
// Detokenize `n_tokens` tokens into `text` (capacity `text_len_max`).
//
// Returns the number of characters written, or -(required size) when the
// buffer is too small. Optionally strips the leading BOS / trailing EOS and,
// for vocabs with clean_spaces, compacts tokenizer-inserted spaces around
// punctuation and apostrophe contractions in place.
int32_t llama_vocab::impl::detokenize(
        const llama_token * tokens,
        int32_t n_tokens,
        char * text,
        int32_t text_len_max,
        bool remove_special,
        bool unparse_special) const {
    if (type == LLAMA_VOCAB_TYPE_NONE) {
        return 0;
    }

    GGML_ASSERT(tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");

    int32_t avail = text_len_max;
    int32_t total = 0;

    // remove the leading space
    bool remove_space = add_space_prefix;

    if (remove_special && add_bos) {
        if (n_tokens > 0 && tokens[0] == special_bos_id) {
            remove_space = false;
            n_tokens--;
            tokens++;
        }
    }

    if (remove_special && add_eos) {
        if (n_tokens > 0 && tokens[n_tokens - 1] == special_eos_id) {
            n_tokens--;
        }
    }

    // render each token; once the buffer is exhausted keep accumulating the
    // required size in `total` (a negative n_chars is the needed byte count)
    for (int32_t i = 0; i < n_tokens; ++i) {
        GGML_ASSERT(avail >= 0);
        int32_t n_chars = token_to_piece(tokens[i], text, avail, remove_space, unparse_special);
        remove_space = false;
        if (n_chars < 0) {
            avail = 0;
            total -= n_chars;
        } else if (n_chars > 0) {
            avail -= n_chars;
            text += n_chars;
            total += n_chars;
        }
    }

    if (total > text_len_max) {
        return -total;
    }

    // in-place compaction: each pass reads at index i and writes at index
    // total <= i, so the passes are safe without extra storage
    if (clean_spaces) {
        text -= total; // restart text

        // first pass: characters ?!., //TODO: where do these characters come from?
        const int32_t total1 = total;
        total = total ? 1 : 0;
        for (int32_t i = 1; i < total1; ++i) {
            const char x = text[i];
            if (text[i - 1] == ' ') {
                if (x == '?' || x == '!' || x == '.' || x == ',') { // " ?", " !", " .", " ,"
                    total--; // remove space
                }
            }
            text[total++] = x;
        }

        // second pass: strip single apostrophe between spaces
        // NOTE(review): the '\0' written at text[++i] lands on a position the
        // loop then skips — looks dead; confirm before touching
        const int32_t total2 = total;
        total = total ? 1 : 0;
        for (int32_t i = 1; i < total2; ++i) {
            const char x = text[i];
            if (x == '\'' && i + 1 < total2 && text[i - 1] == ' ' && text[i + 1] == ' ') { // " ' "
                total--; // remove prev space
                text[++i] = '\0'; // remove next space
            }
            text[total++] = x;
        }

        // third pass: apostrophe contractions //NOTE: this makes sense?
        const int32_t total3 = total;
        total = total ? 1 : 0;
        for (int32_t i = 1; i < total3; ++i) {
            const char x = text[i];
            if (text[i - 1] == ' ') {
                if (x == '\'' && i + 1 < total3) {
                    const char x1 = text[i + 1];
                    if (x1 == 't' || x1 == 'd') { // " 't", " 'd"
                        //total--; // remove space
                    } else if (x1 == 's' || x1 == 'm') { // " 's", " 'm"
                        total--; // remove space
                    } else if (i + 2 < total3) {
                        const char x2 = text[i + 2];
                        if ((x1 == 'l' && x2 == 'l')) { // " 'll"
                            //total--; // remove space
                        } else if ((x1 == 'r' && x2 == 'e') || (x1 == 'v' && x2 == 'e')) { // " 're", " 've"
                            total--; // remove space
                        } else {
                            //total--; // remove space
                        }
                    } else {
                        //total--; // remove space
                    }
                }
            }
            text[total++] = x;
        }
    }

    return total <= text_len_max ? total : -total;
}
3328
// Log a human-readable summary of the vocab: type, sizes, and every special
// token that is configured (BOS/EOS/FIM/EOG/etc.).
void llama_vocab::impl::print_info() const {
    LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str());
    LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, vocab.n_tokens());
    LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size());

    // special tokens
    if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token.at(special_bos_id).text.c_str() ); }
    if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token.at(special_eos_id).text.c_str() ); }
    if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token.at(special_eot_id).text.c_str() ); }
    if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token.at(special_eom_id).text.c_str() ); }
    if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token.at(special_unk_id).text.c_str() ); }
    if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token.at(special_sep_id).text.c_str() ); }
    if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token.at(special_pad_id).text.c_str() ); }
    if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token.at(special_mask_id).text.c_str() ); }

    if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token.at(linefeed_id).text.c_str() ); }

    // fill-in-the-middle tokens
    if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); }
    if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); }
    if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); }
    if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); }
    if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); }
    if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); }

    // all end-of-generation tokens
    for (const auto & id : special_eog_ids) {
        LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() );
    }

    LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
}
3359
// pimpl idiom: all vocab state lives in llama_vocab::impl
llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
}

// defined out-of-line so impl is a complete type at the point of destruction
llama_vocab::~llama_vocab() = default;

// load vocab data (tokens, merges, special ids, flags) from the model file
void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
    pimpl->load(ml, kv);
}
3368
// read-only accessors forwarding to the impl

std::string llama_vocab::get_tokenizer_model() const {
    return pimpl->tokenizer_model;
}

std::string llama_vocab::get_tokenizer_pre() const {
    return pimpl->tokenizer_pre;
}

enum llama_vocab_type llama_vocab::get_type() const {
    return pimpl->type;
}

enum llama_vocab_pre_type llama_vocab::get_pre_type() const {
    return pimpl->pre_type;
}

uint32_t llama_vocab::n_tokens() const {
    return (uint32_t) pimpl->id_to_token.size();
}

uint32_t llama_vocab::n_token_types() const {
    return (uint32_t) pimpl->n_token_types;
}

std::string llama_vocab::type_name() const{
    return pimpl->type_name();
}
3396
// token classification wrappers — attribute checks are implemented in impl

bool llama_vocab::is_normal(llama_token id) const {
    return pimpl->is_normal(id);
}

bool llama_vocab::is_unknown(llama_token id) const {
    return pimpl->is_unknown(id);
}

bool llama_vocab::is_control(llama_token id) const {
    return pimpl->is_control(id);
}

bool llama_vocab::is_byte(llama_token id) const {
    return pimpl->is_byte(id);
}

bool llama_vocab::is_user_defined(llama_token id) const {
    return pimpl->is_user_defined(id);
}

bool llama_vocab::is_unused(llama_token id) const {
    return pimpl->is_unused(id);
}

bool llama_vocab::is_eog(llama_token id) const {
    return pimpl->is_eog(id);
}

uint8_t llama_vocab::token_to_byte(llama_token id) const {
    return pimpl->token_to_byte(id);
}
3428
3429llama_token llama_vocab::byte_to_token(uint8_t ch) const {
3430 GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE);
3431 static const char * hex = "0123456789ABCDEF";
3432 switch (get_type()) {
3433 case LLAMA_VOCAB_TYPE_SPM:
3434 case LLAMA_VOCAB_TYPE_UGM: {
3435 const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
3436 auto token = pimpl->token_to_id.find(buf);
3437 if (token != pimpl->token_to_id.end()) {
3438 return (*token).second;
3439 }
3440 // Try to fall back to just the byte as a string
3441 const char buf2[2] = { (char)ch, 0 };
3442 return pimpl->token_to_id.at(buf2);
3443 }
3444 case LLAMA_VOCAB_TYPE_WPM:
3445 case LLAMA_VOCAB_TYPE_BPE: {
3446 return pimpl->token_to_id.at(unicode_byte_to_utf8(ch));
3447 }
3448 case LLAMA_VOCAB_TYPE_PLAMO2: {
3449 // PLaMo-2 uses byte tokens in format <0xXX>
3450 char hex_str[8];
3451 snprintf(hex_str, sizeof(hex_str), "<0x%02X>", ch);
3452 return pimpl->token_to_id.at(hex_str);
3453 }
3454 default:
3455 GGML_ABORT("fatal error");
3456 }
3457}
3458
3459llama_token llama_vocab::text_to_token(const std::string & text) const {
3460 GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
3461 auto it = pimpl->token_to_id.find(text);
3462 if (it != pimpl->token_to_id.end()) {
3463 return (*it).second;
3464 }
3465 return LLAMA_TOKEN_NULL;
3466}
3467
// per-token data accessors (bounds-checked via .at())

const llama_vocab::token_data & llama_vocab::get_token_data(llama_token id) const {
    GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
    return pimpl->id_to_token.at(id);
}

const char * llama_vocab::token_get_text(llama_token id) const {
    GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
    return pimpl->id_to_token.at(id).text.c_str();
}

float llama_vocab::token_get_score(llama_token id) const {
    GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
    return pimpl->id_to_token.at(id).score;
}

llama_token_attr llama_vocab::token_get_attr(llama_token id) const {
    return pimpl->token_get_attr(id);
}
3486
// special-token id getters; each returns LLAMA_TOKEN_NULL when the model
// does not define the corresponding token

llama_token llama_vocab::token_bos() const {
    return pimpl->special_bos_id;
}

llama_token llama_vocab::token_eos() const {
    return pimpl->special_eos_id;
}

llama_token llama_vocab::token_eot() const {
    return pimpl->special_eot_id;
}

llama_token llama_vocab::token_eom() const {
    return pimpl->special_eom_id;
}

llama_token llama_vocab::token_unk() const {
    return pimpl->special_unk_id;
}

llama_token llama_vocab::token_sep() const {
    return pimpl->special_sep_id;
}

llama_token llama_vocab::token_nl() const {
    return pimpl->linefeed_id;
}

llama_token llama_vocab::token_pad() const {
    return pimpl->special_pad_id;
}

// legacy names: token_prefix/middle/suffix alias the FIM pre/mid/suf ids

llama_token llama_vocab::token_prefix() const {
    return pimpl->special_fim_pre_id;
}

llama_token llama_vocab::token_middle() const {
    return pimpl->special_fim_mid_id;
}

llama_token llama_vocab::token_suffix() const {
    return pimpl->special_fim_suf_id;
}

llama_token llama_vocab::token_fim_pre() const {
    return pimpl->special_fim_pre_id;
}

llama_token llama_vocab::token_fim_suf() const {
    return pimpl->special_fim_suf_id;
}

llama_token llama_vocab::token_fim_mid() const {
    return pimpl->special_fim_mid_id;
}

llama_token llama_vocab::token_fim_pad() const {
    return pimpl->special_fim_pad_id;
}

llama_token llama_vocab::token_fim_rep() const {
    return pimpl->special_fim_rep_id;
}

llama_token llama_vocab::token_fim_sep() const {
    return pimpl->special_fim_sep_id;
}

llama_token llama_vocab::token_mask() const {
    return pimpl->special_mask_id;
}
3558
// tokenizer behavior flags loaded from the model metadata

bool llama_vocab::get_add_space_prefix() const {
    return pimpl->add_space_prefix;
}

bool llama_vocab::get_add_bos() const {
    return pimpl->add_bos;
}

bool llama_vocab::get_add_eos() const {
    return pimpl->add_eos;
}

bool llama_vocab::get_add_sep() const {
    return pimpl->add_sep;
}

bool llama_vocab::get_ignore_merges() const {
    return pimpl->ignore_merges;
}

bool llama_vocab::get_clean_spaces() const {
    return pimpl->clean_spaces;
}

bool llama_vocab::get_remove_extra_whitespaces() const {
    return pimpl->remove_extra_whitespaces;
}

bool llama_vocab::get_escape_whitespaces() const {
    return pimpl->escape_whitespaces;
}

bool llama_vocab::get_treat_whitespace_as_suffix() const {
    return pimpl->treat_whitespace_as_suffix;
}

// longest token text in bytes — used to size tokenizer work buffers
int llama_vocab::max_token_len() const {
    return pimpl->max_token_len;
}
3598
3599int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
3600 GGML_ASSERT(token_left.find(' ') == std::string::npos);
3601 GGML_ASSERT(token_left.find('\n') == std::string::npos);
3602 GGML_ASSERT(token_right.find(' ') == std::string::npos);
3603 GGML_ASSERT(token_right.find('\n') == std::string::npos);
3604
3605 auto it = pimpl->bpe_ranks.find(std::make_pair(token_left, token_right));
3606 if (it == pimpl->bpe_ranks.end()) {
3607 return -1;
3608 }
3609
3610 return it->second;
3611}
3612
3613std::vector<std::string> llama_vocab::get_bpe_merges() const {
3614 std::vector<std::string> result(pimpl->bpe_ranks.size());
3615
3616 for (const auto & pair : pimpl->bpe_ranks) {
3617 result[pair.second] = pair.first.first + " " + pair.first.second;
3618 }
3619
3620 return result;
3621}
3622
// returns a copy of the precompiled charsmap blob (UGM normalization data)
std::vector<char> llama_vocab::get_precompiled_charsmap() const {
    return pimpl->precompiled_charsmap;
}
3626
3627int32_t llama_vocab::tokenize(
3628 const char * text,
3629 int32_t text_len,
3630 llama_token * tokens,
3631 int32_t n_tokens_max,
3632 bool add_special,
3633 bool parse_special) const {
3634 auto res = tokenize(std::string(text, text_len), add_special, parse_special);
3635 if (res.size() >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
3636 LLAMA_LOG_ERROR("%s: tokenization result size %zu exceeds int32_t limit\n", __func__, res.size());
3637 return std::numeric_limits<int32_t>::min();
3638 }
3639
3640 if (n_tokens_max < (int) res.size()) {
3641 // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
3642 return -((int) res.size());
3643 }
3644
3645 for (size_t i = 0; i < res.size(); i++) {
3646 tokens[i] = res[i];
3647 }
3648
3649 return res.size();
3650}
3651
// C++ tokenizer entry point; forwards to the pimpl implementation.
std::vector<llama_token> llama_vocab::tokenize(
        const std::string & raw_text,
                     bool   add_special,
                     bool   parse_special) const {
    return pimpl->tokenize(raw_text, add_special, parse_special);
}
3658
// Cached text piece for a token; the returned reference is owned by the vocab.
const std::string & llama_vocab::token_to_piece(llama_token token) const {
    return pimpl->token_to_piece(token);
}
3662
// C-style piece rendering into a caller-provided buffer; forwards to pimpl.
int32_t llama_vocab::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const {
    return pimpl->token_to_piece(token, buf, length, lstrip, special);
}
3666
// C-style detokenization into a caller-provided buffer; forwards to pimpl.
int32_t llama_vocab::detokenize(
        const llama_token * tokens,
                  int32_t   n_tokens,
                     char * text,
                  int32_t   text_len_max,
                     bool   remove_special,
                     bool   unparse_special) const {
    return pimpl->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
}
3676
// Detokenize into a std::string, growing the buffer on demand (two-pass).
std::string llama_vocab::detokenize(const std::vector<llama_token> & tokens, bool special) const {
    std::string text;
    // first guess: whatever capacity the string already has, but at least one
    // byte per token
    text.resize(std::max(text.capacity(), tokens.size()));
    int32_t n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
    if (n_chars < 0) {
        // buffer was too small: the negative return value encodes the required
        // size, so resize and detokenize again
        text.resize(-n_chars);
        n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
        GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
    }

    // shrink to the actual number of characters produced
    text.resize(n_chars);

    // NOTE: the original tokenizer decodes bytes after collecting the pieces.
    return text;
}
3692
// Log a summary of the vocabulary; forwards to the pimpl implementation.
void llama_vocab::print_info() const {
    pimpl->print_info();
}
3696
3697//
3698// interface implementation
3699//
3700
// Number of tokens in the vocabulary.
int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab) {
    return vocab->n_tokens();
}

// deprecated: use llama_vocab_n_tokens instead
int32_t llama_n_vocab(const struct llama_vocab * vocab) {
    return llama_vocab_n_tokens(vocab);
}

// Tokenizer family of this vocabulary.
enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab) {
    return vocab->get_type();
}

// Stored text of a token.
const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token) {
    return vocab->token_get_text(token);
}

// Score associated with a token.
float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token) {
    return vocab->token_get_score(token);
}

// Attribute flags of a token.
enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token) {
    return vocab->token_get_attr(token);
}

// Whether the vocab considers this token end-of-generation.
bool llama_vocab_is_eog(const struct llama_vocab * vocab, llama_token token) {
    return vocab->is_eog(token);
}

// Whether the vocab considers this token a control token.
bool llama_vocab_is_control(const struct llama_vocab * vocab, llama_token token) {
    return vocab->is_control(token);
}
3733
// Special-token accessors: each forwards to the corresponding llama_vocab
// accessor and returns the token id as loaded from the model.

llama_token llama_vocab_bos(const struct llama_vocab * vocab) {
    return vocab->token_bos();
}

llama_token llama_vocab_eos(const struct llama_vocab * vocab) {
    return vocab->token_eos();
}

llama_token llama_vocab_eot(const struct llama_vocab * vocab) {
    return vocab->token_eot();
}

// deprecated: CLS is mapped to BOS
llama_token llama_vocab_cls(const struct llama_vocab * vocab) {
    return vocab->token_bos();
}

llama_token llama_vocab_sep(const struct llama_vocab * vocab) {
    return vocab->token_sep();
}

llama_token llama_vocab_nl (const struct llama_vocab * vocab) {
    return vocab->token_nl();
}

llama_token llama_vocab_pad(const struct llama_vocab * vocab) {
    return vocab->token_pad();
}

// Whether a BOS token should be added automatically during tokenization.
bool llama_vocab_get_add_bos(const struct llama_vocab * vocab) {
    return vocab->get_add_bos();
}

// Whether an EOS token should be added automatically during tokenization.
bool llama_vocab_get_add_eos(const struct llama_vocab * vocab) {
    return vocab->get_add_eos();
}

// Whether a SEP token should be added automatically during tokenization.
bool llama_vocab_get_add_sep(const struct llama_vocab * vocab) {
    return vocab->get_add_sep();
}
3774
// Fill-in-the-middle (FIM) special-token accessors; each forwards to the
// corresponding llama_vocab accessor.

llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab) {
    return vocab->token_fim_pre();
}

llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab) {
    return vocab->token_fim_suf();
}

llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab) {
    return vocab->token_fim_mid();
}

llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab) {
    return vocab->token_fim_pad();
}

llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab) {
    return vocab->token_fim_rep();
}

llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab) {
    return vocab->token_fim_sep();
}

// Mask token accessor.
llama_token llama_vocab_mask(const struct llama_vocab* vocab) {
    return vocab->token_mask();
}
3802
// deprecated: use llama_vocab_get_text instead
const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token) {
    return llama_vocab_get_text(vocab, token);
}

// deprecated: use llama_vocab_get_score instead
float llama_token_get_score(const struct llama_vocab * vocab, llama_token token) {
    return llama_vocab_get_score(vocab, token);
}

// deprecated: use llama_vocab_get_attr instead
enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token) {
    return llama_vocab_get_attr(vocab, token);
}

// deprecated: use llama_vocab_is_eog instead
bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token) {
    return llama_vocab_is_eog(vocab, token);
}

// deprecated: use llama_vocab_is_control instead
bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token) {
    return llama_vocab_is_control(vocab, token);
}

// deprecated: use llama_vocab_bos instead
llama_token llama_token_bos(const struct llama_vocab * vocab) {
    return llama_vocab_bos(vocab);
}

// deprecated: use llama_vocab_eos instead
llama_token llama_token_eos(const struct llama_vocab * vocab) {
    return llama_vocab_eos(vocab);
}

// deprecated: use llama_vocab_eot instead
llama_token llama_token_eot(const struct llama_vocab * vocab) {
    return llama_vocab_eot(vocab);
}

// deprecated: CLS is mapped to BOS
llama_token llama_token_cls(const struct llama_vocab * vocab) {
    //return llama_vocab_cls(vocab);
    return llama_vocab_bos(vocab); // avoid deprecation warning
}

// deprecated: use llama_vocab_sep instead
llama_token llama_token_sep(const struct llama_vocab * vocab) {
    return llama_vocab_sep(vocab);
}

// deprecated: use llama_vocab_nl instead
llama_token llama_token_nl (const struct llama_vocab * vocab) {
    return llama_vocab_nl(vocab);
}

// deprecated: use llama_vocab_pad instead
llama_token llama_token_pad(const struct llama_vocab * vocab) {
    return llama_vocab_pad(vocab);
}

// deprecated: use llama_vocab_get_add_bos instead
bool llama_add_bos_token(const struct llama_vocab * vocab) {
    return llama_vocab_get_add_bos(vocab);
}

// deprecated: use llama_vocab_get_add_eos instead
bool llama_add_eos_token(const struct llama_vocab * vocab) {
    return llama_vocab_get_add_eos(vocab);
}

// deprecated: use llama_vocab_fim_pre instead
llama_token llama_token_fim_pre(const struct llama_vocab * vocab) {
    return llama_vocab_fim_pre(vocab);
}

// deprecated: use llama_vocab_fim_suf instead
llama_token llama_token_fim_suf(const struct llama_vocab * vocab) {
    return llama_vocab_fim_suf(vocab);
}

// deprecated: use llama_vocab_fim_mid instead
llama_token llama_token_fim_mid(const struct llama_vocab * vocab) {
    return llama_vocab_fim_mid(vocab);
}

// deprecated: use llama_vocab_fim_pad instead
llama_token llama_token_fim_pad(const struct llama_vocab * vocab) {
    return llama_vocab_fim_pad(vocab);
}

// deprecated: use llama_vocab_fim_rep instead
llama_token llama_token_fim_rep(const struct llama_vocab * vocab) {
    return llama_vocab_fim_rep(vocab);
}

// deprecated: use llama_vocab_fim_sep instead
llama_token llama_token_fim_sep(const struct llama_vocab * vocab) {
    return llama_vocab_fim_sep(vocab);
}
3903
3904//
3905// tokenization
3906//
3907
// C API tokenization entry point; forwards to llama_vocab::tokenize.
// See that overload for the meaning of the return value (negative on error).
int32_t llama_tokenize(
    const struct llama_vocab * vocab,
                  const char * text,
                     int32_t   text_len,
                 llama_token * tokens,
                     int32_t   n_tokens_max,
                        bool   add_special,
                        bool   parse_special) {
    return vocab->tokenize(text, text_len, tokens, n_tokens_max, add_special, parse_special);
}
3918
// C API piece rendering; forwards to llama_vocab::token_to_piece.
int32_t llama_token_to_piece(
    const struct llama_vocab * vocab,
                 llama_token   token,
                        char * buf,
                     int32_t   length,
                     int32_t   lstrip,
                        bool   special) {
    return vocab->token_to_piece(token, buf, length, lstrip, special);
}
3928
// C API detokenization; forwards to llama_vocab::detokenize.
int32_t llama_detokenize(
    const struct llama_vocab * vocab,
           const llama_token * tokens,
                     int32_t   n_tokens,
                        char * text,
                     int32_t   text_len_max,
                        bool   remove_special,
                        bool   unparse_special) {
    return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
}