1#!/usr/bin/env python3
  2# -*- coding: utf-8 -*-
  3
  4import logging
  5import os
  6import pathlib
  7import re
  8
  9import requests
 10import json
 11import shutil
 12import argparse
 13
 14from hashlib import sha256
 15from enum import IntEnum, auto
 16from transformers import AutoTokenizer
 17
 18logging.basicConfig(level=logging.DEBUG)
 19logger = logging.getLogger("convert_hf_to_gguf_update")
 20sess = requests.Session()
 21
 22convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
 23convert_py = convert_py_pth.read_text(encoding="utf-8")
 24hf_token_pth = pathlib.Path.home() / ".cache" / "huggingface" / "token"
 25hf_token = hf_token_pth.read_text(encoding="utf-8").strip() if hf_token_pth.exists() else None
 26
 27
 28class TOKENIZER_TYPE(IntEnum):
 29    SPM = auto()
 30    BPE = auto()
 31    WPM = auto()
 32    UGM = auto()
 33
 34
 35DOC_STRING = """
 36This script downloads the tokenizer models of the specified models from Huggingface and
 37generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
 38
 39/!\\ It is intended to be used by contributors and is not meant to be run by end users
 40
 41This is necessary in order to analyze the type of pre-tokenizer used by the model and
 42provide the necessary information to llama.cpp via the GGUF header in order to implement
 43the same pre-tokenizer.
 44
 45ref: https://github.com/ggml-org/llama.cpp/pull/6920
 46
 47Instructions:
 48
 49- Add a new model to the "models" list
 50- Run the script with your huggingface token
 51    By default, token will be read from ~/.cache/huggingface/token
 52- The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated
 53- Update llama.cpp with the new pre-tokenizer if necessary
 54"""
 55# TODO: generate tokenizer tests for llama.cpp
 56
 57parser = argparse.ArgumentParser(description=DOC_STRING, formatter_class=argparse.RawTextHelpFormatter)
 58parser.add_argument(
 59    "--full", action="store_true",
 60    help="download full list of models - make sure you have access to all of them",
 61)
 62parser.add_argument(
 63    "--check-missing", action="store_true",
 64    help="only check for missing pre-tokenizer hashes",
 65)
 66parser.add_argument(
 67    "hf_token",
 68    help="optional HF token",
 69    nargs="?",
 70)
 71args = parser.parse_args()
 72hf_token = args.hf_token if args.hf_token is not None else hf_token
 73
 74if hf_token is None:
 75    logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")
 76
 77if args.check_missing and args.full:
 78    logger.warning("Downloading full list of models requested, ignoring --check-missing!")
 79    args.check_missing = False
 80
 81# TODO: this string has to exercise as much pre-tokenizer functionality as possible
 82#       will be updated with time - contributions welcome
 83CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
 84
 85# TODO: add models here, base models preferred
 86models = [
 87    {"name": "llama-spm",        "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
 88    {"name": "llama-bpe",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
 89    {"name": "phi-3",            "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
 90    {"name": "deepseek-llm",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
 91    {"name": "deepseek-coder",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
 92    {"name": "falcon",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
 93    {"name": "bert-bge",         "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
 94    {"name": "falcon3",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon3-7B-Base", },
 95    {"name": "bert-bge-large",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", },
 96    {"name": "mpt",              "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
 97    {"name": "starcoder",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
 98    {"name": "gpt-2",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
 99    {"name": "stablelm2",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
100    {"name": "refact",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
101    {"name": "command-r",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
102    {"name": "qwen2",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
103    {"name": "olmo",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
104    {"name": "dbrx",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
105    {"name": "jina-v1-en",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en", },
106    {"name": "jina-v2-en",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
107    {"name": "jina-v2-es",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
108    {"name": "jina-v2-de",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
109    {"name": "smaug-bpe",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
110    {"name": "poro-chat",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
111    {"name": "jina-v2-code",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
112    {"name": "viking",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
113    {"name": "gemma",            "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", },
114    {"name": "gemma-2",          "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
115    {"name": "jais",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
116    {"name": "t5",               "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
117    {"name": "codeshell",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
118    {"name": "tekken",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
119    {"name": "smollm",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
120    {'name': "bloom",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
121    {'name': "gpt3-finnish",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
122    {"name": "exaone",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
123    {"name": "phi-2",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
124    {"name": "chameleon",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
125    {"name": "roberta-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
126    {"name": "gigachat",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
127    {"name": "megrez",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
128    {"name": "deepseek-v3",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
129    {"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
130    {"name": "gpt-4o",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Xenova/gpt-4o", },
131    {"name": "superbpe",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k", },
132    {"name": "trillion",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", },
133    {"name": "bailingmoe",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
134    {"name": "llama4",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
135    {"name": "pixtral",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
136    {"name": "seed-coder",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
137    {"name": "a.x-4.0",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", },
138    {"name": "midm-2.0",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
139    {"name": "lfm2",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
140    {"name": "exaone4",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
141    {"name": "mellum",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
142    {"name": "modern-bert",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/answerdotai/ModernBERT-base", },
143    {"name": "afmoe",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/arcee-ai/Trinity-Tokenizer", },
144    {"name": "bailingmoe2",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-mini-base-2.0", },
145    {"name": "granite-docling",  "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", },
146    {"name": "minimax-m2",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/MiniMaxAI/MiniMax-M2", },
147    {"name": "kormo",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/KORMo-Team/KORMo-tokenizer", },
148    {"name": "youtu",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Youtu-LLM-2B", },
149    {"name": "solar-open",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/upstage/Solar-Open-100B", },
150    {"name": "exaone-moe",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B", },
151    {"name": "qwen35",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3.5-9B-Instruct", }
152]
153
154# some models are known to be broken upstream, so we will skip them as exceptions
155pre_computed_hashes = [
156    # chatglm-bpe has 2 hashes, why?
157    {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b"},
158    {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"},
159    {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
160    {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/zai-org/GLM-4.5-Air", "chkhsh": "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902"},
161    {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
162    {"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
163    {"name": "hunyuan-dense", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-4B-Instruct", "chkhsh": "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6"},
164    # falcon-h1 series uses 4 different tokenizers across model sizes (0.5b - 34b), hence we need to define 4 different hashes
165    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base", "chkhsh": "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6"},
166    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", "chkhsh": "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86"},
167    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", "chkhsh": "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896"},
168    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
169    {"name": "kimi-k2",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base",   "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
170    {"name": "qwen2",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B", "chkhsh": "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c"},
171    {"name": "grok-2",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
172    # jina-v2-de variants
173    {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/aari1995/German_Semantic_V3", "chkhsh": "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df"},
174    {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/zai-org/GLM-4.7-Flash", "chkhsh": "cdf5f35325780597efd76153d4d1c16778f766173908894c04afc20108536267"},
175]
176
177
178def download_file_with_auth(url, token, save_path):
179    headers = {"Authorization": f"Bearer {token}"} if token else None
180    response = sess.get(url, headers=headers)
181    response.raise_for_status()
182    os.makedirs(os.path.dirname(save_path), exist_ok=True)
183    with open(save_path, 'wb') as downloaded_file:
184        downloaded_file.write(response.content)
185    logger.info(f"File {save_path} downloaded successfully")
186
187
188def download_model(model):
189    name = model["name"]
190    repo = model["repo"]
191    tokt = model["tokt"]
192
193    os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
194
195    files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
196
197    if name == "gpt-4o":
198        # Xenova/gpt-4o is tokenizer-only, it does not contain config.json
199        files = ["tokenizer.json", "tokenizer_config.json"]
200
201    if tokt == TOKENIZER_TYPE.SPM:
202        files.append("tokenizer.model")
203
204    if tokt == TOKENIZER_TYPE.UGM:
205        files.append("spiece.model")
206
207    if os.path.isdir(repo):
208        # If repo is a path on the file system, copy the directory
209        for file in files:
210            src_path = os.path.join(repo, file)
211            dst_path = f"models/tokenizers/{name}/{file}"
212            if os.path.isfile(dst_path):
213                logger.info(f"{name}: File {dst_path} already exists - skipping")
214                continue
215            if os.path.isfile(src_path):
216                shutil.copy2(src_path, dst_path)
217                logger.info(f"{name}: Copied {src_path} to {dst_path}")
218            else:
219                logger.warning(f"{name}: Source file {src_path} does not exist")
220    else:
221        # If repo is a URL, download the files
222        for file in files:
223            save_path = f"models/tokenizers/{name}/{file}"
224            if os.path.isfile(save_path):
225                logger.info(f"{name}: File {save_path} already exists - skipping")
226                continue
227            download_file_with_auth(f"{repo}/resolve/main/{file}", hf_token, save_path)
228
229
230# get list of existing models and chkhsh from the convert_hf_to_gguf.py file
231# returns mapping res --> chkhsh
232def get_existing_models(convert_py):
233    pattern = r'if chkhsh == "([a-f0-9]{64})":\s*\n\s*.*\s*res = "([^"]+)"'
234    matches = re.findall(pattern, convert_py)
235    output = {}
236    for chkhsh, res in matches:
237        output[res] = chkhsh
238    return output
239
240
241existing_models = {}
242all_models = models.copy()
243if not args.full:
244    # Filter out models that already exist in convert_hf_to_gguf.py
245    existing_models = get_existing_models(convert_py)
246    all_models = models.copy()
247    models = [model for model in all_models if model["name"] not in existing_models]
248
249if not args.check_missing:
250    logging.info(f"Downloading {len(models)} models...")
251    for model in models:
252        try:
253            download_model(model)
254        except Exception as e:
255            logger.error(f"Failed to download model {model['name']}. Error: {e}")
256
257
258# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
259
260src_ifs = ""
261for model in [*pre_computed_hashes, *all_models]:
262    name = model["name"]
263    tokt = model["tokt"]
264    chkhsh = model.get("chkhsh")
265
266    if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
267        continue
268
269    # create the tokenizer
270    if chkhsh is not None:
271        # if the model has a pre-computed hash, use it
272        logger.info(f"Using pre-computed hash for model {name}: {chkhsh}")
273    elif name in existing_models:
274        # if the model already exists in convert_hf_to_gguf.py, skip compute hash
275        chkhsh = existing_models[name]
276    else:
277        # otherwise, compute the hash of the tokenizer
278
279        # Fail if the tokenizer folder with config does not exist or there are other download issues previously
280        if not os.path.isfile(f"models/tokenizers/{name}/tokenizer_config.json"):
281            raise OSError(f"Config for tokenizer {name} not found. The model may not exist or is not accessible with the provided token.")
282
283        try:
284            logger.info(f"Loading tokenizer from {f'models/tokenizers/{name}'}...")
285            if name == "t5":
286                tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
287            else:
288                tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
289        except Exception as e:
290            raise OSError(f"Error loading tokenizer for model {name}.") from e
291
292        chktok = tokenizer.encode(CHK_TXT)
293        chkhsh = sha256(str(chktok).encode()).hexdigest()
294
295        logger.info(f"model: {name}")
296        logger.info(f"tokt: {tokt}")
297        logger.info(f"repo: {model['repo']}")
298        logger.info(f"chktok: {chktok}")
299        logger.info(f"chkhsh: {chkhsh}")
300
301        # print the "pre_tokenizer" content from the tokenizer.json
302        with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
303            cfg = json.load(f)
304            normalizer = cfg["normalizer"]
305            logger.info("normalizer: " + json.dumps(normalizer, indent=4))
306            pre_tokenizer = cfg["pre_tokenizer"]
307            logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
308            if "ignore_merges" in cfg["model"]:
309                logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
310
311        logger.info("")
312
313    src_ifs += f"        if chkhsh == \"{chkhsh}\":\n"
314    src_ifs += f"            # ref: {model['repo']}\n"
315    src_ifs += f"            res = \"{name}\"\n"
316
317src_func = f"""
318    def get_vocab_base_pre(self, tokenizer) -> str:
319        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
320        # is specific for the BPE pre-tokenizer used by the model
321        # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
322        # use in llama.cpp to implement the same pre-tokenizer
323
324        chktxt = {repr(CHK_TXT)}
325
326        chktok = tokenizer.encode(chktxt)
327        chkhsh = sha256(str(chktok).encode()).hexdigest()
328
329        logger.debug(f"chktok: {{chktok}}")
330        logger.debug(f"chkhsh: {{chkhsh}}")
331
332        res = None
333
334        # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
335        #       or pull the latest version of the model from Huggingface
336        #       don't edit the hashes manually!
337{src_ifs}
338        if res is None:
339            logger.warning("\\n")
340            logger.warning("**************************************************************************************")
341            logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
342            logger.warning("**          There are 2 possible reasons for this:")
343            logger.warning("**          - the model has not been added to convert_hf_to_gguf_update.py yet")
344            logger.warning("**          - the pre-tokenization config has changed upstream")
345            logger.warning("**          Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
346            logger.warning("** ref:     https://github.com/ggml-org/llama.cpp/pull/6920")
347            logger.warning("**")
348            logger.warning(f"** chkhsh:  {{chkhsh}}")
349            logger.warning("**************************************************************************************")
350            logger.warning("\\n")
351            raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
352
353        logger.debug(f"tokenizer.ggml.pre: {{repr(res)}}")
354        logger.debug(f"chkhsh: {{chkhsh}}")
355
356        return res
357"""
358
359convert_py = re.sub(
360    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
361    lambda m: m.group(1) + src_func + m.group(3),
362    convert_py,
363    flags=re.DOTALL | re.MULTILINE,
364)
365
366convert_py_pth.write_text(convert_py, encoding="utf-8")
367
368logger.info("+++ convert_hf_to_gguf.py was updated")
369
370# generate tests for each tokenizer model
371
372tests = [
373    "ied 4 ½ months",
374    "Äpfel",
375    "",
376    " ",
377    "  ",
378    "   ",
379    "\t",
380    "\n",
381    "\n\n",
382    "\n\n\n",
383    "\t\n",
384    "Hello world",
385    " Hello world",
386    "Hello World",
387    " Hello World",
388    " Hello World!",
389    "Hello, world!",
390    " Hello, world!",
391    " this is 🦙.cpp",
392    "w048 7tuijk dsdfhu",
393    "нещо на Български",
394    "កាន់តែពិសេសអាចខលចេញ",
395    "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
396    "Hello",
397    " Hello",
398    "  Hello",
399    "   Hello",
400    "    Hello",
401    "    Hello\n    Hello",
402    " (",
403    "\n =",
404    "' era",
405    "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~",
406    "!!!!!!",
407    "3",
408    "33",
409    "333",
410    "3333",
411    "33333",
412    "333333",
413    "3333333",
414    "33333333",
415    "333333333",
416    "Cửa Việt", # llama-bpe fails on this
417    " discards",
418    CHK_TXT,
419]
420
421# write the tests to ./models/ggml-vocab-{name}.gguf.inp
422# the format is:
423#
424# test0
425# __ggml_vocab_test__
426# test1
427# __ggml_vocab_test__
428# ...
429#
430
431# with each model, encode all tests and write the results in ./models/ggml-vocab-{name}.gguf.out
432# for each test, write the resulting tokens on a separate line
433
434for model in models:
435    name = model["name"]
436    tokt = model["tokt"]
437
438    # Skip if the tokenizer folder does not exist or there are other download issues previously
439    if not os.path.exists(f"models/tokenizers/{name}"):
440        logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
441        continue
442
443    # create the tokenizer
444    try:
445        if name == "t5":
446            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
447        else:
448            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
449    except (OSError, TypeError) as e:
450        logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
451        continue  # Skip this model and continue with the next one in the loop
452
453    if not os.path.exists(f"models/ggml-vocab-{name}.gguf"):
454        logger.info(f"Skip vocab files for model {name}, no GGUF file found")
455        continue
456
457    with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
458        for text in tests:
459            f.write(f"{text}")
460            f.write("\n__ggml_vocab_test__\n")
461
462    with open(f"models/ggml-vocab-{name}.gguf.out", "w") as f:
463        for text in tests:
464            res = tokenizer.encode(text, add_special_tokens=False)
465            for r in res:
466                f.write(f" {r}")
467            f.write("\n")
468
469    logger.info(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
470
471# generate commands for creating vocab files
472
473logger.info("\nRun the following commands to generate the vocab files for testing:\n")
474
475for model in models:
476    name = model["name"]
477
478    print(f"python3 convert_hf_to_gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100
479
480logger.info("\n")