summaryrefslogtreecommitdiff
path: root/llama.cpp/tests/test-tokenizer-0.sh
diff options
context:
space:
mode:
authorMitja Felicijan <mitja.felicijan@gmail.com>2026-02-12 20:57:17 +0100
committerMitja Felicijan <mitja.felicijan@gmail.com>2026-02-12 20:57:17 +0100
commitb333b06772c89d96aacb5490d6a219fba7c09cc6 (patch)
tree211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/tests/test-tokenizer-0.sh
downloadllmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz
Engage!
Diffstat (limited to 'llama.cpp/tests/test-tokenizer-0.sh')
-rwxr-xr-xllama.cpp/tests/test-tokenizer-0.sh41
1 files changed, 41 insertions, 0 deletions
diff --git a/llama.cpp/tests/test-tokenizer-0.sh b/llama.cpp/tests/test-tokenizer-0.sh
new file mode 100755
index 0000000..7ef009d
--- /dev/null
+++ b/llama.cpp/tests/test-tokenizer-0.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+#
+# Compare Python AutoTokenizer output with llama.cpp's for one input file.
+#
+# Usage: test-tokenizer-0.sh <name> <input>
+#
+# Paths are relative to the current directory; logs are written under /tmp.
+# The files compared at the end are <input>.tok and <input>.tokcpp.
+
+if [ $# -ne 2 ]; then
+    # $0 is passed as an argument so a '%' in the path is not read as a format.
+    printf 'Usage: %s <name> <input>\n' "$0" >&2
+    exit 1
+fi
+
+name=$1
+input=$2
+py_log="/tmp/test-tokenizer-0-$name-py.log"
+cpp_log="/tmp/test-tokenizer-0-$name-cpp.log"
+
+# Runs before `set -e`, so a failed build does not stop the script.
+make -j tests/test-tokenizer-0
+
+printf "Testing %s on %s ...\n" "$name" "$input"
+set -e # any failing step below (including a grep with no match) aborts
+printf "Tokenizing using (py) Python AutoTokenizer ...\n"
+python3 ./tests/test-tokenizer-0.py "./models/tokenizers/$name" --fname-tok "$input" > "$py_log" 2>&1
+
+printf "Tokenizing using (cpp) llama.cpp ...\n"
+./tests/test-tokenizer-0 "./models/ggml-vocab-$name.gguf" "$input" > "$cpp_log" 2>&1
+
+grep "tokenized in" "$py_log"
+grep "tokenized in" "$cpp_log"
+set +e
+
+if diff -- "$input.tok" "$input.tokcpp" > /dev/null 2>&1; then
+    printf "Tokenization is correct!\n"
+else
+    diff -- "$input.tok" "$input.tokcpp" | head -n 32
+    printf "Tokenization differs!\n"
+fi