#!/usr/bin/env bash
#
# Tokenizes <input> twice -- once with ./tests/test-tokenizer-0.py and once
# with the ./tests/test-tokenizer-0 binary -- and diffs the two results
# (<input>.tok vs <input>.tokcpp).
#
# Usage:
#
#   test-tokenizer-0.sh <name> <input>
#
#   <name>   selects ./models/tokenizers/<name> (passed to the Python script)
#            and ./models/ggml-vocab-<name>.gguf (passed to the binary)
#   <input>  text file to tokenize
#
# The output of each tokenizer run is captured in:
#   /tmp/test-tokenizer-0-<name>-py.log
#   /tmp/test-tokenizer-0-<name>-cpp.log
#
# Exit status: 1 on bad usage; the failing command's status if a tokenizer
# run fails; 1 if a log has no "tokenized in" line; 0 otherwise.
# NOTE(review): the script exits 0 even when the tokenizations differ; this
# is kept as-is so existing callers are not affected -- confirm whether a
# non-zero status on mismatch is wanted.
#
# Reconstructed from the patch that created llama.cpp/tests/test-tokenizer-0.sh
# (commit b333b06772c89d96aacb5490d6a219fba7c09cc6, mode 100755).
#

if [ $# -ne 2 ]; then
    # $0 is passed as an argument, never as part of the format string, so a
    # '%' or backslash in the script path cannot corrupt the message.
    printf 'Usage: %s <name> <input>\n' "$0" >&2
    exit 1
fi

name=$1
input=$2

# Log locations are unchanged from the original script so that anyone who
# inspects them by path keeps working.
py_log="/tmp/test-tokenizer-0-$name-py.log"
cpp_log="/tmp/test-tokenizer-0-$name-cpp.log"

# run_logged LOG CMD [ARGS...]
# Runs CMD with stdout and stderr captured in LOG. On failure, reports which
# command failed and where its output went, then exits with CMD's status.
run_logged() {
    local log=$1
    shift
    local rc=0
    "$@" > "$log" 2>&1 || rc=$?
    if [ "$rc" -ne 0 ]; then
        printf 'error: %s failed (exit %d); see %s\n' "$1" "$rc" "$log" >&2
        exit "$rc"
    fi
}

# show_timing LOG
# Prints the "tokenized in" summary line(s) from LOG. A missing summary
# aborts the script with status 1, with an explanation on stderr.
show_timing() {
    local log=$1
    if ! grep "tokenized in" "$log"; then
        printf 'error: no "tokenized in" line found in %s\n' "$log" >&2
        exit 1
    fi
}

# Build the C++ test binary. As in the original (where make ran before
# `set -e`), a failed build is not fatal, but it is no longer silent.
make -j tests/test-tokenizer-0 ||
    printf 'warning: make failed; continuing with any existing ./tests/test-tokenizer-0\n' >&2

printf "Testing %s on %s ...\n" "$name" "$input"

set -e

printf "Tokenizing using (py) Python AutoTokenizer ...\n"
run_logged "$py_log" \
    python3 ./tests/test-tokenizer-0.py "./models/tokenizers/$name" --fname-tok "$input"

printf "Tokenizing using (cpp) llama.cpp ...\n"
run_logged "$cpp_log" \
    ./tests/test-tokenizer-0 "./models/ggml-vocab-$name.gguf" "$input"

show_timing "$py_log"
show_timing "$cpp_log"

# `if diff` is exempt from `set -e`, and the status of `diff | head` is that
# of head, so the original `set +e` toggle is not needed here.
if diff "$input.tok" "$input.tokcpp" > /dev/null 2>&1; then
    printf "Tokenization is correct!\n"
else
    # Show only the first 32 lines of the difference to keep output readable.
    diff "$input.tok" "$input.tokcpp" | head -n 32

    printf "Tokenization differs!\n"
fi