diff options
Diffstat (limited to 'llama.cpp/tests/test-tokenizer-0.sh')
| -rwxr-xr-x | llama.cpp/tests/test-tokenizer-0.sh | 41 |
1 files changed, 41 insertions, 0 deletions
diff --git a/llama.cpp/tests/test-tokenizer-0.sh b/llama.cpp/tests/test-tokenizer-0.sh new file mode 100755 index 0000000..7ef009d --- /dev/null +++ b/llama.cpp/tests/test-tokenizer-0.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash +# +# Usage: +# +# test-tokenizer-0.sh <name> <input> +# + +if [ $# -ne 2 ]; then + printf "Usage: $0 <name> <input>\n" + exit 1 +fi + +name=$1 +input=$2 + +make -j tests/test-tokenizer-0 + +printf "Testing %s on %s ...\n" $name $input + +set -e + +printf "Tokenizing using (py) Python AutoTokenizer ...\n" +python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1 + +printf "Tokenizing using (cpp) llama.cpp ...\n" +./tests/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1 + +cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in" +cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in" + +set +e + +diff $input.tok $input.tokcpp > /dev/null 2>&1 + +if [ $? -eq 0 ]; then + printf "Tokenization is correct!\n" +else + diff $input.tok $input.tokcpp | head -n 32 + + printf "Tokenization differs!\n" +fi |
