#!/usr/bin/env bash
#
# Tokenize a text file with both the Python AutoTokenizer script and the
# llama.cpp test binary, then diff the two resulting token files.
#
# Usage:
#
#   test-tokenizer-0.sh <name> <input>
#
#   <name>   tokenizer name; selects ./models/tokenizers/<name> and
#            ./models/ggml-vocab-<name>.gguf
#   <input>  file to tokenize; <input>.tok is compared against <input>.tokcpp
#            (presumably written by the Python and C++ runs respectively)
#
# Exit status:
#
#   0  the two token files are identical
#   1  usage error, build failure, or the tokenization differs
#   *  exit status of a tokenizer command that failed
#

if [[ $# -ne 2 ]]; then
    # Pass the script name as an argument, never as part of the format string.
    printf 'Usage: %s <name> <input>\n' "$0" >&2
    exit 1
fi

set -eu

name=$1
input=$2

# The logs keep their fixed, name-derived paths so they can be inspected
# after the run.
py_log="/tmp/test-tokenizer-0-$name-py.log"
cpp_log="/tmp/test-tokenizer-0-$name-cpp.log"

# Report a failed tokenizer run, point at its captured log, and exit with the
# status of the failed command.
#   $1 - exit status of the failed command
#   $2 - human-readable description of the step
#   $3 - path of the log holding the command's stdout/stderr
fail_with_log() {
    local status=$1
    local what=$2
    local log=$3

    printf 'error: %s failed (exit %d); see %s\n' "$what" "$status" "$log" >&2
    exit "$status"
}

# A failed build must stop the run; otherwise a stale or missing binary
# would be tested.
if ! make -j tests/test-tokenizer-0; then
    printf 'error: failed to build tests/test-tokenizer-0\n' >&2
    exit 1
fi

printf "Testing %s on %s ...\n" "$name" "$input"

printf "Tokenizing using (py)  Python AutoTokenizer ...\n"
python3 ./tests/test-tokenizer-0.py "./models/tokenizers/$name" --fname-tok "$input" > "$py_log" 2>&1 \
    || fail_with_log "$?" "Python tokenizer" "$py_log"

printf "Tokenizing using (cpp) llama.cpp ...\n"
./tests/test-tokenizer-0 "./models/ggml-vocab-$name.gguf" "$input" > "$cpp_log" 2>&1 \
    || fail_with_log "$?" "llama.cpp tokenizer" "$cpp_log"

# Show the timing lines from both logs. Under `set -e` a log without a
# "tokenized in" line aborts the script here.
grep -- "tokenized in" "$py_log"
grep -- "tokenized in" "$cpp_log"

# Run diff once and keep its output (including any error message, e.g. for a
# missing token file) so it can be shown on mismatch.
if diff_out=$(diff -- "$input.tok" "$input.tokcpp" 2>&1); then
    printf "Tokenization is correct!\n"
else
    # Only the first 32 lines are shown to keep the report readable.
    head -n 32 <<< "$diff_out"

    printf "Tokenization differs!\n"
    exit 1
fi