1#!/usr/bin/env bash
2
3set -e
4
# Parse command line arguments.
#
# Accepted form: [MODEL_PATH] [MODEL_NAME] [--prompts-file|-pf FILE]
#
# Sets the globals:
#   MODEL_PATH    - first argument, when it is not an option flag
#   MODEL_NAME    - first later argument that is not an option flag
#   PROMPTS_FILE  - value given to --prompts-file / -pf
# Unrecognised flags and surplus positional arguments are ignored.
# Exits with status 1 when --prompts-file / -pf is given without a value.
parse_args() {
    MODEL_PATH=""
    MODEL_NAME=""
    PROMPTS_FILE=""

    # The leading positional argument, if any, is the model path. Anything
    # starting with '-' is a flag (this covers the short form -pf as well as
    # long options), so it must not be consumed as a path.
    if [[ $# -gt 0 && "$1" != -* ]]; then
        MODEL_PATH="$1"
        shift
    fi

    # Parse remaining arguments
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --prompts-file | -pf)
                # Report a missing value explicitly; a bare 'shift 2' would
                # fail here and abort the script without any diagnostic.
                if [[ $# -lt 2 ]]; then
                    printf 'error: %s requires a file argument\n' "$1" >&2
                    exit 1
                fi
                PROMPTS_FILE="$2"
                shift 2
                ;;
            *)
                # If MODEL_NAME is not set and this isn't a flag, use it as
                # the model name.
                if [[ -z "$MODEL_NAME" && "$1" != -* ]]; then
                    MODEL_NAME="$1"
                fi
                shift
                ;;
        esac
    done
}

parse_args "$@"
32
# Fill in whatever the command line left empty.

# Model path falls back to the EMBEDDING_MODEL_PATH environment variable.
if [[ -z "$MODEL_PATH" ]]; then
    MODEL_PATH="$EMBEDDING_MODEL_PATH"
fi

# Model name falls back to the last path component of the model path.
if [[ -z "$MODEL_NAME" ]]; then
    MODEL_NAME="$(basename "$MODEL_PATH")"
fi

# Converted model path: CONVERTED_EMBEDDING_PATH wins when non-empty,
# otherwise CONVERTED_EMBEDDING_MODEL is used.
CONVERTED_MODEL_PATH="$CONVERTED_EMBEDDING_PATH"
if [[ -z "$CONVERTED_MODEL_PATH" ]]; then
    CONVERTED_MODEL_PATH="$CONVERTED_EMBEDDING_MODEL"
fi

# Converted model name: keep a pre-set value, otherwise use the converted
# model's file name with the .gguf suffix removed.
if [[ -z "$CONVERTED_MODEL_NAME" ]]; then
    CONVERTED_MODEL_NAME="$(basename "$CONVERTED_MODEL_PATH" .gguf)"
fi
39
# Convert embeddings JSON read from stdin into a raw binary float file.
#
# Arguments: $1 - path of the output file (created or truncated)
# Input:     JSON on stdin: a list of objects whose 'embedding' entry is a
#            list of per-token lists of numbers
# Outputs:   writes the values to $1 as consecutive native floats
#            (struct format 'f'); prints the value count to stderr
# Returns:   the exit status of python3
json_embeddings_to_bin() {
    # The output path is passed as an argument instead of being spliced into
    # the Python source, so quotes or backslashes in it cannot break or alter
    # the program.
    python3 -c '
import json
import struct
import sys

data = json.load(sys.stdin)

# Flatten all embeddings completely
flattened = [
    value
    for item in data
    for token_embedding in item["embedding"]
    for value in token_embedding
]

print(f"Total embedding values: {len(flattened)}", file=sys.stderr)

# Write as binary floats - matches logits.cpp fwrite format
with open(sys.argv[1], "wb") as f:
    f.write(struct.pack(f"{len(flattened)}f", *flattened))
' "$1"
}

if [[ -t 0 ]]; then
    # Nothing is piped in: use the embeddings file already on disk.
    CPP_EMBEDDINGS="data/llamacpp-${CONVERTED_MODEL_NAME}-embeddings.bin"
else
    # Process piped JSON data and convert to binary (matching logits.cpp format)
    # The X's are kept at the end of the template: BSD/macOS mktemp only
    # substitutes trailing X's, so a suffix would yield a fixed file name.
    TEMP_FILE=$(mktemp /tmp/tmp.XXXXXX)
    # Register cleanup before converting so the file is removed even when
    # the conversion fails and 'set -e' aborts the script.
    trap 'rm -f -- "$TEMP_FILE"' EXIT
    json_embeddings_to_bin "$TEMP_FILE"
    CPP_EMBEDDINGS="$TEMP_FILE"
fi
69
# Build the semantic_check.py command line into the SEMANTIC_CMD array.
#
# Globals read:    MODEL_PATH, MODEL_NAME, CPP_EMBEDDINGS, PROMPTS_FILE
# Globals written: SEMANTIC_CMD (array, one element per argument)
#
# An array is used instead of a string passed to eval so that every value
# reaches the program as exactly one argument: spaces do not split it, and
# quotes, globs or $(...) inside a value are never interpreted by the shell.
build_semantic_cmd() {
    SEMANTIC_CMD=(
        python scripts/utils/semantic_check.py
        --model-path "$MODEL_PATH"
        --python-embeddings "data/pytorch-${MODEL_NAME}-embeddings.bin"
        --cpp-embeddings "$CPP_EMBEDDINGS"
    )

    # Add prompts file if specified, otherwise use default prompt
    if [[ -n "$PROMPTS_FILE" ]]; then
        SEMANTIC_CMD+=(--prompts-file "$PROMPTS_FILE")
    else
        SEMANTIC_CMD+=(--prompt "Hello world today")
    fi
}

build_semantic_cmd

# Execute the command
"${SEMANTIC_CMD[@]}"
84