1#!/usr/bin/env bash
 2
 3set -e
 4
 5# Parse command line arguments
 6MODEL_PATH=""
 7MODEL_NAME=""
 8PROMPTS_FILE=""
 9
10# First argument is always model path
11if [ $# -gt 0 ] && [[ "$1" != --* ]]; then
12    MODEL_PATH="$1"
13    shift
14fi
15
16# Parse remaining arguments
17while [[ $# -gt 0 ]]; do
18    case $1 in
19        --prompts-file|-pf)
20            PROMPTS_FILE="$2"
21            shift 2
22            ;;
23        *)
24            # If MODEL_NAME not set and this isn't a flag, use as model name
25            if [ -z "$MODEL_NAME" ] && [[ "$1" != --* ]]; then
26                MODEL_NAME="$1"
27            fi
28            shift
29            ;;
30    esac
31done
32
# Fall back to environment-provided values for anything not given on the
# command line.
if [[ -z "$MODEL_PATH" ]]; then
    MODEL_PATH="$EMBEDDING_MODEL_PATH"
fi
if [[ -z "$MODEL_NAME" ]]; then
    # Derive the name from the last path component of the model path.
    MODEL_NAME=$(basename "$MODEL_PATH")
fi

# The converted model path comes from CONVERTED_EMBEDDING_PATH, or from
# CONVERTED_EMBEDDING_MODEL when the former is unset or empty.
CONVERTED_MODEL_PATH="$CONVERTED_EMBEDDING_PATH"
if [[ -z "$CONVERTED_MODEL_PATH" ]]; then
    CONVERTED_MODEL_PATH="$CONVERTED_EMBEDDING_MODEL"
fi
if [[ -z "$CONVERTED_MODEL_NAME" ]]; then
    # File name of the converted model with a trailing ".gguf" removed.
    CONVERTED_MODEL_NAME=$(basename "$CONVERTED_MODEL_PATH" .gguf)
fi
39
#######################################
# Convert JSON embeddings read from stdin into a flat binary float file.
# The input is a JSON list of objects, each with an "embedding" key holding
# a list of per-token lists of numbers. All values are written back to back
# as native-endian C floats (struct format "f").
# Arguments: $1 - path of the output file
# Outputs:   the number of values written, on stderr
# Returns:   exit status of python3
#######################################
json_embeddings_to_bin() {
    # The output path is passed through argv rather than spliced into the
    # Python source, so unusual characters in the path cannot alter the code.
    python3 -c '
import json
import struct
import sys

data = json.load(sys.stdin)

# Flatten all embeddings completely
flattened = []
for item in data:
    for token_embedding in item["embedding"]:
        flattened.extend(token_embedding)

print(f"Total embedding values: {len(flattened)}", file=sys.stderr)

# Write as binary floats - intended to match the logits.cpp fwrite format
with open(sys.argv[1], "wb") as f:
    for value in flattened:
        f.write(struct.pack("f", value))
' "$1"
}

if [ -t 0 ]; then
    # Interactive stdin: use the embeddings file already present under data/.
    CPP_EMBEDDINGS="data/llamacpp-${CONVERTED_MODEL_NAME}-embeddings.bin"
else
    # Process piped JSON data and convert to binary.
    # The X placeholders are last in the template because BSD/macOS mktemp
    # only substitutes trailing X characters.
    TEMP_FILE=$(mktemp /tmp/llamacpp-embeddings.XXXXXX)
    # Register cleanup before converting so the file is removed even when
    # the conversion fails; single quotes defer expansion until the trap runs.
    trap 'rm -f -- "$TEMP_FILE"' EXIT
    json_embeddings_to_bin "$TEMP_FILE"
    CPP_EMBEDDINGS="$TEMP_FILE"
fi
69
#######################################
# Build the semantic_check.py command line as an argument array.
# An array keeps each value a single argument, so paths containing spaces or
# shell metacharacters are passed through untouched and no eval is needed.
# Globals:   MODEL_PATH, MODEL_NAME, CPP_EMBEDDINGS, PROMPTS_FILE (read)
#            SEMANTIC_CMD (written, array)
#######################################
build_semantic_cmd() {
    SEMANTIC_CMD=(
        python scripts/utils/semantic_check.py
        --model-path "$MODEL_PATH"
        --python-embeddings "data/pytorch-${MODEL_NAME}-embeddings.bin"
        --cpp-embeddings "$CPP_EMBEDDINGS"
    )

    # Add prompts file if specified, otherwise use default prompt
    if [[ -n "$PROMPTS_FILE" ]]; then
        SEMANTIC_CMD+=(--prompts-file "$PROMPTS_FILE")
    else
        SEMANTIC_CMD+=(--prompt "Hello world today")
    fi
}

build_semantic_cmd

# Execute the command
"${SEMANTIC_CMD[@]}"
84