1#!/usr/bin/env python3
2
import argparse
import importlib
import os
import sys
from pathlib import Path

import numpy as np

from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel

from common import compare_tokens, exit_with_warning # type: ignore[import-not-found]
11
# Optional override: name of a model class that is not yet registered with
# transformers' Auto* factories; when set, main() imports it directly from
# transformers.models.<name>.modular_<name>.
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')

def cosine_similarity(a, b=None):
    """Return the pairwise cosine-similarity matrix between rows of *a* and *b*.

    When *b* is omitted, similarities are computed between rows of *a* itself.
    1-D inputs are treated as a single row vector.  Zero-length rows are
    guarded with a tiny epsilon so the division is always defined.
    """
    a = np.asarray(a)
    b = a if b is None else np.asarray(b)

    if a.ndim == 1:
        a = a.reshape(1, -1)
    if b.ndim == 1:
        b = b.reshape(1, -1)

    # Normalize each row to unit length, substituting epsilon for zero norms.
    norm_a = np.linalg.norm(a, axis=1, keepdims=True)
    norm_b = np.linalg.norm(b, axis=1, keepdims=True)
    unit_a = a / np.where(norm_a == 0, 1e-8, norm_a)
    unit_b = b / np.where(norm_b == 0, 1e-8, norm_b)

    # Cosine similarity is the dot product of the unit vectors.
    return np.dot(unit_a, unit_b.T)

def load_embeddings_from_file(filename, n_tokens, n_embd):
    """Load raw float32 embeddings dumped to *filename*.

    Returns a (1, n_embd) array when the file holds a single pooled
    (sentence-level) embedding, or (n_tokens, n_embd) for per-token
    embeddings.  Raises ValueError with a descriptive message when the
    file size matches neither layout (the original code let reshape fail
    with an opaque error).
    """
    embeddings = np.fromfile(filename, dtype=np.float32)
    if len(embeddings) == n_embd:
        # Pooled mode: exactly one embedding vector in the file.
        return embeddings.reshape(1, n_embd)
    if len(embeddings) == n_tokens * n_embd:
        return embeddings.reshape(n_tokens, n_embd)
    raise ValueError(
        f"Embedding file '{filename}' contains {len(embeddings)} floats; "
        f"expected {n_embd} (pooled) or {n_tokens * n_embd} (per-token)"
    )

def _compare_pooled(python_emb, cpp_emb):
    """Compare a single pooled (sentence-level) embedding from each model."""
    print("\n[Pooled Embeddings Mode - comparing single sentence embeddings]")

    # 1. Magnitude check: are both vectors on the same "sphere"?
    print("\n1. Raw Embedding Magnitude Comparison:")
    py_mag = np.linalg.norm(python_emb[0])
    cpp_mag = np.linalg.norm(cpp_emb[0])
    ratio = py_mag / cpp_mag if cpp_mag > 0 else float('inf')
    print(f"  Pooled embedding: Python={py_mag:.3f}, llama.cpp={cpp_mag:.3f}, ratio={ratio:.3f}")

    # 2. Direction check: cosine similarity of the pooled vectors.
    print("\n2. Cross-Model Pooled Embedding Similarity:")
    sim = cosine_similarity([python_emb[0]], [cpp_emb[0]])[0][0]
    print(f"  Cosine similarity: {sim:.6f}")

    return {
        'cross_model_similarities': [sim],
        'similarity_matrix_diff': np.array([[0.0]]),
        'max_diff': 0.0,
        'mean_diff': 0.0,
        'rms_diff': 0.0
    }

def _compare_per_token(python_emb, cpp_emb, tokens):
    """Compare per-token embeddings: magnitudes, within- and cross-model similarities."""
    n_tokens = len(tokens)

    # 1. Direct embedding comparison.
    # Compare the distance of each token embedding from the origin to see if
    # the vectors lie on the same "sphere". This says nothing about direction
    # (meaning of the token embedding), only magnitude.
    print("\n1. Raw Embedding Magnitude Comparison:")
    for i in range(n_tokens):
        py_mag = np.linalg.norm(python_emb[i])   # Euclidean norm of the Python embedding
        cpp_mag = np.linalg.norm(cpp_emb[i])     # Euclidean norm of the llama.cpp embedding
        ratio = py_mag / cpp_mag if cpp_mag > 0 else float('inf')
        print(f"  Token {i} ({tokens[i]}): Python={py_mag:.3f}, llama.cpp={cpp_mag:.3f}, ratio={ratio:.3f}")

    # 2. Cosine similarity between token pairs within each model.
    # Checks the direction of token embeddings: do pairs of tokens relate the
    # same way inside each model?
    print("\n2. Within-Model Token Similarities:")
    print("  Python model:")
    for i in range(n_tokens):
        for j in range(i + 1, n_tokens):
            sim = cosine_similarity([python_emb[i]], [python_emb[j]])[0][0]
            print(f"    {tokens[i]} ↔ {tokens[j]}: {sim:.4f}")

    print("  llama.cpp model:")
    for i in range(n_tokens):
        for j in range(i + 1, n_tokens):
            sim = cosine_similarity([cpp_emb[i]], [cpp_emb[j]])[0][0]
            print(f"    {tokens[i]} ↔ {tokens[j]}: {sim:.4f}")

    # 3. Cross-model similarity at the same token position.
    # Collected once here and reused in the result (the original recomputed
    # every similarity a second time inside the return expression).
    print("\n3. Cross-Model Same-Token Similarities:")
    cross_sims = []
    for i in range(n_tokens):
        sim = cosine_similarity([python_emb[i]], [cpp_emb[i]])[0][0]
        cross_sims.append(sim)
        print(f"  Token {i} ({tokens[i]}): {sim:.4f}")

    # 4. Full similarity-matrix comparison between the two models.
    print("\n4. Similarity Matrix Differences:")
    py_sim_matrix = cosine_similarity(python_emb)
    cpp_sim_matrix = cosine_similarity(cpp_emb)
    diff_matrix = np.abs(py_sim_matrix - cpp_sim_matrix)

    print(f"  Max difference: {np.max(diff_matrix):.4f}")
    print(f"  Mean difference: {np.mean(diff_matrix):.4f}")
    print(f"  RMS difference: {np.sqrt(np.mean(diff_matrix**2)):.4f}")

    return {
        'cross_model_similarities': cross_sims,
        'similarity_matrix_diff': diff_matrix,
        'max_diff': np.max(diff_matrix),
        'mean_diff': np.mean(diff_matrix),
        'rms_diff': np.sqrt(np.mean(diff_matrix**2))
    }

def test_single_prompt_similarity(python_emb, cpp_emb, tokens, prompt):
    """Print a detailed similarity report between Python and llama.cpp embeddings.

    Dispatches to pooled (single sentence embedding) or per-token comparison
    based on the shape of *python_emb*, and returns a dict with
    'cross_model_similarities', 'similarity_matrix_diff', 'max_diff',
    'mean_diff' and 'rms_diff'.
    """
    np.set_printoptions(suppress=True, precision=6)
    print("pytorch embeddings:")
    print(python_emb)
    print("llama.cpp embeddings:")
    print(cpp_emb)
    print(f"\n=== Prompt: '{prompt}' ===")
    print(f"Tokens: {tokens}")
    print(f"Embeddings shape: Python {python_emb.shape}, llama.cpp {cpp_emb.shape}")

    # A single row means the embeddings were pooled into one sentence vector.
    if python_emb.shape[0] == 1:
        return _compare_pooled(python_emb, cpp_emb)
    return _compare_per_token(python_emb, cpp_emb, tokens)

def read_prompt_from_file(file_path):
    """Return the contents of *file_path*, stripped of surrounding whitespace.

    Exits the process with status 1 when the file is missing or unreadable.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read().strip()
    except FileNotFoundError:
        print(f"Error: Prompts file '{file_path}' not found")
        # sys.exit is the reliable form; the bare exit() builtin comes from the
        # site module and is not guaranteed to exist in all interpreters.
        sys.exit(1)
    except Exception as e:
        print(f"Error reading prompts file: {e}")
        sys.exit(1)

def main():
    """Entry point: compare embedding dumps from a Python model and llama.cpp.

    Verifies tokenization matches, loads both binary embedding files, prints a
    detailed similarity report and a final quality verdict.
    """
    parser = argparse.ArgumentParser(description='Test semantic similarity between Python and llama.cpp embeddings')
    parser.add_argument('--model-path', '-m', required=True, help='Path to the original Python model')
    # Both embedding files are mandatory: the original code passed them
    # straight to Path() and crashed with a TypeError when they were omitted.
    parser.add_argument('--python-embeddings', '-pe', required=True, help='Path to pytorch embeddings "logits" binary file')
    parser.add_argument('--cpp-embeddings', '-ce', required=True, help='Path to llama.cpp embeddings "logits" binary file')
    parser.add_argument('--causal', '-c', default=False, help='if the model is causal (default: false)', action='store_true')
    parser.add_argument('--prompt', '-p', default='Hello world today', help='Test prompt')
    parser.add_argument('--prompts-file', '-pf', help='Path to file containing prompts')

    args = parser.parse_args()

    prompt = read_prompt_from_file(args.prompts_file) if args.prompts_file else args.prompt

    python_emb_path = Path(args.python_embeddings)
    cpp_emb_path = Path(args.cpp_embeddings)

    # Extract base names (e.g., "pytorch-model-name-embeddings.bin" -> "pytorch-model-name")
    python_model_name = python_emb_path.stem.replace("-embeddings", "")
    cpp_model_name = cpp_emb_path.stem.replace("-embeddings", "")

    print("Semantic Similarity Test Between Python and llama.cpp Embedding Models")
    print("=" * 70)

    # First verify both runs tokenized the prompt identically before
    # comparing embeddings - otherwise the comparison is meaningless.
    print("\n🔍 Token Comparison Check")
    print("=" * 70)
    data_dir = python_emb_path.parent
    if not compare_tokens(python_model_name, cpp_model_name, type_suffix="-embeddings", output_dir=str(data_dir)):
        exit_with_warning("\n❌ Token mismatch detected", args.model_path)
    print()

    # Single prompt detailed comparison
    print(f"\nTesting with prompt: '{prompt}'")

    # Load the python model to get configuration information and the tokenizer.
    print("Loading model and tokenizer using AutoTokenizer:", args.model_path)
    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
    config = AutoConfig.from_pretrained(args.model_path, trust_remote_code=True)

    if unreleased_model_name:
        # Unreleased models are not registered with the Auto* factories yet,
        # so import the model class directly from its modular_* module.
        model_name_lower = unreleased_model_name.lower()
        unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
        class_name = f"{unreleased_model_name}ForCausalLM" if args.causal else f"{unreleased_model_name}Model"
        print(f"Model class: {class_name}")
        print(f"Importing unreleased model module: {unreleased_module_path}")

        try:
            model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
            model = model_class.from_pretrained(args.model_path)
        except (ImportError, AttributeError) as e:
            print(f"Failed to import or load model: {e}")
            sys.exit(1)
    else:
        if args.causal:
            model = AutoModelForCausalLM.from_pretrained(args.model_path, trust_remote_code=True)
        else:
            model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True)

    encoded = tokenizer(prompt, return_tensors="pt")
    tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
    n_tokens = len(tokens)
    print(f"n_tokens: {n_tokens}")
    print(f"hidden_size: {model.config.hidden_size}")

    # Load binary embeddings dumped by each implementation.
    llamacpp_embeddings = load_embeddings_from_file(args.cpp_embeddings, n_tokens, model.config.hidden_size)
    python_embeddings = load_embeddings_from_file(args.python_embeddings, n_tokens, model.config.hidden_size)

    # Run comparison
    results = test_single_prompt_similarity(python_embeddings, llamacpp_embeddings, tokens, prompt)

    # Summary
    print("\n=== SUMMARY ===")
    avg_cross_sim = np.mean(results['cross_model_similarities'])
    print(f"Average cross-model similarity: {avg_cross_sim:.4f}")
    print(f"Similarity matrix RMS difference: {results['rms_diff']:.4f}")

    # Quality assessment (emoji reconstructed from mojibake in the original)
    if avg_cross_sim > 0.95:
        print("✅ EXCELLENT: Models are highly similar")
    elif avg_cross_sim > 0.90:
        print("✅ VERY GOOD: Models are very similar")
    elif avg_cross_sim > 0.80:
        print("⚠️ GOOD: Models are reasonably similar")
    elif avg_cross_sim > 0.70:
        print("⚠️ FAIR: Models have some differences")
    else:
        exit_with_warning("❌ POOR: Models are significantly different", args.model_path)

# Run only when executed as a script, so the module can be imported
# (e.g. for its helpers) without side effects.
if __name__ == "__main__":
    main()