1#!/usr/bin/env python3
2
import argparse
import importlib
import os
import sys
from pathlib import Path

import numpy as np

from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel

from common import compare_tokens, exit_with_warning # type: ignore[import-not-found]
11
# Optional override: name of a model class that is not yet registered with
# transformers' Auto* factories; when set, main() imports it directly from
# transformers.models.<name>.modular_<name>.
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')

def cosine_similarity(a, b=None):
    """Return the pairwise cosine-similarity matrix between rows of *a* and *b*.

    When *b* is omitted, similarities are computed between rows of *a* itself.
    1-D inputs are treated as a single row vector.  Zero-length rows are
    guarded with a tiny epsilon so the division is always defined.
    """
    a = np.asarray(a)
    b = a if b is None else np.asarray(b)

    if a.ndim == 1:
        a = a.reshape(1, -1)
    if b.ndim == 1:
        b = b.reshape(1, -1)

    # Normalize each row to unit length, substituting epsilon for zero norms.
    norm_a = np.linalg.norm(a, axis=1, keepdims=True)
    norm_b = np.linalg.norm(b, axis=1, keepdims=True)
    unit_a = a / np.where(norm_a == 0, 1e-8, norm_a)
    unit_b = b / np.where(norm_b == 0, 1e-8, norm_b)

    # Cosine similarity is the dot product of the unit vectors.
    return np.dot(unit_a, unit_b.T)

def load_embeddings_from_file(filename, n_tokens, n_embd):
    """Load raw float32 embeddings dumped to *filename*.

    Returns a (1, n_embd) array when the file holds a single pooled
    (sentence-level) embedding, or (n_tokens, n_embd) for per-token
    embeddings.  Raises ValueError with a descriptive message when the
    file size matches neither layout (the original code let reshape fail
    with an opaque error).
    """
    embeddings = np.fromfile(filename, dtype=np.float32)
    if len(embeddings) == n_embd:
        # Pooled mode: exactly one embedding vector in the file.
        return embeddings.reshape(1, n_embd)
    if len(embeddings) == n_tokens * n_embd:
        return embeddings.reshape(n_tokens, n_embd)
    raise ValueError(
        f"Embedding file '{filename}' contains {len(embeddings)} floats; "
        f"expected {n_embd} (pooled) or {n_tokens * n_embd} (per-token)"
    )

def _compare_pooled(python_emb, cpp_emb):
    """Compare a single pooled (sentence-level) embedding from each model."""
    print("\n[Pooled Embeddings Mode - comparing single sentence embeddings]")

    # 1. Magnitude check: are both vectors on the same "sphere"?
    print("\n1. Raw Embedding Magnitude Comparison:")
    py_mag = np.linalg.norm(python_emb[0])
    cpp_mag = np.linalg.norm(cpp_emb[0])
    ratio = py_mag / cpp_mag if cpp_mag > 0 else float('inf')
    print(f"  Pooled embedding: Python={py_mag:.3f}, llama.cpp={cpp_mag:.3f}, ratio={ratio:.3f}")

    # 2. Direction check: cosine similarity of the pooled vectors.
    print("\n2. Cross-Model Pooled Embedding Similarity:")
    sim = cosine_similarity([python_emb[0]], [cpp_emb[0]])[0][0]
    print(f"  Cosine similarity: {sim:.6f}")

    return {
        'cross_model_similarities': [sim],
        'similarity_matrix_diff': np.array([[0.0]]),
        'max_diff': 0.0,
        'mean_diff': 0.0,
        'rms_diff': 0.0
    }

def _compare_per_token(python_emb, cpp_emb, tokens):
    """Compare per-token embeddings: magnitudes, within- and cross-model similarities."""
    n_tokens = len(tokens)

    # 1. Direct embedding comparison.
    # Compare the distance of each token embedding from the origin to see if
    # the vectors lie on the same "sphere". This says nothing about direction
    # (meaning of the token embedding), only magnitude.
    print("\n1. Raw Embedding Magnitude Comparison:")
    for i in range(n_tokens):
        py_mag = np.linalg.norm(python_emb[i])   # Euclidean norm of the Python embedding
        cpp_mag = np.linalg.norm(cpp_emb[i])     # Euclidean norm of the llama.cpp embedding
        ratio = py_mag / cpp_mag if cpp_mag > 0 else float('inf')
        print(f"  Token {i} ({tokens[i]}): Python={py_mag:.3f}, llama.cpp={cpp_mag:.3f}, ratio={ratio:.3f}")

    # 2. Cosine similarity between token pairs within each model.
    # Checks the direction of token embeddings: do pairs of tokens relate the
    # same way inside each model?
    print("\n2. Within-Model Token Similarities:")
    print("  Python model:")
    for i in range(n_tokens):
        for j in range(i + 1, n_tokens):
            sim = cosine_similarity([python_emb[i]], [python_emb[j]])[0][0]
            print(f"    {tokens[i]} ↔ {tokens[j]}: {sim:.4f}")

    print("  llama.cpp model:")
    for i in range(n_tokens):
        for j in range(i + 1, n_tokens):
            sim = cosine_similarity([cpp_emb[i]], [cpp_emb[j]])[0][0]
            print(f"    {tokens[i]} ↔ {tokens[j]}: {sim:.4f}")

    # 3. Cross-model similarity at the same token position.
    # Collected once here and reused in the result (the original recomputed
    # every similarity a second time inside the return expression).
    print("\n3. Cross-Model Same-Token Similarities:")
    cross_sims = []
    for i in range(n_tokens):
        sim = cosine_similarity([python_emb[i]], [cpp_emb[i]])[0][0]
        cross_sims.append(sim)
        print(f"  Token {i} ({tokens[i]}): {sim:.4f}")

    # 4. Full similarity-matrix comparison between the two models.
    print("\n4. Similarity Matrix Differences:")
    py_sim_matrix = cosine_similarity(python_emb)
    cpp_sim_matrix = cosine_similarity(cpp_emb)
    diff_matrix = np.abs(py_sim_matrix - cpp_sim_matrix)

    print(f"  Max difference: {np.max(diff_matrix):.4f}")
    print(f"  Mean difference: {np.mean(diff_matrix):.4f}")
    print(f"  RMS difference: {np.sqrt(np.mean(diff_matrix**2)):.4f}")

    return {
        'cross_model_similarities': cross_sims,
        'similarity_matrix_diff': diff_matrix,
        'max_diff': np.max(diff_matrix),
        'mean_diff': np.mean(diff_matrix),
        'rms_diff': np.sqrt(np.mean(diff_matrix**2))
    }

def test_single_prompt_similarity(python_emb, cpp_emb, tokens, prompt):
    """Print a detailed similarity report between Python and llama.cpp embeddings.

    Dispatches to pooled (single sentence embedding) or per-token comparison
    based on the shape of *python_emb*, and returns a dict with
    'cross_model_similarities', 'similarity_matrix_diff', 'max_diff',
    'mean_diff' and 'rms_diff'.
    """
    np.set_printoptions(suppress=True, precision=6)
    print("pytorch embeddings:")
    print(python_emb)
    print("llama.cpp embeddings:")
    print(cpp_emb)
    print(f"\n=== Prompt: '{prompt}' ===")
    print(f"Tokens: {tokens}")
    print(f"Embeddings shape: Python {python_emb.shape}, llama.cpp {cpp_emb.shape}")

    # A single row means the embeddings were pooled into one sentence vector.
    if python_emb.shape[0] == 1:
        return _compare_pooled(python_emb, cpp_emb)
    return _compare_per_token(python_emb, cpp_emb, tokens)

def read_prompt_from_file(file_path):
    """Return the contents of *file_path*, stripped of surrounding whitespace.

    Exits the process with status 1 when the file is missing or unreadable.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read().strip()
    except FileNotFoundError:
        print(f"Error: Prompts file '{file_path}' not found")
        # sys.exit is the reliable form; the bare exit() builtin comes from the
        # site module and is not guaranteed to exist in all interpreters.
        sys.exit(1)
    except Exception as e:
        print(f"Error reading prompts file: {e}")
        sys.exit(1)

def main():
    """Entry point: compare embedding dumps from a Python model and llama.cpp.

    Verifies tokenization matches, loads both binary embedding files, prints a
    detailed similarity report and a final quality verdict.
    """
    parser = argparse.ArgumentParser(description='Test semantic similarity between Python and llama.cpp embeddings')
    parser.add_argument('--model-path', '-m', required=True, help='Path to the original Python model')
    # Both embedding files are mandatory: the original code passed them
    # straight to Path() and crashed with a TypeError when they were omitted.
    parser.add_argument('--python-embeddings', '-pe', required=True, help='Path to pytorch embeddings "logits" binary file')
    parser.add_argument('--cpp-embeddings', '-ce', required=True, help='Path to llama.cpp embeddings "logits" binary file')
    parser.add_argument('--causal', '-c', default=False, help='if the model is causal (default: false)', action='store_true')
    parser.add_argument('--prompt', '-p', default='Hello world today', help='Test prompt')
    parser.add_argument('--prompts-file', '-pf', help='Path to file containing prompts')

    args = parser.parse_args()

    prompt = read_prompt_from_file(args.prompts_file) if args.prompts_file else args.prompt

    python_emb_path = Path(args.python_embeddings)
    cpp_emb_path = Path(args.cpp_embeddings)

    # Extract base names (e.g., "pytorch-model-name-embeddings.bin" -> "pytorch-model-name")
    python_model_name = python_emb_path.stem.replace("-embeddings", "")
    cpp_model_name = cpp_emb_path.stem.replace("-embeddings", "")

    print("Semantic Similarity Test Between Python and llama.cpp Embedding Models")
    print("=" * 70)

    # First verify both runs tokenized the prompt identically before
    # comparing embeddings - otherwise the comparison is meaningless.
    print("\n🔍 Token Comparison Check")
    print("=" * 70)
    data_dir = python_emb_path.parent
    if not compare_tokens(python_model_name, cpp_model_name, type_suffix="-embeddings", output_dir=str(data_dir)):
        exit_with_warning("\n❌ Token mismatch detected", args.model_path)
    print()

    # Single prompt detailed comparison
    print(f"\nTesting with prompt: '{prompt}'")

    # Load the python model to get configuration information and the tokenizer.
    print("Loading model and tokenizer using AutoTokenizer:", args.model_path)
    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
    config = AutoConfig.from_pretrained(args.model_path, trust_remote_code=True)

    if unreleased_model_name:
        # Unreleased models are not registered with the Auto* factories yet,
        # so import the model class directly from its modular_* module.
        model_name_lower = unreleased_model_name.lower()
        unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
        class_name = f"{unreleased_model_name}ForCausalLM" if args.causal else f"{unreleased_model_name}Model"
        print(f"Model class: {class_name}")
        print(f"Importing unreleased model module: {unreleased_module_path}")

        try:
            model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
            model = model_class.from_pretrained(args.model_path)
        except (ImportError, AttributeError) as e:
            print(f"Failed to import or load model: {e}")
            sys.exit(1)
    else:
        if args.causal:
            model = AutoModelForCausalLM.from_pretrained(args.model_path, trust_remote_code=True)
        else:
            model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True)

    encoded = tokenizer(prompt, return_tensors="pt")
    tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
    n_tokens = len(tokens)
    print(f"n_tokens: {n_tokens}")
    print(f"hidden_size: {model.config.hidden_size}")

    # Load binary embeddings dumped by each implementation.
    llamacpp_embeddings = load_embeddings_from_file(args.cpp_embeddings, n_tokens, model.config.hidden_size)
    python_embeddings = load_embeddings_from_file(args.python_embeddings, n_tokens, model.config.hidden_size)

    # Run comparison
    results = test_single_prompt_similarity(python_embeddings, llamacpp_embeddings, tokens, prompt)

    # Summary
    print("\n=== SUMMARY ===")
    avg_cross_sim = np.mean(results['cross_model_similarities'])
    print(f"Average cross-model similarity: {avg_cross_sim:.4f}")
    print(f"Similarity matrix RMS difference: {results['rms_diff']:.4f}")

    # Quality assessment (emoji reconstructed from mojibake in the original)
    if avg_cross_sim > 0.95:
        print("✅ EXCELLENT: Models are highly similar")
    elif avg_cross_sim > 0.90:
        print("✅ VERY GOOD: Models are very similar")
    elif avg_cross_sim > 0.80:
        print("⚠️ GOOD: Models are reasonably similar")
    elif avg_cross_sim > 0.70:
        print("⚠️ FAIR: Models have some differences")
    else:
        exit_with_warning("❌ POOR: Models are significantly different", args.model_path)

# Run only when executed as a script, so the module can be imported
# (e.g. for its helpers) without side effects.
if __name__ == "__main__":
    main()