diff options
| author | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
|---|---|---|
| committer | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
| commit | b333b06772c89d96aacb5490d6a219fba7c09cc6 (patch) | |
| tree | 211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/examples/model-conversion/scripts/utils/semantic_check.py | |
| download | llmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz | |
Engage!
Diffstat (limited to 'llama.cpp/examples/model-conversion/scripts/utils/semantic_check.py')
| -rw-r--r-- | llama.cpp/examples/model-conversion/scripts/utils/semantic_check.py | 242 |
1 files changed, 242 insertions, 0 deletions
diff --git a/llama.cpp/examples/model-conversion/scripts/utils/semantic_check.py b/llama.cpp/examples/model-conversion/scripts/utils/semantic_check.py new file mode 100644 index 0000000..73e20ea --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/utils/semantic_check.py @@ -0,0 +1,242 @@ +#!/usr/bin/env python3 + +import numpy as np +import argparse +import os +import importlib +from pathlib import Path + +from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel +from common import compare_tokens, exit_with_warning # type: ignore[import-not-found] + +unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME') + +def cosine_similarity(a, b=None): + a = np.asarray(a) + if b is None: + b = a + else: + b = np.asarray(b) + + if a.ndim == 1: + a = a.reshape(1, -1) + if b.ndim == 1: + b = b.reshape(1, -1) + + a_norms = np.linalg.norm(a, axis=1, keepdims=True) + b_norms = np.linalg.norm(b, axis=1, keepdims=True) + + a_norms = np.where(a_norms == 0, 1e-8, a_norms) + b_norms = np.where(b_norms == 0, 1e-8, b_norms) + + a_normalized = a / a_norms + b_normalized = b / b_norms + + # Compute cosine similarity + return np.dot(a_normalized, b_normalized.T) + +def load_embeddings_from_file(filename, n_tokens, n_embd): + embeddings = np.fromfile(filename, dtype=np.float32) + # Check if this is pooled (single embedding) or per-token embeddings + if len(embeddings) == n_embd: + return embeddings.reshape(1, n_embd) + else: + return embeddings.reshape(n_tokens, n_embd) + +def test_single_prompt_similarity(python_emb, cpp_emb, tokens, prompt): + np.set_printoptions(suppress=True, precision=6) + print("pytorch embeddings:"); + print(python_emb) + print("llama.cpp embeddings:"); + print(cpp_emb) + print(f"\n=== Prompt: '{prompt}' ===") + print(f"Tokens: {tokens}") + print(f"Embeddings shape: Python {python_emb.shape}, llama.cpp {cpp_emb.shape}") + + n_tokens = len(tokens) + is_pooled = python_emb.shape[0] == 1 + + if is_pooled: + print(f"\n[Pooled Embeddings Mode - comparing single sentence embeddings]") + + # 1. Direct embedding comparison for pooled embeddings + print(f"\n1. Raw Embedding Magnitude Comparison:") + py_mag = np.linalg.norm(python_emb[0]) + cpp_mag = np.linalg.norm(cpp_emb[0]) + ratio = py_mag / cpp_mag if cpp_mag > 0 else float('inf') + print(f" Pooled embedding: Python={py_mag:.3f}, llama.cpp={cpp_mag:.3f}, ratio={ratio:.3f}") + + # 2. Cross-model similarity for pooled embeddings + print(f"\n2. Cross-Model Pooled Embedding Similarity:") + sim = cosine_similarity([python_emb[0]], [cpp_emb[0]])[0][0] + print(f" Cosine similarity: {sim:.6f}") + + return { + 'cross_model_similarities': [sim], + 'similarity_matrix_diff': np.array([[0.0]]), + 'max_diff': 0.0, + 'mean_diff': 0.0, + 'rms_diff': 0.0 + } + else: + # Original per-token comparison logic + # 1. Direct embedding comparison + print(f"\n1. Raw Embedding Magnitude Comparison:") + # Check if the distance of each token embedding from the origin and compare + # if the vectors are on the same "sphere". This does not tell us about + # direction (meaning of the token embedding), just magnitude. + for i in range(n_tokens): + py_mag = np.linalg.norm(python_emb[i]) # calculate standard euclidean norm for Python embeddings + cpp_mag = np.linalg.norm(cpp_emb[i]) # calculate standard euclidean norm for llama.cpp embeddings + ratio = py_mag / cpp_mag if cpp_mag > 0 else float('inf') + print(f" Token {i} ({tokens[i]}): Python={py_mag:.3f}, llama.cpp={cpp_mag:.3f}, ratio={ratio:.3f}") + + # 2. Cosine similarity between tokens within each model + # Here we check the direction of token embeddings to see if the have the + # same meaning (similarity). This is done by calculating cosine similarity + # of a pair of token embeddings within each model. + print(f"\n2. Within-Model Token Similarities:") + print(" Python model:") + for i in range(n_tokens): + for j in range(i+1, n_tokens): + sim = cosine_similarity([python_emb[i]], [python_emb[j]])[0][0] + print(f" {tokens[i]} ā {tokens[j]}: {sim:.4f}") + + print(" llama.cpp model:") + for i in range(n_tokens): + for j in range(i+1, n_tokens): + sim = cosine_similarity([cpp_emb[i]], [cpp_emb[j]])[0][0] + print(f" {tokens[i]} ā {tokens[j]}: {sim:.4f}") + + # 3. Cross-model similarity (same token position) + print(f"\n3. Cross-Model Same-Token Similarities:") + for i in range(n_tokens): + sim = cosine_similarity([python_emb[i]], [cpp_emb[i]])[0][0] + print(f" Token {i} ({tokens[i]}): {sim:.4f}") + + # 4. Similarity matrix comparison + print(f"\n4. Similarity Matrix Differences:") + py_sim_matrix = cosine_similarity(python_emb) + cpp_sim_matrix = cosine_similarity(cpp_emb) + diff_matrix = np.abs(py_sim_matrix - cpp_sim_matrix) + + print(f" Max difference: {np.max(diff_matrix):.4f}") + print(f" Mean difference: {np.mean(diff_matrix):.4f}") + print(f" RMS difference: {np.sqrt(np.mean(diff_matrix**2)):.4f}") + + return { + 'cross_model_similarities': [cosine_similarity([python_emb[i]], [cpp_emb[i]])[0][0] for i in range(n_tokens)], + 'similarity_matrix_diff': diff_matrix, + 'max_diff': np.max(diff_matrix), + 'mean_diff': np.mean(diff_matrix), + 'rms_diff': np.sqrt(np.mean(diff_matrix**2)) + } + +def read_prompt_from_file(file_path): + try: + with open(file_path, 'r', encoding='utf-8') as f: + return f.read().strip() + except FileNotFoundError: + print(f"Error: Prompts file '{file_path}' not found") + exit(1) + except Exception as e: + print(f"Error reading prompts file: {e}") + exit(1) + +def main(): + parser = argparse.ArgumentParser(description='Test semantic similarity between Python and llama.cpp embeddings') + parser.add_argument('--model-path', '-m', required=True, help='Path to the original Python model') + parser.add_argument('--python-embeddings', '-pe', help='Path to pytorch embeddings "logits" binary file') + parser.add_argument('--cpp-embeddings', '-ce', help='Path to llama.cpp embeddings "logits" binary file') + parser.add_argument('--causal', '-c', default=False, help='if the model is causal (default: false)', action='store_true') + parser.add_argument('--prompt', '-p', default='Hello world today', help='Test prompt') + parser.add_argument('--prompts-file', '-pf', help='Path to file containing prompts') + + args = parser.parse_args() + + if args.prompts_file: + prompt = read_prompt_from_file(args.prompts_file) + else: + prompt = args.prompt + + python_emb_path = Path(args.python_embeddings) + cpp_emb_path = Path(args.cpp_embeddings) + + # Extract base names (e.g., "pytorch-model-name-embeddings.bin" -> "pytorch-model-name") + python_model_name = python_emb_path.stem.replace("-embeddings", "") + cpp_model_name = cpp_emb_path.stem.replace("-embeddings", "") + + print("Semantic Similarity Test Between Python and llama.cpp Embedding Models") + print("=" * 70) + + # First verify tokens match before comparing embeddings + print("\nš Token Comparison Check") + print("=" * 70) + data_dir = python_emb_path.parent + if not compare_tokens(python_model_name, cpp_model_name, type_suffix="-embeddings", output_dir=str(data_dir)): + exit_with_warning("\nā Token mismatch detected", args.model_path) + print() + + # Single prompt detailed comparison + print(f"\nTesting with prompt: '{prompt}'") + + # Load the python model to get configuration information and also to load the tokenizer. + print("Loading model and tokenizer using AutoTokenizer:", args.model_path) + tokenizer = AutoTokenizer.from_pretrained(args.model_path) + config = AutoConfig.from_pretrained(args.model_path, trust_remote_code=True) + + if unreleased_model_name: + model_name_lower = unreleased_model_name.lower() + unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}" + if args.causal: + class_name = f"{unreleased_model_name}ForCausalLM" + else: + class_name = f"{unreleased_model_name}Model" + print(f"Model class: {class_name}") + print(f"Importing unreleased model module: {unreleased_module_path}") + + try: + model_class = getattr(importlib.import_module(unreleased_module_path), class_name) + model = model_class.from_pretrained(args.model_path) + except (ImportError, AttributeError) as e: + print(f"Failed to import or load model: {e}") + exit(1) + else: + if args.causal: + model = AutoModelForCausalLM.from_pretrained(args.model_path, trust_remote_code=True) + else: + model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True) + + encoded = tokenizer(prompt, return_tensors="pt") + tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0]) + n_tokens = len(tokens) + print(f"n_tokens: {n_tokens}"); + print(f"hidden_size: {model.config.hidden_size}") + + # Load binary embeddings from data directory. + llamacpp_embeddings = load_embeddings_from_file(args.cpp_embeddings, n_tokens, model.config.hidden_size) + python_embeddings = load_embeddings_from_file(args.python_embeddings, n_tokens, model.config.hidden_size) + + # Run comparison + results = test_single_prompt_similarity(python_embeddings, llamacpp_embeddings, tokens, prompt) + + # Summary + print(f"\n=== SUMMARY ===") + avg_cross_sim = np.mean(results['cross_model_similarities']) + print(f"Average cross-model similarity: {avg_cross_sim:.4f}") + print(f"Similarity matrix RMS difference: {results['rms_diff']:.4f}") + + # Quality assessment + if avg_cross_sim > 0.95: + print("ā
EXCELLENT: Models are highly similar") + elif avg_cross_sim > 0.90: + print("ā
VERY GOOD: Models are very similar") + elif avg_cross_sim > 0.80: + print("ā ļø GOOD: Models are reasonably similar") + elif avg_cross_sim > 0.70: + print("ā ļø FAIR: Models have some differences") + else: + exit_with_warning("ā POOR: Models are significantly different", args.model_path) + +if __name__ == "__main__": + main() |
