summaryrefslogtreecommitdiff
path: root/llama.cpp/examples/model-conversion/scripts/utils
diff options
context:
space:
mode:
Diffstat (limited to 'llama.cpp/examples/model-conversion/scripts/utils')
-rw-r--r--llama.cpp/examples/model-conversion/scripts/utils/__init__.py0
-rwxr-xr-xllama.cpp/examples/model-conversion/scripts/utils/check-nmse.py177
-rw-r--r--llama.cpp/examples/model-conversion/scripts/utils/common.py299
-rwxr-xr-xllama.cpp/examples/model-conversion/scripts/utils/compare_tokens.py76
-rw-r--r--llama.cpp/examples/model-conversion/scripts/utils/create-collection-add-model.sh8
-rwxr-xr-xllama.cpp/examples/model-conversion/scripts/utils/curl-embedding-server.sh6
-rwxr-xr-xllama.cpp/examples/model-conversion/scripts/utils/hf-add-model-to-collection.py80
-rwxr-xr-xllama.cpp/examples/model-conversion/scripts/utils/hf-create-collection.py106
-rwxr-xr-xllama.cpp/examples/model-conversion/scripts/utils/hf-create-model.py78
-rwxr-xr-xllama.cpp/examples/model-conversion/scripts/utils/hf-upload-gguf-model.py58
-rwxr-xr-xllama.cpp/examples/model-conversion/scripts/utils/inspect-converted-model.sh14
-rwxr-xr-xllama.cpp/examples/model-conversion/scripts/utils/inspect-org-model.py67
-rwxr-xr-xllama.cpp/examples/model-conversion/scripts/utils/perplexity-gen.sh40
-rwxr-xr-xllama.cpp/examples/model-conversion/scripts/utils/perplexity-run-simple.sh32
-rwxr-xr-xllama.cpp/examples/model-conversion/scripts/utils/perplexity-run.sh33
-rwxr-xr-xllama.cpp/examples/model-conversion/scripts/utils/quantize.sh53
-rwxr-xr-xllama.cpp/examples/model-conversion/scripts/utils/run-embedding-server.sh27
-rw-r--r--llama.cpp/examples/model-conversion/scripts/utils/semantic_check.py242
-rwxr-xr-xllama.cpp/examples/model-conversion/scripts/utils/tensor-info.py159
19 files changed, 1555 insertions, 0 deletions
diff --git a/llama.cpp/examples/model-conversion/scripts/utils/__init__.py b/llama.cpp/examples/model-conversion/scripts/utils/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/llama.cpp/examples/model-conversion/scripts/utils/__init__.py
diff --git a/llama.cpp/examples/model-conversion/scripts/utils/check-nmse.py b/llama.cpp/examples/model-conversion/scripts/utils/check-nmse.py
new file mode 100755
index 0000000..83f63f9
--- /dev/null
+++ b/llama.cpp/examples/model-conversion/scripts/utils/check-nmse.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python3
+
+import numpy as np
+import sys
+import os
+import argparse
+from pathlib import Path
+from common import get_model_name_from_env_path # type: ignore[import-not-found]
+
def calculate_nmse(reference, test):
    """Compute the normalized mean squared error between two arrays.

    NMSE = MSE(test, reference) / Var(reference). When the reference has
    zero variance, NMSE is defined as 0.0 for an exact match and
    float('inf') otherwise.

    Args:
        reference: ground-truth values (array-like).
        test: values to evaluate against the reference (array-like).

    Returns:
        Tuple (nmse, mse, ref_var).
    """
    mse = np.mean((test - reference) ** 2)
    ref_var = np.var(reference)
    if ref_var == 0:
        # Degenerate reference: fall back to the 0/inf convention.
        nmse = float('inf') if mse > 0 else 0.0
        # BUG FIX: this branch previously returned (mse, mse, ref_var),
        # discarding the inf/0 NMSE value computed just above.
        return nmse, mse, ref_var

    nmse = mse / ref_var

    return nmse, mse, ref_var
+
def load_logits(file_path):
    """Load logits from a .npy, .bin (raw float32) or text file.

    Text files may contain either one bare value per line or an
    "index: value" format; anything else is delegated to np.loadtxt.

    Args:
        file_path: str or pathlib.Path to the logits file.

    Returns:
        numpy array of values (float32 except for .npy's native dtype).

    Raises:
        FileNotFoundError: when the file does not exist.
    """
    # Accept plain strings as well as Path objects (previously a str
    # argument crashed on the .suffix attribute access below).
    file_path = Path(file_path)
    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    if file_path.suffix == '.npy':
        return np.load(file_path)
    elif file_path.suffix == '.bin':
        return np.fromfile(file_path, dtype=np.float32)
    else:
        # Try to load as text file
        try:
            # If it has index format "0: value", extract just values
            data = []
            with open(file_path, 'r') as f:
                for line in f:
                    if ':' in line:
                        # Format: "index: value"
                        value = float(line.split(':')[1].strip())
                    else:
                        # Just the value
                        value = float(line.strip())
                    data.append(value)
            return np.array(data, dtype=np.float32)
        except (ValueError, IndexError):
            # Not a simple one-value-per-line file; let numpy try.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
            return np.loadtxt(file_path, dtype=np.float32)
+
def interpret_nmse(nmse):
    """Map an NMSE value to a (description, emoji) quality rating."""
    if nmse == 0:
        return "Perfect match", "šŸŽ‰"
    # Ordered upper bounds; the first bound the value falls under wins.
    ratings = (
        (1e-6, "Essentially identical", "āœ…"),
        (1e-4, "Excellent match", "āœ…"),
        (1e-3, "Very good match", "šŸ‘"),
        (1e-2, "Good match", "šŸ‘"),
        (0.1, "Acceptable match", "āš ļø"),
        (1.0, "Poor match", "āŒ"),
    )
    for bound, label, emoji in ratings:
        if nmse < bound:
            return label, emoji
    return "Very poor match (worse than noise)", "āŒ"
+
def main():
    """Compare PyTorch reference logits against llama.cpp logits via NMSE.

    File locations are derived from the MODEL_PATH and CONVERTED_MODEL
    environment variables (see get_model_name_from_env_path); the logits
    themselves are expected as "pytorch-*.bin" / "llamacpp-*.bin" under
    the local "data" directory. Exits 0 when NMSE < 1e-2, 1 otherwise.
    """
    parser = argparse.ArgumentParser(description='Validate model logits')
    parser.add_argument('-m', '--model-path', required=True, help='Path to the model directory')
    # NOTE(review): args is never used below — the model names come from the
    # MODEL_PATH/CONVERTED_MODEL environment variables instead. Confirm
    # whether --model-path is still needed before removing it.
    args = parser.parse_args()

    model_name = get_model_name_from_env_path('MODEL_PATH')
    data_dir = Path("data")

    pytorch_file = data_dir / f"pytorch-{model_name}.bin"

    llamacpp_model_name = get_model_name_from_env_path('CONVERTED_MODEL')
    llamacpp_file = data_dir / f"llamacpp-{llamacpp_model_name}.bin"

    print(f"Model name: {model_name}")
    print(f"PyTorch logits file: {pytorch_file}")
    print(f"llama.cpp logits file: {llamacpp_file}")

    # PyTorch output is treated as ground truth; llama.cpp is under test.
    reference_file = pytorch_file
    test_file = llamacpp_file

    print("šŸ“Š NMSE Check for Model Comparison")
    print("=" * 50)
    print(f"Reference (ground truth): {reference_file}")
    print(f"Test (to evaluate): {test_file}")
    print()

    try:
        print("Loading reference logits...")
        reference = load_logits(reference_file)
        print(f"  Shape: {reference.shape}, Type: {reference.dtype}")

        print("Loading test logits...")
        test = load_logits(test_file)
        print(f"  Shape: {test.shape}, Type: {test.dtype}")

        # Check shapes match
        if reference.shape != test.shape:
            print(f"\nāŒ Error: Shape mismatch!")
            print(f"   Reference: {reference.shape}")
            print(f"   Test: {test.shape}")
            sys.exit(1)

        print(f"\nāœ… Shapes match: {reference.shape}")

        nmse, mse, ref_var = calculate_nmse(reference, test)

        # Additional metrics
        max_abs_error = np.max(np.abs(test - reference))
        mean_abs_error = np.mean(np.abs(test - reference))

        # Results
        print(f"\nšŸ“ˆ METRICS")
        print("=" * 30)
        print(f"MSE (Mean Squared Error): {mse:.6e}")
        print(f"Reference Variance: {ref_var:.6e}")
        print(f"NMSE: {nmse:.6e}")
        print(f"Max Absolute Error: {max_abs_error:.6f}")
        print(f"Mean Absolute Error: {mean_abs_error:.6f}")

        # NMSE in dB (common in signal processing)
        if nmse > 0:
            nmse_db = 10 * np.log10(nmse)
            print(f"NMSE (dB): {nmse_db:.2f} dB")

        # Interpretation
        interpretation, emoji = interpret_nmse(nmse)
        print(f"\nšŸŽÆ INTERPRETATION")
        print("=" * 30)
        print(f"{emoji} {interpretation}")

        # Detailed guidance
        print(f"\nšŸ“‹ GUIDANCE")
        print("=" * 30)
        if nmse < 1e-3:
            print("āœ… EXCELLENT: Your GGML conversion is working very well!")
            print("   The differences are negligible for practical use.")
        elif nmse < 1e-2:
            print("šŸ‘ GOOD: Your GGML conversion is working well.")
            print("   Small differences are likely due to precision/quantization.")
        elif nmse < 0.1:
            print("āš ļø  ACCEPTABLE: Conversion is working but with some differences.")
            print("   Check if you're using quantization (Q4, Q8, etc.)")
            print("   Test generation quality to see if it's acceptable.")
        else:
            print("āŒ PROBLEMATIC: Large differences detected.")
            print("   Check your conversion process for potential issues.")
            print("   Verify you're using the same model weights.")

        # NMSE benchmarks
        print(f"\nšŸ“š NMSE BENCHMARKS")
        print("=" * 30)
        print("< 1e-6:  Essentially identical")
        print("< 1e-4:  Excellent (typical for good conversions)")
        print("< 1e-3:  Very good")
        print("< 1e-2:  Good (acceptable for most use cases)")
        print("< 0.1:   Acceptable (may need verification)")
        print("> 1.0:   Poor (worse than random)")

        # Exit code based on NMSE: 1e-2 is the pass/fail threshold used by
        # the surrounding Makefile-driven verification flow.
        if nmse < 1e-2:
            print(f"\nāœ… RESULT: PASS (NMSE = {nmse:.2e})")
            sys.exit(0)
        else:
            print(f"\nāŒ RESULT: NEEDS REVIEW (NMSE = {nmse:.2e})")
            sys.exit(1)

    except Exception as e:
        print(f"āŒ Error: {e}")
        sys.exit(1)

if __name__ == "__main__":
    main()
diff --git a/llama.cpp/examples/model-conversion/scripts/utils/common.py b/llama.cpp/examples/model-conversion/scripts/utils/common.py
new file mode 100644
index 0000000..aa4bab2
--- /dev/null
+++ b/llama.cpp/examples/model-conversion/scripts/utils/common.py
@@ -0,0 +1,299 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import torch
+import transformers
+import json
+import textwrap
+import numpy as np
+from pathlib import Path
+
+
def get_model_name_from_env_path(env_path_name):
    """Derive a model name from a path stored in an environment variable.

    Reads the environment variable named ``env_path_name``, verifies the
    path it holds exists, and returns the final path component with any
    trailing ``.gguf`` extension stripped. Exits the process with status 1
    when the variable is unset or the path does not exist.
    """
    path_value = os.getenv(env_path_name)
    if not path_value:
        print(f"Error: {env_path_name} environment variable not set")
        sys.exit(1)

    if not os.path.exists(path_value):
        print(f"Error: Model file not found: {path_value}")
        sys.exit(1)

    base = os.path.basename(os.path.normpath(path_value))
    return base[:-5] if base.endswith(".gguf") else base
+
+
def summarize(tensor: torch.Tensor, name: str, max_seq: int = 3, max_vals: int = 3):
    """
    Print a tensor in llama.cpp debug style.

    Supports:
    - 2D tensors (seq, hidden)
    - 3D tensors (batch, seq, hidden)
    - 4D tensors (batch, seq, heads, dim_per_head) via flattening heads Ɨ dim_per_head

    Shows first and last max_vals of each vector per sequence position,
    plus the total element sum, so the output can be diffed directly
    against llama.cpp's ggml debug callback output.

    Args:
        tensor: tensor to dump (any device/dtype; copied to f32 on CPU).
        name: label printed in the header line.
        max_seq: number of sequence positions printed from each end.
        max_vals: number of values printed from each end of a vector.
    """
    # Detached f32 CPU copy: printing must not touch autograd state or the
    # original device.
    t = tensor.detach().to(torch.float32).cpu()

    # Determine dimensions
    if t.ndim == 3:
        _, s, _ = t.shape
    elif t.ndim == 2:
        # Treat a 2D (seq, hidden) tensor as batch size 1.
        _, s = 1, t.shape[0]
        t = t.unsqueeze(0)
    elif t.ndim == 4:
        _, s, _, _ = t.shape
    else:
        print(f"Skipping tensor due to unsupported dimensions: {t.ndim}")
        return

    ten_shape = t.shape

    print(f"ggml_debug: {name} = (f32) ... = {{{ten_shape}}}")
    print("  [")
    print("   [")

    # Determine indices for first and last sequences
    first_indices = list(range(min(s, max_seq)))
    last_indices = list(range(max(0, s - max_seq), s))

    # Check if there's an overlap between first and last indices or if we're at the edge case of s = 2 * max_seq
    has_overlap = bool(set(first_indices) & set(last_indices)) or (max_seq * 2 == s)

    # Combine indices
    if has_overlap:
        # If there's overlap, just use the combined unique indices
        indices = sorted(list(set(first_indices + last_indices)))
        separator_index = None
    else:
        # If no overlap, we'll add a separator between first and last sequences
        indices = first_indices + last_indices
        separator_index = len(first_indices)

    for i, si in enumerate(indices):
        # Add separator if needed
        if separator_index is not None and i == separator_index:
            print("    ...")

        # Extract appropriate slice
        vec = t[0, si]
        if vec.ndim == 2:  # 4D case: flatten heads Ɨ dim_per_head
            flat = vec.flatten().tolist()
        else:  # 2D or 3D case
            flat = vec.tolist()

        # First and last slices
        first = flat[:max_vals]
        last = flat[-max_vals:] if len(flat) >= max_vals else flat
        first_str = ", ".join(f"{v:12.4f}" for v in first)
        last_str = ", ".join(f"{v:12.4f}" for v in last)

        print(f"    [{first_str}, ..., {last_str}]")

    print("   ],")
    print("  ]")
    print(f"  sum = {t.sum().item():.6f}\n")
+
+
def debug_hook(name):
    """Build a forward hook that dumps a module's first tensor input/output.

    The returned callable matches the signature expected by
    ``torch.nn.Module.register_forward_hook`` and delegates all printing
    to ``summarize``, labelling values "<name>_in" and "<name>_out".
    """
    def _first_tensor(value):
        # Accept a bare tensor, or the first element of a tuple/list of tensors.
        if isinstance(value, torch.Tensor):
            return value
        if isinstance(value, (tuple, list)) and len(value) > 0 and isinstance(value[0], torch.Tensor):
            return value[0]
        return None

    def fn(_m, input, output):
        tensor_in = _first_tensor(input)
        if tensor_in is not None:
            summarize(tensor_in, name + "_in")
        tensor_out = _first_tensor(output)
        if tensor_out is not None:
            summarize(tensor_out, name + "_out")

    return fn
+
+
def setup_rope_debug(model_module_path: str, function_name: str = "apply_rotary_pos_emb"):
    """
    Apply monkey patch to dump RoPE activations for debugging.

    Args:
        model_module_path: Path to the model module (e.g., "transformers.models.apertus.modeling_apertus")
        function_name: Name of the RoPE function to patch (default: "apply_rotary_pos_emb")

    Example:
        from utils.common import setup_rope_debug
        setup_rope_debug("transformers.models.apertus.modeling_apertus")
    """
    import importlib

    # Import the module and get the original function
    module = importlib.import_module(model_module_path)
    orig_rope = getattr(module, function_name)

    # Set torch print options for better debugging (full tensors, fixed
    # precision, no scientific notation). Note: global side effect.
    torch.set_printoptions(threshold=float('inf'))
    torch.set_printoptions(precision=6, sci_mode=False)

    # Wrapper with the same signature HF transformers uses for
    # apply_rotary_pos_emb; logs inputs and outputs around the real call.
    def debug_rope(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
        # log inputs
        summarize(q, "RoPE.q_in")
        summarize(k, "RoPE.k_in")

        # call original
        q_out, k_out = orig_rope(q, k, cos, sin, position_ids, unsqueeze_dim)

        # log outputs
        summarize(q_out, "RoPE.q_out")
        summarize(k_out, "RoPE.k_out")

        return q_out, k_out

    # Patch it (module-level rebinding; affects all subsequent callers that
    # look the function up through the module).
    setattr(module, function_name, debug_rope)
    print(f"RoPE debug patching applied to {model_module_path}.{function_name}")
+
+
def save_output_data(data, tokens, prompt, model_name, type_suffix="", output_dir="data"):
    """
    Save output data (logits/embeddings), tokens, and prompt to files.

    Args:
        data: numpy array or torch.Tensor of floats (logits or embeddings)
        tokens: list, array or tensor of token IDs
        prompt: string containing the input prompt
        model_name: name of the model
        type_suffix: optional suffix like "-embeddings" (default: "")
        output_dir: directory to save files (default: "data")

    Creates the following files in output_dir:
        - pytorch-{model_name}{type_suffix}.bin
        - pytorch-{model_name}{type_suffix}.txt
        - pytorch-{model_name}{type_suffix}-prompt.txt
        - pytorch-{model_name}{type_suffix}-tokens.bin
    """
    out_dir = Path(output_dir)
    out_dir.mkdir(exist_ok=True)
    base_path = out_dir / f"pytorch-{model_name}{type_suffix}"

    def _to_flat_array(values):
        # Normalize tensors / sequences to a flat 1-D numpy array.
        arr = values.cpu().numpy() if isinstance(values, torch.Tensor) else np.asarray(values)
        return arr.flatten() if arr.ndim > 1 else arr

    # Logits/embeddings: raw float32 binary plus a human-readable listing.
    values = _to_flat_array(data)
    values.astype(np.float32).tofile(f"{base_path}.bin")
    print(f"Data saved to {base_path}.bin")

    with open(f"{base_path}.txt", "w") as f:
        f.writelines(f"{i}: {value:.6f}\n" for i, value in enumerate(values))
    print(f"Data saved to {base_path}.txt")

    # Token IDs as raw int32 binary.
    token_ids = _to_flat_array(tokens)
    token_ids.astype(np.int32).tofile(f"{base_path}-tokens.bin")
    print(f"Tokens saved to {base_path}-tokens.bin")

    # Prompt plus token summary for later inspection.
    with open(f"{base_path}-prompt.txt", "w") as f:
        f.write(f"prompt: {prompt}\n")
        f.write(f"n_tokens: {len(token_ids)}\n")
        f.write(f"token ids: {', '.join(str(int(tid)) for tid in token_ids)}\n")
    print(f"Prompt saved to {base_path}-prompt.txt")
+
+
def compare_tokens(original, converted, type_suffix="", output_dir="data"):
    """Compare saved token-ID files of two models.

    Reads "{name}{type_suffix}-tokens.bin" (raw int32) for both models from
    output_dir and reports whether the token sequences are identical.

    Returns:
        True when both files exist and their contents match exactly;
        False otherwise (details are printed).
    """
    data_dir = Path(output_dir)

    token_files = [
        data_dir / f"{original}{type_suffix}-tokens.bin",
        data_dir / f"{converted}{type_suffix}-tokens.bin",
    ]

    for path in token_files:
        if not path.exists():
            print(f"Error: Token file not found: {path}")
            return False

    tokens1, tokens2 = (np.fromfile(p, dtype=np.int32) for p in token_files)

    print("\nComparing tokens between:")
    print(f"  Original : {original} ({len(tokens1)} tokens)")
    print(f"  Converted: {converted} ({len(tokens2)} tokens)")

    if len(tokens1) != len(tokens2):
        print(f"\nāŒ Token count mismatch: {len(tokens1)} vs {len(tokens2)}")
        return False

    if np.array_equal(tokens1, tokens2):
        print(f"\nāœ… All {len(tokens1)} tokens match!")
        return True

    mismatches = np.where(tokens1 != tokens2)[0]
    print(f"\nāŒ Found {len(mismatches)} mismatched tokens:")

    shown = min(len(mismatches), 10)
    for idx in mismatches[:shown]:
        print(f"  Position {idx}: {tokens1[idx]} vs {tokens2[idx]}")

    if len(mismatches) > shown:
        print(f"  ... and {len(mismatches) - shown} more mismatches")

    return False
+
+
def show_version_warning(current_version, model_version):
    """Return True when the installed version is older than the model's.

    Falls back to a plain string inequality when the ``packaging`` package
    is unavailable or either version string is not PEP 440 compliant.
    Returns False when no model version is known.
    """
    if not model_version:
        return False

    try:
        from packaging.version import parse, InvalidVersion
    except ImportError:
        return current_version != model_version

    try:
        return parse(current_version) < parse(model_version)
    except InvalidVersion:
        return current_version != model_version
+
def get_model_transformers_version(model_path):
    """Read "transformers_version" from a model directory's config.json.

    Returns None when model_path is falsy, config.json is missing, the key
    is absent, or the file cannot be read/parsed (in which case a warning
    is printed to stderr).
    """
    if not model_path:
        return None

    config_file = Path(model_path) / "config.json"
    if not config_file.is_file():
        return None

    try:
        with open(config_file, "r", encoding="utf-8") as f:
            return json.load(f).get("transformers_version")
    except (IOError, json.JSONDecodeError) as e:
        print(f"Warning: Could not read or parse {config_file}: {e}", file=sys.stderr)
        return None
+
def exit_with_warning(message, model_path):
    """Print ``message``, optionally a transformers version-mismatch hint,
    then terminate the process with exit code 1.

    Args:
        message: text printed first (e.g. a verification-failure summary).
        model_path: model directory whose config.json records the
            transformers version the model was saved with; may be falsy.
    """
    print(message)

    # transformers is imported unconditionally at module level, so the
    # `is not None` check is defensive only.
    if model_path and transformers is not None:
        model_transformers_version = get_model_transformers_version(model_path)
        transformers_version = transformers.__version__
        if show_version_warning(transformers_version, model_transformers_version):
            warning_message = f"""
            =====================================================================
            Verification failure might be due to a transformers version mismatch:

            Current transformers version: {transformers_version}
            Model's required version    : {model_transformers_version}

            Consider installing the version specified by the model's config:
            pip install transformers=={model_transformers_version}
            =====================================================================
            """
            print(textwrap.dedent(warning_message))
    # Always exits with failure status, whether or not a hint was printed.
    sys.exit(1)
diff --git a/llama.cpp/examples/model-conversion/scripts/utils/compare_tokens.py b/llama.cpp/examples/model-conversion/scripts/utils/compare_tokens.py
new file mode 100755
index 0000000..a286cb5
--- /dev/null
+++ b/llama.cpp/examples/model-conversion/scripts/utils/compare_tokens.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+
+import argparse
+import sys
+from common import compare_tokens # type: ignore
+
+
def parse_arguments():
    """Build and evaluate the CLI for comparing two models' token files."""
    arg_parser = argparse.ArgumentParser(
        description='Compare tokens between two models',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s pytorch-gemma-3-270m-it llamacpp-gemma-3-270m-it-bf16
        """
    )
    arg_parser.add_argument('original', help='Original model name')
    arg_parser.add_argument('converted', help='Converted model name')
    arg_parser.add_argument(
        '-s', '--suffix',
        default='',
        help='Type suffix (e.g., "-embeddings")'
    )
    arg_parser.add_argument(
        '-d', '--data-dir',
        default='data',
        help='Directory containing token files (default: data)'
    )
    arg_parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help='Print prompts from both models'
    )
    return arg_parser.parse_args()
+
+
def main():
    """Entry point: compare two models' saved token files, exit 0 on match."""
    args = parse_arguments()

    if args.verbose:
        from pathlib import Path
        data_dir = Path(args.data_dir)

        # Show the prompts that produced each token file, when available.
        prompt1_file = data_dir / f"{args.original}{args.suffix}-prompt.txt"
        prompt2_file = data_dir / f"{args.converted}{args.suffix}-prompt.txt"

        if prompt1_file.exists():
            print(f"\nOriginal model prompt ({args.original}):")
            print(f"  {prompt1_file.read_text().strip()}")

        if prompt2_file.exists():
            print(f"\nConverted model prompt ({args.converted}):")
            print(f"  {prompt2_file.read_text().strip()}")

        print()

    result = compare_tokens(
        args.original,
        args.converted,
        type_suffix=args.suffix,
        output_dir=args.data_dir
    )

    # Enable the script to be used in shell scripts so that they can check
    # the exit code for success/failure.
    sys.exit(0 if result else 1)


if __name__ == "__main__":
    main()
diff --git a/llama.cpp/examples/model-conversion/scripts/utils/create-collection-add-model.sh b/llama.cpp/examples/model-conversion/scripts/utils/create-collection-add-model.sh
new file mode 100644
index 0000000..485001b
--- /dev/null
+++ b/llama.cpp/examples/model-conversion/scripts/utils/create-collection-add-model.sh
@@ -0,0 +1,8 @@
+
+#!/usr/bin/env bash
+
+COLLECTION_SLUG=$(python ./create_collection.py --return-slug)
+echo "Created collection: $COLLECTION_SLUG"
+
+# Use it in the next command
+python add_model_to_collection.py "$COLLECTION_SLUG" "username/my-model"
diff --git a/llama.cpp/examples/model-conversion/scripts/utils/curl-embedding-server.sh b/llama.cpp/examples/model-conversion/scripts/utils/curl-embedding-server.sh
new file mode 100755
index 0000000..7ed69e1
--- /dev/null
+++ b/llama.cpp/examples/model-conversion/scripts/utils/curl-embedding-server.sh
@@ -0,0 +1,6 @@
#!/usr/bin/env bash
# Send a test embedding request to a locally running llama-server
# (see run-embedding-server.sh) and print the JSON response.
curl --request POST \
    --url http://localhost:8080/embedding \
    --header "Content-Type: application/json" \
    --data '{"input": "Hello world today"}' \
    --silent
diff --git a/llama.cpp/examples/model-conversion/scripts/utils/hf-add-model-to-collection.py b/llama.cpp/examples/model-conversion/scripts/utils/hf-add-model-to-collection.py
new file mode 100755
index 0000000..7e38af3
--- /dev/null
+++ b/llama.cpp/examples/model-conversion/scripts/utils/hf-add-model-to-collection.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+
+from huggingface_hub import HfApi
+import argparse
+import sys
+
def add_model_to_collection(collection_slug, model_id, note=""):
    """
    Add a model to an existing collection

    Args:
        collection_slug: The slug of the collection (e.g., "username/collection-name-12345")
        model_id: The model repository ID (e.g., "username/model-name")
        note: Optional note about the model

    Returns:
        True if successful, False if failed
    """

    # Initialize API (authentication comes from the HF_TOKEN env variable)
    api = HfApi()

    try:
        user_info = api.whoami()
        print(f"āœ… Authenticated as: {user_info['name']}")

        # Verify the model exists before touching the collection.
        print(f"šŸ” Checking if model exists: {model_id}")
        try:
            # Raises when the model is missing or inaccessible; the return
            # value itself is not needed (previously bound to an unused local).
            api.model_info(model_id)
        except Exception as e:
            print(f"āŒ Model not found or not accessible: {model_id}")
            print(f"Error: {e}")
            return False

        print("šŸ“š Adding model to collection...")
        api.add_collection_item(
            collection_slug=collection_slug,
            item_id=model_id,
            item_type="model",
            note=note
        )

        print("āœ… Model added to collection successfully!")
        print(f"šŸ”— Collection URL: https://huggingface.co/collections/{collection_slug}")

        return True

    except Exception as e:
        print(f"āŒ Error adding model to collection: {e}")
        return False
+
def main():
    """Parse CLI arguments and add the given model to the given collection.

    Requires the HF_TOKEN environment variable to hold a Hugging Face API
    token (consumed by HfApi inside add_model_to_collection). Exits with
    status 1 on failure.
    """
    parser = argparse.ArgumentParser(description='Add model to a Huggingface Collection')
    parser.add_argument('--collection', '-c', help='The collection slug username/collection-hash', required=True)
    parser.add_argument('--model', '-m', help='The model to add to the Collection', required=True)
    parser.add_argument('--note', '-n', help='An optional note/description', required=False)
    args = parser.parse_args()

    # NOTE: the previous version also constructed an unused HfApi() here;
    # the API client is created where it is used, in add_model_to_collection.
    success = add_model_to_collection(
        collection_slug=args.collection,
        model_id=args.model,
        note=args.note
    )

    if success:
        print("\nšŸŽ‰ Model added successfully!")
    else:
        print("\nāŒ Failed to add model to collection")
        sys.exit(1)


if __name__ == "__main__":
    main()
diff --git a/llama.cpp/examples/model-conversion/scripts/utils/hf-create-collection.py b/llama.cpp/examples/model-conversion/scripts/utils/hf-create-collection.py
new file mode 100755
index 0000000..e0fa60a
--- /dev/null
+++ b/llama.cpp/examples/model-conversion/scripts/utils/hf-create-collection.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+
+from huggingface_hub import HfApi
+import argparse
+import os
+import sys
+
+
def create_collection(title, description, private=False, namespace=None, return_slug=False):
    """
    Create a new collection on Hugging Face

    Args:
        title: Collection title
        description: Collection description
        private: Whether the collection should be private (default: False)
        namespace: Optional namespace (defaults to your username)
        return_slug: Suppress all progress output so the caller can print
            only the slug (default: False)

    Returns:
        Collection object if successful, None if failed
    """

    # Check if HF_TOKEN is available
    token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
    if not token:
        print("āŒ No HF_TOKEN or HUGGINGFACE_HUB_TOKEN found in environment variables")
        print("Please set your Hugging Face token as an environment variable")
        return None

    # Initialize API (HfApi picks the token up from the environment)
    api = HfApi()

    try:
        # Test authentication first
        user_info = api.whoami()
        if not return_slug:
            print(f"āœ… Authenticated as: {user_info['name']}")

        # Create the collection
        if not return_slug:
            print(f"šŸ“š Creating collection: '{title}'...")
        collection = api.create_collection(
            title=title,
            description=description,
            private=private,
            namespace=namespace
        )

        if not return_slug:
            print(f"āœ… Collection created successfully!")
            print(f"šŸ“‹ Collection slug: {collection.slug}")
            print(f"šŸ”— Collection URL: https://huggingface.co/collections/{collection.slug}")

        return collection

    except Exception as e:
        print(f"āŒ Error creating collection: {e}")
        return None
+
def main():
    """Parse CLI arguments and create a Hugging Face collection.

    Requires the HF_TOKEN (or HUGGINGFACE_HUB_TOKEN) environment variable;
    with --return-slug only the collection slug is printed, which lets shell
    scripts capture it. Exits with status 1 on failure.
    """
    parser = argparse.ArgumentParser(description='Create a Huggingface Collection')
    parser.add_argument('--name', '-n', help='The name/title of the Collection', required=True)
    parser.add_argument('--description', '-d', help='The description for the Collection', required=True)
    parser.add_argument('--namespace', '-ns', help='The namespace to add the Collection to', required=True)
    parser.add_argument('--private', '-p', help='Create a private Collection', action='store_true')
    parser.add_argument('--return-slug', '-s', help='Only output the collection slug', action='store_true')

    args = parser.parse_args()

    # NOTE: the previous version also constructed an unused HfApi() here;
    # the API client is created where it is used, in create_collection.
    if not args.return_slug:
        print("šŸš€ Creating Hugging Face Collection")
        print(f"Title: {args.name}")
        print(f"Description: {args.description}")
        print(f"Namespace: {args.namespace}")
        print(f"Private: {args.private}")

    collection = create_collection(
        title=args.name,
        description=args.description,
        private=args.private,
        namespace=args.namespace,
        return_slug=args.return_slug
    )

    if collection:
        if args.return_slug:
            print(collection.slug)
        else:
            print("\nšŸŽ‰ Collection created successfully!")
            print(f"Use this slug to add models: {collection.slug}")
    else:
        print("\nāŒ Failed to create collection")
        sys.exit(1)


if __name__ == "__main__":
    main()
diff --git a/llama.cpp/examples/model-conversion/scripts/utils/hf-create-model.py b/llama.cpp/examples/model-conversion/scripts/utils/hf-create-model.py
new file mode 100755
index 0000000..ea99bd8
--- /dev/null
+++ b/llama.cpp/examples/model-conversion/scripts/utils/hf-create-model.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+
+from huggingface_hub import HfApi
+import argparse
+
+# This script requires that the environment variable HF_TOKEN is set with your
+# Hugging Face API token.
+api = HfApi()
+
def load_template_and_substitute(template_path, **kwargs):
    """Read a template file and fill its ``{placeholders}`` via str.format.

    Returns the rendered text, or None when the file is missing or a
    placeholder has no matching keyword argument (an error is printed in
    either case).
    """
    try:
        with open(template_path, 'r', encoding='utf-8') as f:
            raw_template = f.read()
    except FileNotFoundError:
        print(f"Template file '{template_path}' not found!")
        return None

    try:
        return raw_template.format(**kwargs)
    except KeyError as e:
        print(f"Missing template variable: {e}")
        return None
+
# ---- CLI: create a "<namespace>/<model>-GGUF" repo and upload a model card.
parser = argparse.ArgumentParser(description='Create a new Hugging Face model repository')
parser.add_argument('--model-name', '-m', help='Name for the model', required=True)
parser.add_argument('--namespace', '-ns', help='Namespace to add the model to', required=True)
parser.add_argument('--org-base-model', '-b', help='Original Base model name', default="")
parser.add_argument('--no-card', action='store_true', help='Skip creating model card')
parser.add_argument('--private', '-p', action='store_true', help='Create private model')
parser.add_argument('--embedding', '-e', action='store_true', help='Use embedding model card template')
parser.add_argument('--dry-run', '-d', action='store_true', help='Print repository info and template without creating repository')

args = parser.parse_args()

repo_id = f"{args.namespace}/{args.model_name}-GGUF"
print("Repository ID: ", repo_id)

repo_url = None
if not args.dry_run:
    # exist_ok=False: fail loudly rather than silently reusing an existing repo.
    repo_url = api.create_repo(
        repo_id=repo_id,
        repo_type="model",
        private=args.private,
        exist_ok=False
    )

if not args.no_card:
    # Embedding and causal models use different model-card templates; paths
    # are relative to the examples/model-conversion working directory.
    if args.embedding:
        template_path = "scripts/embedding/modelcard.template"
    else:
        template_path = "scripts/causal/modelcard.template"

    print("Template path: ", template_path)

    model_card_content = load_template_and_substitute(
        template_path,
        model_name=args.model_name,
        namespace=args.namespace,
        base_model=args.org_base_model,
    )

    if args.dry_run:
        print("\nTemplate Content:\n")
        print(model_card_content)
    else:
        if model_card_content:
            api.upload_file(
                path_or_fileobj=model_card_content.encode('utf-8'),
                path_in_repo="README.md",
                repo_id=repo_id
            )
            print("Model card created successfully.")
        else:
            print("Failed to create model card.")

if not args.dry_run and repo_url:
    print(f"Repository created: {repo_url}")


diff --git a/llama.cpp/examples/model-conversion/scripts/utils/hf-upload-gguf-model.py b/llama.cpp/examples/model-conversion/scripts/utils/hf-upload-gguf-model.py
new file mode 100755
index 0000000..15ccb11
--- /dev/null
+++ b/llama.cpp/examples/model-conversion/scripts/utils/hf-upload-gguf-model.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+
+from huggingface_hub import HfApi
+import argparse
+import os
+
def upload_gguf_file(local_file_path, repo_id, filename_in_repo=None):
    """
    Upload a GGUF file to a Hugging Face model repository

    Args:
        local_file_path: Path to your local GGUF file
        repo_id: Your repository ID (e.g., "username/model-name")
        filename_in_repo: Optional custom name for the file in the repo;
            defaults to the local file's basename when None or empty

    Returns:
        True on success, False when the file is missing or the upload fails
    """

    if not os.path.exists(local_file_path):
        print(f"āŒ File not found: {local_file_path}")
        return False

    # Treat both None and "" as "use the local basename". (This replaces two
    # consecutive, redundant if-statements that performed the same check.)
    if not filename_in_repo:
        filename_in_repo = os.path.basename(local_file_path)

    print(f"šŸ“¤ Uploading {local_file_path} to {repo_id}/{filename_in_repo}")

    api = HfApi()

    try:
        api.upload_file(
            path_or_fileobj=local_file_path,
            path_in_repo=filename_in_repo,
            repo_id=repo_id,
            repo_type="model",
            commit_message=f"Upload {filename_in_repo}"
        )

        print("āœ… Upload successful!")
        print(f"šŸ”— File available at: https://huggingface.co/{repo_id}/blob/main/{filename_in_repo}")
        return True

    except Exception as e:
        print(f"āŒ Upload failed: {e}")
        return False
+
# This script requires that the environment variable HF_TOKEN is set with your
# Hugging Face API token.
# NOTE(review): this module-level HfApi instance is unused — upload_gguf_file
# creates its own client. Also, the upload result is discarded, so the script
# always exits 0; confirm whether callers rely on the exit code.
api = HfApi()

parser = argparse.ArgumentParser(description='Upload a GGUF model to a Huggingface model repository')
parser.add_argument('--gguf-model-path', '-m', help='The GGUF model file to upload', required=True)
parser.add_argument('--repo-id', '-r', help='The repository to upload to', required=True)
parser.add_argument('--name', '-o', help='The name in the model repository', required=False)
args = parser.parse_args()

upload_gguf_file(args.gguf_model_path, args.repo_id, args.name)
diff --git a/llama.cpp/examples/model-conversion/scripts/utils/inspect-converted-model.sh b/llama.cpp/examples/model-conversion/scripts/utils/inspect-converted-model.sh
new file mode 100755
index 0000000..32d8482
--- /dev/null
+++ b/llama.cpp/examples/model-conversion/scripts/utils/inspect-converted-model.sh
@@ -0,0 +1,14 @@
#!/usr/bin/env bash

# Dump the contents of a converted GGUF model with gguf_dump.py.

# First try command line argument, then environment variable, then file
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"

# Final check if we have a model path
if [ -z "$CONVERTED_MODEL" ]; then
    echo "Error: Model path must be provided either as:" >&2
    echo "  1. Command line argument" >&2
    echo "  2. CONVERTED_MODEL environment variable" >&2
    exit 1
fi

# BUG FIX: quote the path so models in directories containing spaces (or
# glob characters) are passed as a single argument.
../../gguf-py/gguf/scripts/gguf_dump.py "$CONVERTED_MODEL"
diff --git a/llama.cpp/examples/model-conversion/scripts/utils/inspect-org-model.py b/llama.cpp/examples/model-conversion/scripts/utils/inspect-org-model.py
new file mode 100755
index 0000000..bc6f45a
--- /dev/null
+++ b/llama.cpp/examples/model-conversion/scripts/utils/inspect-org-model.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+
+import argparse
+import os
+import json
+from safetensors import safe_open
+from collections import defaultdict
+
parser = argparse.ArgumentParser(description='Process model with specified path')
parser.add_argument('--model-path', '-m', help='Path to the model')
args = parser.parse_args()

# The command line argument takes precedence over the MODEL_PATH environment
# variable (previously the environment variable silently overrode
# --model-path, unlike the sibling utility scripts such as tensor-info.py).
model_path = args.model_path or os.environ.get('MODEL_PATH')
if model_path is None:
    parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable")

# A sharded model ships an index file mapping tensor names to shard files;
# a single-file model just has model.safetensors.
index_path = os.path.join(model_path, "model.safetensors.index.json")
single_file_path = os.path.join(model_path, "model.safetensors")

if os.path.exists(index_path):
    # Multi-file model
    print("Multi-file model detected")

    with open(index_path, 'r') as f:
        index_data = json.load(f)

    # Get the weight map (tensor_name -> file_name)
    weight_map = index_data.get("weight_map", {})

    # Group tensors by file so each shard is opened only once
    file_tensors = defaultdict(list)
    for tensor_name, file_name in weight_map.items():
        file_tensors[file_name].append(tensor_name)

    print("Tensors in model:")

    # Process each shard file
    for file_name, tensor_names in file_tensors.items():
        file_path = os.path.join(model_path, file_name)
        print(f"\n--- From {file_name} ---")

        with safe_open(file_path, framework="pt") as f:
            for tensor_name in sorted(tensor_names):
                tensor = f.get_tensor(tensor_name)
                print(f"- {tensor_name} : shape = {tensor.shape}, dtype = {tensor.dtype}")

elif os.path.exists(single_file_path):
    # Single file model (original behavior)
    print("Single-file model detected")

    with safe_open(single_file_path, framework="pt") as f:
        keys = f.keys()
        print("Tensors in model:")
        for key in sorted(keys):
            tensor = f.get_tensor(key)
            print(f"- {key} : shape = {tensor.shape}, dtype = {tensor.dtype}")

else:
    print(f"Error: Neither 'model.safetensors.index.json' nor 'model.safetensors' found in {model_path}")
    print("Available files:")
    if os.path.exists(model_path):
        for item in sorted(os.listdir(model_path)):
            print(f"  {item}")
    else:
        print(f"  Directory {model_path} does not exist")
    exit(1)
diff --git a/llama.cpp/examples/model-conversion/scripts/utils/perplexity-gen.sh b/llama.cpp/examples/model-conversion/scripts/utils/perplexity-gen.sh
new file mode 100755
index 0000000..ef4b650
--- /dev/null
+++ b/llama.cpp/examples/model-conversion/scripts/utils/perplexity-gen.sh
@@ -0,0 +1,40 @@
#!/usr/bin/env bash

# Generate KL-divergence base logits for a converted model over wikitext-2.

set -e

CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
BUILD_DIR="${2:-"$BUILD_DIR"}"

# Final check if we have a model path
if [ -z "$CONVERTED_MODEL" ]; then
    echo "Error: Model path must be provided either as:" >&2
    echo "  1. Command line argument" >&2
    echo "  2. CONVERTED_MODEL environment variable" >&2
    exit 1
fi

# Download the wikitext-2 dataset on first use.
if [ ! -d "ppl/wikitext-2-raw" ]; then
    echo "ppl/wikitext-2-raw directory does not exist. Downloading..." >&2
    mkdir -p ppl
    pushd ppl
    ./../../../scripts/get-wikitext-2.sh
    popd
fi

mkdir -p ppl
# Quote expansions so model paths containing spaces survive word splitting
# (shellcheck SC2086).
OUTPUTFILE="ppl/$(basename "$CONVERTED_MODEL").kld"
echo "Model: $CONVERTED_MODEL"

if [ -z "$BUILD_DIR" ]; then
    BUILD_DIR="../../build"
fi

cmake --build "$BUILD_DIR" --target llama-perplexity -j8

"${BUILD_DIR}/bin/llama-perplexity" -m "$CONVERTED_MODEL" \
    -f ppl/wikitext-2-raw/wiki.test.raw \
    --kl-divergence-base "$OUTPUTFILE"

echo "Generated logits in $OUTPUTFILE"
diff --git a/llama.cpp/examples/model-conversion/scripts/utils/perplexity-run-simple.sh b/llama.cpp/examples/model-conversion/scripts/utils/perplexity-run-simple.sh
new file mode 100755
index 0000000..20ee965
--- /dev/null
+++ b/llama.cpp/examples/model-conversion/scripts/utils/perplexity-run-simple.sh
@@ -0,0 +1,32 @@
#!/usr/bin/env bash

# Run a plain perplexity evaluation of a quantized model over wikitext-2.

set -e

QUANTIZED_MODEL="${1:-"$QUANTIZED_MODEL"}"
BUILD_DIR="${2:-"$BUILD_DIR"}"

if [ -z "$QUANTIZED_MODEL" ]; then
    echo "Error: Model path must be provided either as:" >&2
    echo "  1. Command line argument" >&2
    echo "  2. QUANTIZED_MODEL environment variable" >&2
    exit 1
fi

# Download the wikitext-2 dataset on first use.
if [ ! -d "ppl/wikitext-2-raw" ]; then
    echo "ppl/wikitext-2-raw directory does not exist. Downloading..." >&2
    mkdir -p ppl
    pushd ppl
    ./../../../scripts/get-wikitext-2.sh
    popd
fi

if [ -z "$BUILD_DIR" ]; then
    BUILD_DIR="../../build"
fi

# Quote expansions so paths containing spaces survive word splitting
# (shellcheck SC2086).
cmake --build "$BUILD_DIR" --target llama-perplexity -j8

"${BUILD_DIR}/bin/llama-perplexity" -m "$QUANTIZED_MODEL" -f ppl/wikitext-2-raw/wiki.test.raw
diff --git a/llama.cpp/examples/model-conversion/scripts/utils/perplexity-run.sh b/llama.cpp/examples/model-conversion/scripts/utils/perplexity-run.sh
new file mode 100755
index 0000000..c11f32c
--- /dev/null
+++ b/llama.cpp/examples/model-conversion/scripts/utils/perplexity-run.sh
@@ -0,0 +1,33 @@
#!/usr/bin/env bash

# Compute KL-divergence of a quantized model against previously generated
# base logits (see perplexity-gen.sh).

set -e

QUANTIZED_MODEL="${1:-"$QUANTIZED_MODEL"}"
LOGITS_FILE="${2:-"$LOGITS_FILE"}"
BUILD_DIR="${3:-"$BUILD_DIR"}"

if [ -z "$QUANTIZED_MODEL" ]; then
    echo "Error: Model path must be provided either as:" >&2
    echo "  1. Command line argument" >&2
    echo "  2. QUANTIZED_MODEL environment variable" >&2
    exit 1
fi

# Reject both an unset and a missing logits file; the unquoted test in the
# original misbehaved when LOGITS_FILE was empty, and the error message was
# missing its closing quote.
if [ -z "$LOGITS_FILE" ] || [ ! -f "$LOGITS_FILE" ]; then
    echo "Error: logits file '${LOGITS_FILE}' was not found" >&2
    echo "Did you run the perplexity-gen.sh script?" >&2
    exit 1
fi

if [ -z "$BUILD_DIR" ]; then
    BUILD_DIR="../../build"
fi

echo "Model: $QUANTIZED_MODEL"
echo "Data file: $LOGITS_FILE"

# Quote expansions so paths containing spaces survive word splitting
# (shellcheck SC2086).
cmake --build "$BUILD_DIR" --target llama-perplexity -j8

"${BUILD_DIR}/bin/llama-perplexity" -m "$QUANTIZED_MODEL" \
    --kl-divergence-base "$LOGITS_FILE" \
    --kl-divergence
diff --git a/llama.cpp/examples/model-conversion/scripts/utils/quantize.sh b/llama.cpp/examples/model-conversion/scripts/utils/quantize.sh
new file mode 100755
index 0000000..4c21a13
--- /dev/null
+++ b/llama.cpp/examples/model-conversion/scripts/utils/quantize.sh
@@ -0,0 +1,53 @@
#!/usr/bin/env bash

# Quantize a converted GGUF model, optionally overriding the token-embedding
# and output-tensor types. The result is written next to the input with the
# quantization type appended to the filename.

set -e

CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
QUANTIZED_TYPE="${2:-"$QUANTIZED_TYPE"}"
TOKEN_EMBD_TYPE="${3:-"${TOKEN_EMBD_TYPE}"}"
OUTPUT_TYPE="${4:-"${OUTPUT_TYPE}"}"
BUILD_DIR="${5:-"$BUILD_DIR"}"
QUANTIZED_MODEL=$CONVERTED_MODEL

# Final check if we have a model path
if [ -z "$CONVERTED_MODEL" ]; then
    echo "Error: Model path must be provided either as:" >&2
    echo "  1. Command line argument" >&2
    echo "  2. CONVERTED_MODEL environment variable" >&2
    exit 1
fi

if [ -z "$QUANTIZED_TYPE" ]; then
    echo "Error: QUANTIZED_TYPE is required" >&2
    exit 1
fi

echo "$CONVERTED_MODEL"

# Process the quantized model filename
if [[ "$QUANTIZED_MODEL" == *.gguf ]]; then
    # Remove .gguf suffix, add quantized type, then add .gguf back
    BASE_NAME="${QUANTIZED_MODEL%.gguf}"
    QUANTIZED_MODEL="${BASE_NAME}-${QUANTIZED_TYPE}.gguf"
else
    echo "Error: QUANTIZED_MODEL must end with .gguf extension" >&2
    exit 1
fi

if [ -z "$BUILD_DIR" ]; then
    BUILD_DIR="../../build"
fi

# Quote expansions so paths containing spaces survive word splitting
# (shellcheck SC2086).
cmake --build "$BUILD_DIR" --target llama-quantize -j8

echo "$TOKEN_EMBD_TYPE"
echo "$OUTPUT_TYPE"

CMD_ARGS=("${BUILD_DIR}/bin/llama-quantize")
[[ -n "$TOKEN_EMBD_TYPE" ]] && CMD_ARGS+=("--token-embedding-type" "$TOKEN_EMBD_TYPE")
[[ -n "$OUTPUT_TYPE" ]] && CMD_ARGS+=("--output-tensor-type" "$OUTPUT_TYPE")
CMD_ARGS+=("$CONVERTED_MODEL" "$QUANTIZED_MODEL" "$QUANTIZED_TYPE")

"${CMD_ARGS[@]}"

echo "Quantized model saved to: $QUANTIZED_MODEL"
diff --git a/llama.cpp/examples/model-conversion/scripts/utils/run-embedding-server.sh b/llama.cpp/examples/model-conversion/scripts/utils/run-embedding-server.sh
new file mode 100755
index 0000000..9f5fc2c
--- /dev/null
+++ b/llama.cpp/examples/model-conversion/scripts/utils/run-embedding-server.sh
@@ -0,0 +1,27 @@
#!/usr/bin/env bash

# Build and start llama-server in embedding mode for a converted model.

set -e

# First try command line argument, then environment variable, then file
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
BUILD_DIR="${2:-"$BUILD_DIR"}"

# Final check if we have a model path
if [ -z "$CONVERTED_MODEL" ]; then
    echo "Error: Model path must be provided either as:" >&2
    echo "  1. Command line argument" >&2
    echo "  2. CONVERTED_MODEL environment variable" >&2
    exit 1
fi

if [ -z "$BUILD_DIR" ]; then
    BUILD_DIR="../../build"
fi

echo "$CONVERTED_MODEL"

# Quote expansions so paths containing spaces survive word splitting
# (shellcheck SC2086).
cmake --build "$BUILD_DIR" --target llama-server

"${BUILD_DIR}/bin/llama-server" -m "$CONVERTED_MODEL" \
    --embedding \
    --pooling none
diff --git a/llama.cpp/examples/model-conversion/scripts/utils/semantic_check.py b/llama.cpp/examples/model-conversion/scripts/utils/semantic_check.py
new file mode 100644
index 0000000..73e20ea
--- /dev/null
+++ b/llama.cpp/examples/model-conversion/scripts/utils/semantic_check.py
@@ -0,0 +1,242 @@
+#!/usr/bin/env python3
+
+import numpy as np
+import argparse
+import os
+import importlib
+from pathlib import Path
+
+from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel
+from common import compare_tokens, exit_with_warning # type: ignore[import-not-found]
+
+unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
+
def cosine_similarity(a, b=None):
    """Row-wise cosine similarity between two 2-D arrays.

    1-D inputs are promoted to a single row; when ``b`` is omitted the
    similarity of ``a`` with itself is computed.  Returns the matrix of
    similarities between every row of ``a`` and every row of ``b``.
    """
    mat_a = np.atleast_2d(np.asarray(a))
    mat_b = mat_a if b is None else np.atleast_2d(np.asarray(b))

    norms_a = np.linalg.norm(mat_a, axis=1, keepdims=True)
    norms_b = np.linalg.norm(mat_b, axis=1, keepdims=True)

    # Guard all-zero rows with a tiny epsilon instead of dividing by zero.
    norms_a = np.where(norms_a == 0, 1e-8, norms_a)
    norms_b = np.where(norms_b == 0, 1e-8, norms_b)

    return np.dot(mat_a / norms_a, (mat_b / norms_b).T)
+
def load_embeddings_from_file(filename, n_tokens, n_embd):
    """Read raw float32 embeddings from ``filename``.

    Returns shape ``(1, n_embd)`` when the file holds exactly one pooled
    embedding vector, otherwise ``(n_tokens, n_embd)`` per-token embeddings.
    """
    flat = np.fromfile(filename, dtype=np.float32)
    # Exactly n_embd values means a single pooled embedding.
    rows = 1 if flat.size == n_embd else n_tokens
    return flat.reshape(rows, n_embd)
+
+def test_single_prompt_similarity(python_emb, cpp_emb, tokens, prompt):
+ np.set_printoptions(suppress=True, precision=6)
+ print("pytorch embeddings:");
+ print(python_emb)
+ print("llama.cpp embeddings:");
+ print(cpp_emb)
+ print(f"\n=== Prompt: '{prompt}' ===")
+ print(f"Tokens: {tokens}")
+ print(f"Embeddings shape: Python {python_emb.shape}, llama.cpp {cpp_emb.shape}")
+
+ n_tokens = len(tokens)
+ is_pooled = python_emb.shape[0] == 1
+
+ if is_pooled:
+ print(f"\n[Pooled Embeddings Mode - comparing single sentence embeddings]")
+
+ # 1. Direct embedding comparison for pooled embeddings
+ print(f"\n1. Raw Embedding Magnitude Comparison:")
+ py_mag = np.linalg.norm(python_emb[0])
+ cpp_mag = np.linalg.norm(cpp_emb[0])
+ ratio = py_mag / cpp_mag if cpp_mag > 0 else float('inf')
+ print(f" Pooled embedding: Python={py_mag:.3f}, llama.cpp={cpp_mag:.3f}, ratio={ratio:.3f}")
+
+ # 2. Cross-model similarity for pooled embeddings
+ print(f"\n2. Cross-Model Pooled Embedding Similarity:")
+ sim = cosine_similarity([python_emb[0]], [cpp_emb[0]])[0][0]
+ print(f" Cosine similarity: {sim:.6f}")
+
+ return {
+ 'cross_model_similarities': [sim],
+ 'similarity_matrix_diff': np.array([[0.0]]),
+ 'max_diff': 0.0,
+ 'mean_diff': 0.0,
+ 'rms_diff': 0.0
+ }
+ else:
+ # Original per-token comparison logic
+ # 1. Direct embedding comparison
+ print(f"\n1. Raw Embedding Magnitude Comparison:")
+ # Check if the distance of each token embedding from the origin and compare
+ # if the vectors are on the same "sphere". This does not tell us about
+ # direction (meaning of the token embedding), just magnitude.
+ for i in range(n_tokens):
+ py_mag = np.linalg.norm(python_emb[i]) # calculate standard euclidean norm for Python embeddings
+ cpp_mag = np.linalg.norm(cpp_emb[i]) # calculate standard euclidean norm for llama.cpp embeddings
+ ratio = py_mag / cpp_mag if cpp_mag > 0 else float('inf')
+ print(f" Token {i} ({tokens[i]}): Python={py_mag:.3f}, llama.cpp={cpp_mag:.3f}, ratio={ratio:.3f}")
+
+ # 2. Cosine similarity between tokens within each model
+ # Here we check the direction of token embeddings to see if the have the
+ # same meaning (similarity). This is done by calculating cosine similarity
+ # of a pair of token embeddings within each model.
+ print(f"\n2. Within-Model Token Similarities:")
+ print(" Python model:")
+ for i in range(n_tokens):
+ for j in range(i+1, n_tokens):
+ sim = cosine_similarity([python_emb[i]], [python_emb[j]])[0][0]
+ print(f" {tokens[i]} ↔ {tokens[j]}: {sim:.4f}")
+
+ print(" llama.cpp model:")
+ for i in range(n_tokens):
+ for j in range(i+1, n_tokens):
+ sim = cosine_similarity([cpp_emb[i]], [cpp_emb[j]])[0][0]
+ print(f" {tokens[i]} ↔ {tokens[j]}: {sim:.4f}")
+
+ # 3. Cross-model similarity (same token position)
+ print(f"\n3. Cross-Model Same-Token Similarities:")
+ for i in range(n_tokens):
+ sim = cosine_similarity([python_emb[i]], [cpp_emb[i]])[0][0]
+ print(f" Token {i} ({tokens[i]}): {sim:.4f}")
+
+ # 4. Similarity matrix comparison
+ print(f"\n4. Similarity Matrix Differences:")
+ py_sim_matrix = cosine_similarity(python_emb)
+ cpp_sim_matrix = cosine_similarity(cpp_emb)
+ diff_matrix = np.abs(py_sim_matrix - cpp_sim_matrix)
+
+ print(f" Max difference: {np.max(diff_matrix):.4f}")
+ print(f" Mean difference: {np.mean(diff_matrix):.4f}")
+ print(f" RMS difference: {np.sqrt(np.mean(diff_matrix**2)):.4f}")
+
+ return {
+ 'cross_model_similarities': [cosine_similarity([python_emb[i]], [cpp_emb[i]])[0][0] for i in range(n_tokens)],
+ 'similarity_matrix_diff': diff_matrix,
+ 'max_diff': np.max(diff_matrix),
+ 'mean_diff': np.mean(diff_matrix),
+ 'rms_diff': np.sqrt(np.mean(diff_matrix**2))
+ }
+
def read_prompt_from_file(file_path):
    """Return the whitespace-stripped contents of ``file_path``.

    Exits the process with status 1 when the file is missing or unreadable.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as prompt_file:
            contents = prompt_file.read()
    except FileNotFoundError:
        print(f"Error: Prompts file '{file_path}' not found")
        exit(1)
    except Exception as e:
        print(f"Error reading prompts file: {e}")
        exit(1)
    return contents.strip()
+
def main():
    """Compare embeddings produced by the original Python model and llama.cpp.

    Loads the tokenizer/model to recover token strings and the hidden size,
    reads both embedding binaries, prints a detailed similarity report, and
    finishes with a heuristic quality verdict.
    """
    parser = argparse.ArgumentParser(description='Test semantic similarity between Python and llama.cpp embeddings')
    parser.add_argument('--model-path', '-m', required=True, help='Path to the original Python model')
    # Both embedding files are read unconditionally below, so they must be
    # required; previously omitting one crashed with a TypeError in Path().
    parser.add_argument('--python-embeddings', '-pe', required=True, help='Path to pytorch embeddings "logits" binary file')
    parser.add_argument('--cpp-embeddings', '-ce', required=True, help='Path to llama.cpp embeddings "logits" binary file')
    parser.add_argument('--causal', '-c', default=False, help='if the model is causal (default: false)', action='store_true')
    parser.add_argument('--prompt', '-p', default='Hello world today', help='Test prompt')
    parser.add_argument('--prompts-file', '-pf', help='Path to file containing prompts')

    args = parser.parse_args()

    if args.prompts_file:
        prompt = read_prompt_from_file(args.prompts_file)
    else:
        prompt = args.prompt

    python_emb_path = Path(args.python_embeddings)
    cpp_emb_path = Path(args.cpp_embeddings)

    # Extract base names (e.g., "pytorch-model-name-embeddings.bin" -> "pytorch-model-name")
    python_model_name = python_emb_path.stem.replace("-embeddings", "")
    cpp_model_name = cpp_emb_path.stem.replace("-embeddings", "")

    print("Semantic Similarity Test Between Python and llama.cpp Embedding Models")
    print("=" * 70)

    # First verify tokens match before comparing embeddings
    print("\nšŸ” Token Comparison Check")
    print("=" * 70)
    data_dir = python_emb_path.parent
    if not compare_tokens(python_model_name, cpp_model_name, type_suffix="-embeddings", output_dir=str(data_dir)):
        exit_with_warning("\nāŒ Token mismatch detected", args.model_path)
    print()

    # Single prompt detailed comparison
    print(f"\nTesting with prompt: '{prompt}'")

    # Load the python model to get configuration information and also to load the tokenizer.
    print("Loading model and tokenizer using AutoTokenizer:", args.model_path)
    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
    # NOTE(review): config is not referenced below; loading with
    # trust_remote_code may register custom model code as a side effect, so
    # it is kept — confirm before removing.
    config = AutoConfig.from_pretrained(args.model_path, trust_remote_code=True)

    if unreleased_model_name:
        # Unreleased models must be imported manually from their modular path.
        model_name_lower = unreleased_model_name.lower()
        unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
        if args.causal:
            class_name = f"{unreleased_model_name}ForCausalLM"
        else:
            class_name = f"{unreleased_model_name}Model"
        print(f"Model class: {class_name}")
        print(f"Importing unreleased model module: {unreleased_module_path}")

        try:
            model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
            model = model_class.from_pretrained(args.model_path)
        except (ImportError, AttributeError) as e:
            print(f"Failed to import or load model: {e}")
            exit(1)
    else:
        if args.causal:
            model = AutoModelForCausalLM.from_pretrained(args.model_path, trust_remote_code=True)
        else:
            model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True)

    encoded = tokenizer(prompt, return_tensors="pt")
    tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
    n_tokens = len(tokens)
    print(f"n_tokens: {n_tokens}")
    print(f"hidden_size: {model.config.hidden_size}")

    # Load binary embeddings from data directory.
    llamacpp_embeddings = load_embeddings_from_file(args.cpp_embeddings, n_tokens, model.config.hidden_size)
    python_embeddings = load_embeddings_from_file(args.python_embeddings, n_tokens, model.config.hidden_size)

    # Run comparison
    results = test_single_prompt_similarity(python_embeddings, llamacpp_embeddings, tokens, prompt)

    # Summary
    print("\n=== SUMMARY ===")
    avg_cross_sim = np.mean(results['cross_model_similarities'])
    print(f"Average cross-model similarity: {avg_cross_sim:.4f}")
    print(f"Similarity matrix RMS difference: {results['rms_diff']:.4f}")

    # Quality assessment (heuristic thresholds)
    if avg_cross_sim > 0.95:
        print("āœ… EXCELLENT: Models are highly similar")
    elif avg_cross_sim > 0.90:
        print("āœ… VERY GOOD: Models are very similar")
    elif avg_cross_sim > 0.80:
        print("āš ļø  GOOD: Models are reasonably similar")
    elif avg_cross_sim > 0.70:
        print("āš ļø  FAIR: Models have some differences")
    else:
        exit_with_warning("āŒ POOR: Models are significantly different", args.model_path)

if __name__ == "__main__":
    main()
diff --git a/llama.cpp/examples/model-conversion/scripts/utils/tensor-info.py b/llama.cpp/examples/model-conversion/scripts/utils/tensor-info.py
new file mode 100755
index 0000000..12a3430
--- /dev/null
+++ b/llama.cpp/examples/model-conversion/scripts/utils/tensor-info.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python3
+
+import argparse
+import json
+import os
+import re
+import sys
+from pathlib import Path
+from typing import Optional
+from safetensors import safe_open
+
+
MODEL_SAFETENSORS_FILE = "model.safetensors"
MODEL_SAFETENSORS_INDEX = "model.safetensors.index.json"


def get_weight_map(model_path: Path) -> Optional[dict[str, str]]:
    """Return the tensor-name -> shard-file map from the model's index file.

    Returns ``None`` when no index file exists (single-file model).
    """
    index_file = model_path / MODEL_SAFETENSORS_INDEX
    if not index_file.exists():
        return None
    with index_file.open('r') as fh:
        return json.load(fh).get("weight_map", {})
+
+
def get_all_tensor_names(model_path: Path) -> list[str]:
    """Return every tensor name in the model.

    Uses the index file when present; otherwise opens the single
    safetensors file.  Exits with status 1 when neither exists or the
    single file cannot be read.
    """
    weight_map = get_weight_map(model_path)
    if weight_map is not None:
        return list(weight_map)

    single_file = model_path / MODEL_SAFETENSORS_FILE
    if not single_file.exists():
        print(f"Error: No safetensors files found in {model_path}")
        sys.exit(1)

    try:
        with safe_open(single_file, framework="pt", device="cpu") as handle:
            return list(handle.keys())
    except Exception as e:
        print(f"Error reading {single_file}: {e}")
        sys.exit(1)
+
+
def find_tensor_file(model_path: Path, tensor_name: str) -> Optional[str]:
    """Return the shard filename containing ``tensor_name``.

    Falls back to the single safetensors file when there is no index;
    returns ``None`` when the tensor (or any model file) cannot be located.
    """
    weight_map = get_weight_map(model_path)
    if weight_map is not None:
        return weight_map.get(tensor_name)

    single_file = model_path / MODEL_SAFETENSORS_FILE
    return single_file.name if single_file.exists() else None
+
+
def normalize_tensor_name(tensor_name: str) -> str:
    """Collapse numeric layer indices to '#' (e.g. 'h.0.attn' -> 'h.#.attn')."""
    collapsed = re.sub(r'\.\d+\.', '.#.', tensor_name)
    return re.sub(r'\.\d+$', '.#', collapsed)
+
+
def list_all_tensors(model_path: Path, unique: bool = False):
    """Print tensor names in sorted order.

    With ``unique`` set, layer indices are collapsed to '#' and each
    resulting pattern is printed only once (first occurrence wins).
    """
    names = sorted(get_all_tensor_names(model_path))

    if not unique:
        for name in names:
            print(name)
        return

    emitted = set()
    for name in names:
        pattern = normalize_tensor_name(name)
        if pattern in emitted:
            continue
        emitted.add(pattern)
        print(pattern)
+
+
def print_tensor_info(model_path: Path, tensor_name: str):
    """Print the shard file and shape of ``tensor_name``.

    Exits with status 1 when the tensor or its file cannot be found or read.
    """
    tensor_file = find_tensor_file(model_path, tensor_name)
    if tensor_file is None:
        print(f"Error: Could not find tensor '{tensor_name}' in model index")
        print(f"Model path: {model_path}")
        sys.exit(1)

    file_path = model_path / tensor_file
    try:
        with safe_open(file_path, framework="pt", device="cpu") as handle:
            if tensor_name not in handle.keys():
                print(f"Error: Tensor '{tensor_name}' not found in {tensor_file}")
                sys.exit(1)
            # get_slice exposes the shape without loading the tensor data.
            shape = handle.get_slice(tensor_name).get_shape()
        print(f"Tensor: {tensor_name}")
        print(f"File: {tensor_file}")
        print(f"Shape: {shape}")
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
        sys.exit(1)
    except Exception as e:
        print(f"An error occurred: {e}")
        sys.exit(1)
+
+
def main():
    """CLI entry point: inspect one tensor or list tensor name patterns."""
    parser = argparse.ArgumentParser(
        description="Print tensor information from a safetensors model"
    )
    parser.add_argument(
        "tensor_name",
        nargs="?",  # optional, e.g. when --list is used
        help="Name of the tensor to inspect"
    )
    parser.add_argument(
        "-m", "--model-path",
        type=Path,
        help="Path to the model directory (default: MODEL_PATH environment variable)"
    )
    parser.add_argument(
        "-l", "--list",
        action="store_true",
        help="List unique tensor patterns in the model (layer numbers replaced with #)"
    )
    args = parser.parse_args()

    # Resolve the model directory: flag first, then the environment.
    model_path = args.model_path
    if model_path is None:
        env_value = os.environ.get("MODEL_PATH")
        if env_value is None:
            print("Error: --model-path not provided and MODEL_PATH environment variable not set")
            sys.exit(1)
        model_path = Path(env_value)

    if not model_path.exists():
        print(f"Error: Model path does not exist: {model_path}")
        sys.exit(1)
    if not model_path.is_dir():
        print(f"Error: Model path is not a directory: {model_path}")
        sys.exit(1)

    if args.list:
        list_all_tensors(model_path, unique=True)
        return

    if args.tensor_name is None:
        print("Error: tensor_name is required when not using --list")
        sys.exit(1)
    print_tensor_info(model_path, args.tensor_name)


if __name__ == "__main__":
    main()