Diffstat (limited to 'llama.cpp/examples/model-conversion/scripts')
32 files changed, 2561 insertions, 0 deletions
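The scripts added here form a convert → run-original → run-converted → compare pipeline for causal models, with analogous scripts for embedding models. A minimal sketch of how they appear to fit together, assuming they are invoked from the examples/model-conversion directory as their internal relative paths suggest (the model paths and names below are placeholders, not part of this change):

```sh
# Illustrative workflow; MODEL_PATH and CONVERTED_MODEL are the variables the scripts read.
export MODEL_PATH=/path/to/hf-model               # original Hugging Face checkpoint
./scripts/causal/convert-model.sh                 # writes ${OUTPUT_DIR}/${MODEL_NAME}.gguf
export CONVERTED_MODEL=../../models/hf-model.gguf # path printed by convert-model.sh

python ./scripts/causal/run-org-model.py -m "$MODEL_PATH"   # dumps PyTorch logits to data/
./scripts/causal/run-converted-model.sh "$CONVERTED_MODEL"  # dumps llama.cpp logits via llama-debug

python ./scripts/causal/compare-logits.py                   # lightweight shape / top-10 sanity check
python ./scripts/utils/check-nmse.py -m "$MODEL_PATH"       # NMSE comparison against the PyTorch logits
```

The embedding/ scripts follow the same pattern, driven by EMBEDDING_MODEL_PATH and CONVERTED_EMBEDDING_MODEL instead.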
diff --git a/llama.cpp/examples/model-conversion/scripts/causal/compare-embeddings-logits.sh b/llama.cpp/examples/model-conversion/scripts/causal/compare-embeddings-logits.sh new file mode 100755 index 0000000..2ae4dc7 --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/causal/compare-embeddings-logits.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash + +set -e + +MODEL_PATH="${1:-"$MODEL_PATH"}" +MODEL_NAME="${2:-$(basename "$MODEL_PATH")}" + +CONVERTED_MODEL_PATH="${1:-"$CONVERTED_MODEL"}" +CONVERTED_MODEL_NAME="${2:-$(basename "$CONVERTED_MODEL_PATH" ".gguf")}" + +if [ -t 0 ]; then + CPP_EMBEDDINGS="data/llamacpp-${CONVERTED_MODEL_NAME}-embeddings.bin" +else + # Process piped JSON data and convert to binary (matching logits.cpp format) + TEMP_FILE=$(mktemp /tmp/tmp.XXXXXX.binn) + python3 -c " +import json +import sys +import struct + +data = json.load(sys.stdin) + +# Flatten all embeddings completely +flattened = [] +for item in data: + embedding = item['embedding'] + for token_embedding in embedding: + flattened.extend(token_embedding) + +print(f'Total embedding values: {len(flattened)}', file=sys.stderr) + +# Write as binary floats - matches logitc.cpp fwrite format +with open('$TEMP_FILE', 'wb') as f: + for value in flattened: + f.write(struct.pack('f', value)) +" + CPP_EMBEDDINGS="$TEMP_FILE" + trap "rm -f $TEMP_FILE" EXIT +fi + +python scripts/utils/semantic_check.py --model-path $MODEL_PATH \ + --python-embeddings data/pytorch-${MODEL_NAME}-embeddings.bin \ + --cpp-embeddings $CPP_EMBEDDINGS \ + --prompt "Hello world today" \ + --causal + diff --git a/llama.cpp/examples/model-conversion/scripts/causal/compare-logits.py b/llama.cpp/examples/model-conversion/scripts/causal/compare-logits.py new file mode 100755 index 0000000..83bd14c --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/causal/compare-logits.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 + +import sys +import numpy as np +from pathlib import Path +import os + +# Add utils directory to path for direct script execution +sys.path.insert(0, str(Path(__file__).parent.parent / "utils")) +from common import get_model_name_from_env_path, compare_tokens, exit_with_warning # type: ignore[import-not-found] + +def quick_logits_check(pytorch_file, llamacpp_file): + """Lightweight sanity check before NMSE""" + + try: + pytorch_logits = np.fromfile(pytorch_file, dtype=np.float32) + llamacpp_logits = np.fromfile(llamacpp_file, dtype=np.float32) + except Exception as e: + print(f"ā NOK: Failed to load files - {e}") + return False + + # Check shapes match + if pytorch_logits.shape != llamacpp_logits.shape: + print(f"ā NOK: Shape mismatch - PyTorch: {pytorch_logits.shape}, llama.cpp: {llamacpp_logits.shape}") + return False + + # Calculate key metrics + diff = pytorch_logits - llamacpp_logits + abs_diff = np.abs(diff) + max_diff = np.max(abs_diff) + + # Get top 10 predictions from both models + pytorch_top10 = np.argsort(pytorch_logits)[-10:][::-1] + llamacpp_top10 = np.argsort(llamacpp_logits)[-10:][::-1] + print(f"Top 10 PyTorch logits: {pytorch_logits[pytorch_top10]}") + print(f"Top 10 llama.cpp logits: {llamacpp_logits[llamacpp_top10]}") + print(f"Max absolute difference: {max_diff:.4f}") + + return True + +def main(): + model_path = os.environ.get('MODEL_PATH') + model_name = get_model_name_from_env_path('MODEL_PATH') + data_dir = Path("data") + pytorch_file = data_dir / f"pytorch-{model_name}.bin" + + llamacpp_model_name = get_model_name_from_env_path('CONVERTED_MODEL') + print(f"Using converted model: 
{llamacpp_model_name}") + llamacpp_file = data_dir / f"llamacpp-{llamacpp_model_name}.bin" + + if not pytorch_file.exists(): + print(f"Error: PyTorch logits file not found: {pytorch_file}") + print("Please run scripts/run-org-model.sh first to generate this file.") + sys.exit(1) + + if not llamacpp_file.exists(): + print(f"Error: llama.cpp logits file not found: {llamacpp_file}") + print("Please run scripts/run-converted-model.sh first to generate this file.") + sys.exit(1) + + print("Checked all required files were found. Proceeding...\n") + + # Verify tokens as they are a prerequisite for logits comparison. + print("š Token Comparison Check") + print("=" * 40) + if not compare_tokens(f"pytorch-{model_name}", f"llamacpp-{llamacpp_model_name}"): + exit_with_warning("\nā Token mismatch detected", model_path) + print() + + print("š GGML Model Validation for model ", model_name) + print("=" * 40) + print(f"PyTorch logits : {pytorch_file}") + print(f"llama.cpp logits: {llamacpp_file}") + print() + + success = quick_logits_check(pytorch_file, llamacpp_file) + + # Exit with appropriate code + if success: + print("ā
OK: Lightweight model check successful!") + print(" Ok to proceed with NMSE check...") + sys.exit(0) + else: + exit_with_warning(f"ā NOK: Top 10 predictions don't match - generation will differ", model_path) + +if __name__ == "__main__": + main() diff --git a/llama.cpp/examples/model-conversion/scripts/causal/convert-model.sh b/llama.cpp/examples/model-conversion/scripts/causal/convert-model.sh new file mode 100755 index 0000000..a5865f6 --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/causal/convert-model.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash + +set -e + +# Parse command line arguments +MMPROJ="" +DEBUG="" +while [[ $# -gt 0 ]]; do + case $1 in + --mmproj) + MMPROJ="--mmproj" + shift + ;; + --debug) + DEBUG="1" + shift + ;; + *) + shift + ;; + esac +done + +MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}" +OUTPUT_DIR="${OUTPUT_DIR:-../../models}" +TYPE="${OUTTYPE:-f16}" +METADATA_OVERRIDE="${METADATA_OVERRIDE:-}" +CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf" + +echo "Model path: ${MODEL_PATH}" +echo "Model name: ${MODEL_NAME}" +echo "Data type: ${TYPE}" +echo "Converted model path:: ${CONVERTED_MODEL}" +echo "Metadata override: ${METADATA_OVERRIDE}" + +if [[ -n "$DEBUG" ]]; then + CMD_ARGS=("python" "-m" "pdb") +else + CMD_ARGS=("python") +fi +CMD_ARGS+=("../../convert_hf_to_gguf.py" "--verbose") +CMD_ARGS+=("${MODEL_PATH}") +CMD_ARGS+=("--outfile" "${CONVERTED_MODEL}") +CMD_ARGS+=("--outtype" "${TYPE}") +[[ -n "$METADATA_OVERRIDE" ]] && CMD_ARGS+=("--metadata" "${METADATA_OVERRIDE}") +[[ -n "$MMPROJ" ]] && CMD_ARGS+=("${MMPROJ}") + +"${CMD_ARGS[@]}" + +echo "" +echo "The environment variable CONVERTED_MODEL can be set to this path using:" +echo "export CONVERTED_MODEL=$(realpath ${CONVERTED_MODEL})" +if [[ -n "$MMPROJ" ]]; then + mmproj_file="${OUTPUT_DIR}/mmproj-$(basename "${CONVERTED_MODEL}")" + echo "The mmproj model was created in $(realpath "$mmproj_file")" +fi diff --git a/llama.cpp/examples/model-conversion/scripts/causal/modelcard.template b/llama.cpp/examples/model-conversion/scripts/causal/modelcard.template new file mode 100644 index 0000000..a045950 --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/causal/modelcard.template @@ -0,0 +1,13 @@ +--- +base_model: +- {base_model} +--- +# {model_name} GGUF + +Recommended way to run this model: + +```sh +llama-server -hf {namespace}/{model_name}-GGUF +``` + +Then, access http://localhost:8080 diff --git a/llama.cpp/examples/model-conversion/scripts/causal/run-casual-gen-embeddings-org.py b/llama.cpp/examples/model-conversion/scripts/causal/run-casual-gen-embeddings-org.py new file mode 100755 index 0000000..4ab778f --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/causal/run-casual-gen-embeddings-org.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 + +import argparse +import os +import importlib +import torch +import numpy as np + +from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM +from pathlib import Path + +unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME') + +parser = argparse.ArgumentParser(description='Process model with specified path') +parser.add_argument('--model-path', '-m', help='Path to the model') +args = parser.parse_args() + +model_path = os.environ.get('MODEL_PATH', args.model_path) +if model_path is None: + parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable") + +config = AutoConfig.from_pretrained(model_path) + +print("Model type: ", config.model_type) +print("Vocab size: ", 
config.vocab_size) +print("Hidden size: ", config.hidden_size) +print("Number of layers: ", config.num_hidden_layers) +print("BOS token id: ", config.bos_token_id) +print("EOS token id: ", config.eos_token_id) + +print("Loading model and tokenizer using AutoTokenizer:", model_path) +tokenizer = AutoTokenizer.from_pretrained(model_path) + +if unreleased_model_name: + model_name_lower = unreleased_model_name.lower() + unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}" + class_name = f"{unreleased_model_name}ForCausalLM" + print(f"Importing unreleased model module: {unreleased_module_path}") + + try: + model_class = getattr(importlib.import_module(unreleased_module_path), class_name) + model = model_class.from_pretrained(model_path) + except (ImportError, AttributeError) as e: + print(f"Failed to import or load model: {e}") + print("Falling back to AutoModelForCausalLM") + model = AutoModelForCausalLM.from_pretrained(model_path) +else: + model = AutoModelForCausalLM.from_pretrained(model_path) +print(f"Model class: {type(model)}") +#print(f"Model file: {type(model).__module__}") + +model_name = os.path.basename(model_path) +print(f"Model name: {model_name}") + +prompt = "Hello world today" +input_ids = tokenizer(prompt, return_tensors="pt").input_ids +print(f"Input tokens: {input_ids}") +print(f"Input text: {repr(prompt)}") +print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}") + +with torch.no_grad(): + outputs = model(input_ids, output_hidden_states=True) + + # Extract hidden states from the last layer + # outputs.hidden_states is a tuple of (num_layers + 1) tensors + # Index -1 gets the last layer, shape: [batch_size, seq_len, hidden_size] + last_hidden_states = outputs.hidden_states[-1] + + # Get embeddings for all tokens + token_embeddings = last_hidden_states[0].float().cpu().numpy() # Remove batch dimension + + print(f"Hidden states shape: {last_hidden_states.shape}") + print(f"Token embeddings shape: {token_embeddings.shape}") + print(f"Hidden dimension: {token_embeddings.shape[-1]}") + print(f"Number of tokens: {token_embeddings.shape[0]}") + + # Save raw token embeddings + data_dir = Path("data") + data_dir.mkdir(exist_ok=True) + bin_filename = data_dir / f"pytorch-{model_name}-embeddings.bin" + txt_filename = data_dir / f"pytorch-{model_name}-embeddings.txt" + + # Save all token embeddings as binary + print(token_embeddings) + token_embeddings.astype(np.float32).tofile(bin_filename) + + # Save as text for inspection + with open(txt_filename, "w") as f: + for i, embedding in enumerate(token_embeddings): + for j, val in enumerate(embedding): + f.write(f"{i} {j} {val:.6f}\n") + + # Print embeddings per token in the requested format + print("\nToken embeddings:") + tokens = tokenizer.convert_ids_to_tokens(input_ids[0]) + for i, embedding in enumerate(token_embeddings): + # Format: show first few values, ..., then last few values + if len(embedding) > 10: + # Show first 3 and last 3 values with ... in between + first_vals = " ".join(f"{val:8.6f}" for val in embedding[:3]) + last_vals = " ".join(f"{val:8.6f}" for val in embedding[-3:]) + print(f"embedding {i}: {first_vals} ... 
{last_vals}") + else: + # If embedding is short, show all values + vals = " ".join(f"{val:8.6f}" for val in embedding) + print(f"embedding {i}: {vals}") + + # Also show token info for reference + print(f"\nToken reference:") + for i, token in enumerate(tokens): + print(f" Token {i}: {repr(token)}") + + print(f"Saved bin logits to: {bin_filename}") + print(f"Saved txt logist to: {txt_filename}") diff --git a/llama.cpp/examples/model-conversion/scripts/causal/run-converted-model-embeddings-logits.sh b/llama.cpp/examples/model-conversion/scripts/causal/run-converted-model-embeddings-logits.sh new file mode 100755 index 0000000..1b5ff86 --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/causal/run-converted-model-embeddings-logits.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +set -e + +# First try command line argument, then environment variable, then file +CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}" +BUILD_DIR="${2:-"$BUILD_DIR"}" + +# Final check if we have a model path +if [ -z "$CONVERTED_MODEL" ]; then + echo "Error: Model path must be provided either as:" >&2 + echo " 1. Command line argument" >&2 + echo " 2. CONVERTED_MODEL environment variable" >&2 + exit 1 +fi + +if [ -z "$BUILD_DIR" ]; then + BUILD_DIR="../../build" +fi + +cmake --build ${BUILD_DIR} --target llama-debug -j8 + +${BUILD_DIR}/bin/llama-debug -m $CONVERTED_MODEL --embedding -p "Hello world today" --save-logits diff --git a/llama.cpp/examples/model-conversion/scripts/causal/run-converted-model.sh b/llama.cpp/examples/model-conversion/scripts/causal/run-converted-model.sh new file mode 100755 index 0000000..b684804 --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/causal/run-converted-model.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +set -e + +# First try command line argument, then environment variable, then file +CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}" +MODEL_TESTING_PROMPT="${2:-"$MODEL_TESTING_PROMPT"}" +BUILD_DIR="${3:-"$BUILD_DIR"}" + +if [ -z "$MODEL_TESTING_PROMPT" ]; then + MODEL_TESTING_PROMPT="Hello, my name is" +fi + +if [ -z "$BUILD_DIR" ]; then + BUILD_DIR="../../build" +fi + +# Final check if we have a model path +if [ -z "$CONVERTED_MODEL" ]; then + echo "Error: Model path must be provided either as:" >&2 + echo " 1. Command line argument" >&2 + echo " 2. 
CONVERTED_MODEL environment variable" >&2 + exit 1 +fi + +echo $CONVERTED_MODEL +echo $MODEL_TESTING_PROMPT + +cmake --build ${BUILD_DIR} --target llama-debug -j8 + +${BUILD_DIR}/bin/llama-debug -m "$CONVERTED_MODEL" -p "$MODEL_TESTING_PROMPT" --save-logits diff --git a/llama.cpp/examples/model-conversion/scripts/causal/run-org-model.py b/llama.cpp/examples/model-conversion/scripts/causal/run-org-model.py new file mode 100755 index 0000000..215f1a9 --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/causal/run-org-model.py @@ -0,0 +1,168 @@ +#!/usr/bin/env python3 + +import argparse +import os +import sys +import importlib +import torch +import numpy as np + +from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForImageTextToText, AutoConfig + +# Add parent directory to path for imports +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) +from utils.common import debug_hook, save_output_data + +def parse_arguments(): + parser = argparse.ArgumentParser(description="Process model with specified path") + parser.add_argument("--model-path", "-m", help="Path to the model") + parser.add_argument("--prompt-file", "-f", help="Optional prompt file", required=False) + parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose debug output") + parser.add_argument("--device", "-d", help="Device to use (cpu, cuda, mps, auto)", default="auto") + return parser.parse_args() + +def load_model_and_tokenizer(model_path, device="auto"): + print("Loading model and tokenizer using AutoTokenizer:", model_path) + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + multimodal = False + full_config = config + + # Determine device_map based on device argument + if device == "cpu": + device_map = {"": "cpu"} + print("Forcing CPU usage") + elif device == "auto": + device_map = "auto" + else: + device_map = {"": device} + + print("Model type: ", config.model_type) + if "vocab_size" not in config and "text_config" in config: + config = config.text_config + multimodal = True + + print("Vocab size: ", config.vocab_size) + print("Hidden size: ", config.hidden_size) + print("Number of layers: ", config.num_hidden_layers) + print("BOS token id: ", config.bos_token_id) + print("EOS token id: ", config.eos_token_id) + + unreleased_model_name = os.getenv("UNRELEASED_MODEL_NAME") + if unreleased_model_name: + model_name_lower = unreleased_model_name.lower() + unreleased_module_path = ( + f"transformers.models.{model_name_lower}.modular_{model_name_lower}" + ) + class_name = f"{unreleased_model_name}ForCausalLM" + print(f"Importing unreleased model module: {unreleased_module_path}") + + try: + model_class = getattr(importlib.import_module(unreleased_module_path), class_name) + model = model_class.from_pretrained( + model_path, + device_map=device_map, + offload_folder="offload", + trust_remote_code=True, + config=config + ) + except (ImportError, AttributeError) as e: + print(f"Failed to import or load model: {e}") + exit(1) + else: + if multimodal: + model = AutoModelForImageTextToText.from_pretrained( + model_path, + device_map=device_map, + offload_folder="offload", + trust_remote_code=True, + config=full_config + ) + else: + model = AutoModelForCausalLM.from_pretrained( + model_path, + device_map=device_map, + offload_folder="offload", + trust_remote_code=True, + config=config + ) + + print(f"Model class: {model.__class__.__name__}") + + return model, 
tokenizer, config + +def enable_torch_debugging(model): + for name, module in model.named_modules(): + if len(list(module.children())) == 0: # only leaf modules + module.register_forward_hook(debug_hook(name)) + +def get_prompt(args): + if args.prompt_file: + with open(args.prompt_file, encoding='utf-8') as f: + return f.read() + elif os.getenv("MODEL_TESTING_PROMPT"): + return os.getenv("MODEL_TESTING_PROMPT") + else: + return "Hello, my name is" + +def main(): + args = parse_arguments() + model_path = os.environ.get("MODEL_PATH", args.model_path) + if model_path is None: + print("Error: Model path must be specified either via --model-path argument or MODEL_PATH environment variable") + sys.exit(1) + + + model, tokenizer, config = load_model_and_tokenizer(model_path, args.device) + + if args.verbose: + enable_torch_debugging(model) + + model_name = os.path.basename(model_path) + + # Iterate over the model parameters (the tensors) and get the first one + # and use it to get the device the model is on. + device = next(model.parameters()).device + prompt = get_prompt(args) + input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device) + token_ids = input_ids[0].cpu().tolist() + + print(f"Input tokens: {input_ids}") + print(f"Input text: {repr(prompt)}") + print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}") + + batch_size = 512 + + with torch.no_grad(): + past = None + outputs = None + for i in range(0, input_ids.size(1), batch_size): + print(f"Processing chunk with tokens {i} to {i + batch_size}") + chunk = input_ids[:, i:i + batch_size] + outputs = model(chunk.to(model.device), past_key_values=past, use_cache=True) + past = outputs.past_key_values + + logits = outputs.logits # type: ignore + + # Extract logits for the last token (next token prediction) + last_logits = logits[0, -1, :].float().cpu().numpy() + + print(f"Logits shape: {logits.shape}") + print(f"Last token logits shape: {last_logits.shape}") + print(f"Vocab size: {len(last_logits)}") + + # Print some sample logits for quick verification + print(f"First 10 logits: {last_logits[:10]}") + print(f"Last 10 logits: {last_logits[-10:]}") + + # Show top 5 predicted tokens + top_indices = np.argsort(last_logits)[-5:][::-1] + print("Top 5 predictions:") + for idx in top_indices: + token = tokenizer.decode([idx]) + print(f" Token {idx} ({repr(token)}): {last_logits[idx]:.6f}") + + save_output_data(last_logits, token_ids, prompt, model_name) + +if __name__ == "__main__": + main() diff --git a/llama.cpp/examples/model-conversion/scripts/embedding/compare-embeddings-logits.sh b/llama.cpp/examples/model-conversion/scripts/embedding/compare-embeddings-logits.sh new file mode 100755 index 0000000..984d03e --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/embedding/compare-embeddings-logits.sh @@ -0,0 +1,84 @@ +#!/usr/bin/env bash + +set -e + +# Parse command line arguments +MODEL_PATH="" +MODEL_NAME="" +PROMPTS_FILE="" + +# First argument is always model path +if [ $# -gt 0 ] && [[ "$1" != --* ]]; then + MODEL_PATH="$1" + shift +fi + +# Parse remaining arguments +while [[ $# -gt 0 ]]; do + case $1 in + --prompts-file|-pf) + PROMPTS_FILE="$2" + shift 2 + ;; + *) + # If MODEL_NAME not set and this isn't a flag, use as model name + if [ -z "$MODEL_NAME" ] && [[ "$1" != --* ]]; then + MODEL_NAME="$1" + fi + shift + ;; + esac +done + +# Set defaults +MODEL_PATH="${MODEL_PATH:-"$EMBEDDING_MODEL_PATH"}" +MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}" + 
+CONVERTED_MODEL_PATH="${CONVERTED_EMBEDDING_PATH:-"$CONVERTED_EMBEDDING_MODEL"}" +CONVERTED_MODEL_NAME="${CONVERTED_MODEL_NAME:-$(basename "$CONVERTED_MODEL_PATH" .gguf)}" + +if [ -t 0 ]; then + CPP_EMBEDDINGS="data/llamacpp-${CONVERTED_MODEL_NAME}-embeddings.bin" +else + # Process piped JSON data and convert to binary (matching logits.cpp format) + TEMP_FILE=$(mktemp /tmp/tmp.XXXXXX.binn) + python3 -c " +import json +import sys +import struct + +data = json.load(sys.stdin) + +# Flatten all embeddings completely +flattened = [] +for item in data: + embedding = item['embedding'] + for token_embedding in embedding: + flattened.extend(token_embedding) + +print(f'Total embedding values: {len(flattened)}', file=sys.stderr) + +# Write as binary floats - matches logitc.cpp fwrite format +with open('$TEMP_FILE', 'wb') as f: + for value in flattened: + f.write(struct.pack('f', value)) +" + CPP_EMBEDDINGS="$TEMP_FILE" + trap "rm -f $TEMP_FILE" EXIT +fi + +# Build the semantic_check.py command +SEMANTIC_CMD="python scripts/utils/semantic_check.py --model-path $MODEL_PATH \ + --python-embeddings data/pytorch-${MODEL_NAME}-embeddings.bin \ + --cpp-embeddings $CPP_EMBEDDINGS" + +# Add prompts file if specified, otherwise use default prompt +if [ -n "$PROMPTS_FILE" ]; then + SEMANTIC_CMD="$SEMANTIC_CMD --prompts-file \"$PROMPTS_FILE\"" +else + SEMANTIC_CMD="$SEMANTIC_CMD --prompt \"Hello world today\"" +fi + +# Execute the command +eval $SEMANTIC_CMD + diff --git a/llama.cpp/examples/model-conversion/scripts/embedding/convert-model.sh b/llama.cpp/examples/model-conversion/scripts/embedding/convert-model.sh new file mode 100755 index 0000000..9926350 --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/embedding/convert-model.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +set -e + +# Parse command line arguments +SENTENCE_TRANSFORMERS="" +while [[ $# -gt 0 ]]; do + case $1 in + -st|--sentence-transformers) + SENTENCE_TRANSFORMERS="--sentence-transformers-dense-modules" + shift + ;; + *) + echo "Unknown option: $1" + exit 1 + ;; + esac +done + +MODEL_NAME="${MODEL_NAME:-$(basename "$EMBEDDING_MODEL_PATH")}" +OUTPUT_DIR="${OUTPUT_DIR:-../../models}" +TYPE="${OUTTYPE:-f16}" +METADATA_OVERRIDE="${METADATA_OVERRIDE:-}" +CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf" + +echo "Model path: ${EMBEDDING_MODEL_PATH}" +echo "Model name: ${MODEL_NAME}" +echo "Data type: ${TYPE}" +echo "Converted model path:: ${CONVERTED_MODEL}" +python ../../convert_hf_to_gguf.py --verbose \ + ${EMBEDDING_MODEL_PATH} \ + --outfile ${CONVERTED_MODEL} \ + --outtype ${TYPE} \ + ${SENTENCE_TRANSFORMERS} + +echo "" +echo "The environment variable CONVERTED_EMBEDDING MODEL can be set to this path using:" +echo "export CONVERTED_EMBEDDING_MODEL=$(realpath ${CONVERTED_MODEL})" diff --git a/llama.cpp/examples/model-conversion/scripts/embedding/modelcard.template b/llama.cpp/examples/model-conversion/scripts/embedding/modelcard.template new file mode 100644 index 0000000..9e63042 --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/embedding/modelcard.template @@ -0,0 +1,48 @@ +--- +base_model: +- {base_model} +--- +# {model_name} GGUF + +Recommended way to run this model: + +```sh +llama-server -hf {namespace}/{model_name}-GGUF --embeddings +``` + +Then the endpoint can be accessed at http://localhost:8080/embedding, for +example using `curl`: +```console +curl --request POST \ + --url http://localhost:8080/embedding \ + --header "Content-Type: application/json" \ + --data '{{"input": "Hello embeddings"}}' \ + --silent 
+``` + +Alternatively, the `llama-embedding` command line tool can be used: +```sh +llama-embedding -hf {namespace}/{model_name}-GGUF --verbose-prompt -p "Hello embeddings" +``` + +#### embd_normalize +When a model uses pooling, or the pooling method is specified using `--pooling`, +the normalization can be controlled by the `embd_normalize` parameter. + +The default value is `2` which means that the embeddings are normalized using +the Euclidean norm (L2). Other options are: +* -1 No normalization +* 0 Max absolute +* 1 Taxicab +* 2 Euclidean/L2 +* \>2 P-Norm + +This can be passed in the request body to `llama-server`, for example: +```sh + --data '{{"input": "Hello embeddings", "embd_normalize": -1}}' \ +``` + +And for `llama-embedding`, by passing `--embd-normalize <value>`, for example: +```sh +llama-embedding -hf {namespace}/{model_name}-GGUF --embd-normalize -1 -p "Hello embeddings" +``` diff --git a/llama.cpp/examples/model-conversion/scripts/embedding/run-converted-model.sh b/llama.cpp/examples/model-conversion/scripts/embedding/run-converted-model.sh new file mode 100755 index 0000000..ba8a3af --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/embedding/run-converted-model.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash + +set -e + +# Parse command line arguments +CONVERTED_MODEL="" +PROMPTS_FILE="" +EMBD_NORMALIZE="2" + +while [[ $# -gt 0 ]]; do + case $1 in + -p|--prompts-file) + PROMPTS_FILE="$2" + shift 2 + ;; + --embd-normalize) + EMBD_NORMALIZE="$2" + shift 2 + ;; + *) + if [ -z "$CONVERTED_MODEL" ]; then + CONVERTED_MODEL="$1" + fi + shift + ;; + esac +done + +# First try command line argument, then environment variable +CONVERTED_MODEL="${CONVERTED_MODEL:-"$CONVERTED_EMBEDDING_MODEL"}" +BUILD_DIR="${BUILD_DIR:-"../../build"}" + +# Final check if we have a model path +if [ -z "$CONVERTED_MODEL" ]; then + echo "Error: Model path must be provided either as:" >&2 + echo " 1. Command line argument" >&2 + echo " 2. CONVERTED_EMBEDDING_MODEL environment variable" >&2 + exit 1 +fi + +# Read prompt from file or use default +if [ -n "$PROMPTS_FILE" ]; then + if [ ! 
-f "$PROMPTS_FILE" ]; then + echo "Error: Prompts file '$PROMPTS_FILE' not found" >&2 + exit 1 + fi + PROMPT=$(cat "$PROMPTS_FILE") +else + PROMPT="Hello world today" +fi + +echo $CONVERTED_MODEL + +cmake --build ${BUILD_DIR} --target llama-debug -j8 +${BUILD_DIR}/bin/llama-debug -m "$CONVERTED_MODEL" --embedding -p "$PROMPT" --save-logits --embd-normalize $EMBD_NORMALIZE diff --git a/llama.cpp/examples/model-conversion/scripts/embedding/run-original-model.py b/llama.cpp/examples/model-conversion/scripts/embedding/run-original-model.py new file mode 100755 index 0000000..0802cbc --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/embedding/run-original-model.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python3 + +import argparse +import os +import sys +import importlib + +from transformers import AutoTokenizer, AutoConfig, AutoModel +import torch + +# Add parent directory to path for imports +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) +from utils.common import save_output_data + + +def parse_arguments(): + parser = argparse.ArgumentParser(description='Run original embedding model') + parser.add_argument( + '--model-path', + '-m', + help='Path to the model' + ) + parser.add_argument( + '--prompts-file', + '-p', + help='Path to file containing prompts (one per line)' + ) + parser.add_argument( + '--use-sentence-transformers', + action='store_true', + help=('Use SentenceTransformer to apply all numbered layers ' + '(01_Pooling, 02_Dense, 03_Dense, 04_Normalize)') + ) + parser.add_argument( + '--device', + '-d', + help='Device to use (cpu, cuda, mps, auto)', + default='auto' + ) + return parser.parse_args() + + +def load_model_and_tokenizer(model_path, use_sentence_transformers=False, device="auto"): + if device == "cpu": + device_map = {"": "cpu"} + print("Forcing CPU usage") + elif device == "auto": + # On Mac, "auto" device_map can cause issues with accelerate + # So we detect the best device manually + if torch.cuda.is_available(): + device_map = {"": "cuda"} + print("Using CUDA") + elif torch.backends.mps.is_available(): + device_map = {"": "mps"} + print("Using MPS (Apple Metal)") + else: + device_map = {"": "cpu"} + print("Using CPU") + else: + device_map = {"": device} + + if use_sentence_transformers: + from sentence_transformers import SentenceTransformer + print("Using SentenceTransformer to apply all numbered layers") + model = SentenceTransformer(model_path) + tokenizer = model.tokenizer + config = model[0].auto_model.config # type: ignore + else: + tokenizer = AutoTokenizer.from_pretrained(model_path) + config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + + # This can be used to override the sliding window size for manual testing. This + # can be useful to verify the sliding window attention mask in the original model + # and compare it with the converted .gguf model. 
+ if hasattr(config, 'sliding_window'): + original_sliding_window = config.sliding_window + print(f"Modified sliding window: {original_sliding_window} -> {config.sliding_window}") + + unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME') + print(f"Using unreleased model: {unreleased_model_name}") + if unreleased_model_name: + model_name_lower = unreleased_model_name.lower() + unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}" + class_name = f"{unreleased_model_name}Model" + print(f"Importing unreleased model module: {unreleased_module_path}") + + try: + model_class = getattr(importlib.import_module(unreleased_module_path), class_name) + model = model_class.from_pretrained( + model_path, + device_map=device_map, + offload_folder="offload", + trust_remote_code=True, + config=config + ) + except (ImportError, AttributeError) as e: + print(f"Failed to import or load model: {e}") + sys.exit(1) + else: + model = AutoModel.from_pretrained( + model_path, + device_map=device_map, + offload_folder="offload", + trust_remote_code=True, + config=config + ) + print(f"Model class: {type(model)}") + print(f"Model file: {type(model).__module__}") + + # Verify the model is using the correct sliding window + if hasattr(model.config, 'sliding_window'): # type: ignore + print(f"Model's sliding_window: {model.config.sliding_window}") # type: ignore + else: + print("Model config does not have sliding_window attribute") + + return model, tokenizer, config + + +def get_prompt(args): + if args.prompts_file: + try: + with open(args.prompts_file, 'r', encoding='utf-8') as f: + return f.read().strip() + except FileNotFoundError: + print(f"Error: Prompts file '{args.prompts_file}' not found") + sys.exit(1) + except Exception as e: + print(f"Error reading prompts file: {e}") + sys.exit(1) + else: + return "Hello world today" + + +def main(): + args = parse_arguments() + + model_path = os.environ.get('EMBEDDING_MODEL_PATH', args.model_path) + if model_path is None: + print("Error: Model path must be specified either via --model-path argument " + "or EMBEDDING_MODEL_PATH environment variable") + sys.exit(1) + + # Determine if we should use SentenceTransformer + use_st = ( + args.use_sentence_transformers or os.environ.get('USE_SENTENCE_TRANSFORMERS', '').lower() in ('1', 'true', 'yes') + ) + + model, tokenizer, config = load_model_and_tokenizer(model_path, use_st, args.device) + + # Get the device the model is on + if not use_st: + device = next(model.parameters()).device + else: + # For SentenceTransformer, get device from the underlying model + device = next(model[0].auto_model.parameters()).device # type: ignore + + model_name = os.path.basename(model_path) + + prompt_text = get_prompt(args) + texts = [prompt_text] + + with torch.no_grad(): + if use_st: + embeddings = model.encode(texts, convert_to_numpy=True) + all_embeddings = embeddings # Shape: [batch_size, hidden_size] + + encoded = tokenizer( + texts, + padding=True, + truncation=True, + return_tensors="pt" + ) + tokens = encoded['input_ids'][0] + token_ids = tokens.cpu().tolist() + token_strings = tokenizer.convert_ids_to_tokens(tokens) + for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)): + print(f"{token_id:6d} -> '{token_str}'") + + print(f"Embeddings shape (after all SentenceTransformer layers): {all_embeddings.shape}") + print(f"Embedding dimension: {all_embeddings.shape[1] if len(all_embeddings.shape) > 1 else all_embeddings.shape[0]}") # type: ignore + else: + # Standard approach: use base 
model output only + encoded = tokenizer( + texts, + padding=True, + truncation=True, + return_tensors="pt" + ) + + tokens = encoded['input_ids'][0] + token_ids = tokens.cpu().tolist() + token_strings = tokenizer.convert_ids_to_tokens(tokens) + for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)): + print(f"{token_id:6d} -> '{token_str}'") + + # Move inputs to the same device as the model + encoded = {k: v.to(device) for k, v in encoded.items()} + outputs = model(**encoded) + hidden_states = outputs.last_hidden_state # Shape: [batch_size, seq_len, hidden_size] + + all_embeddings = hidden_states[0].float().cpu().numpy() # Shape: [seq_len, hidden_size] + + print(f"Hidden states shape: {hidden_states.shape}") + print(f"All embeddings shape: {all_embeddings.shape}") + print(f"Embedding dimension: {all_embeddings.shape[1]}") + + if len(all_embeddings.shape) == 1: + n_embd = all_embeddings.shape[0] # type: ignore + n_embd_count = 1 + all_embeddings = all_embeddings.reshape(1, -1) + else: + n_embd = all_embeddings.shape[1] # type: ignore + n_embd_count = all_embeddings.shape[0] # type: ignore + + print() + + for j in range(n_embd_count): + embedding = all_embeddings[j] + print(f"embedding {j}: ", end="") + + # Print first 3 values + for i in range(min(3, n_embd)): + print(f"{embedding[i]:9.6f} ", end="") + + print(" ... ", end="") + + # Print last 3 values + for i in range(n_embd - 3, n_embd): + print(f"{embedding[i]:9.6f} ", end="") + + print() # New line + + print() + + flattened_embeddings = all_embeddings.flatten() + print(f"Total values: {len(flattened_embeddings)} ({n_embd_count} embeddings Ć {n_embd} dimensions)") + print("") + + save_output_data(flattened_embeddings, token_ids, prompt_text, model_name, type_suffix="-embeddings") + + +if __name__ == "__main__": + main() diff --git a/llama.cpp/examples/model-conversion/scripts/utils/__init__.py b/llama.cpp/examples/model-conversion/scripts/utils/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/utils/__init__.py diff --git a/llama.cpp/examples/model-conversion/scripts/utils/check-nmse.py b/llama.cpp/examples/model-conversion/scripts/utils/check-nmse.py new file mode 100755 index 0000000..83f63f9 --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/utils/check-nmse.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python3 + +import numpy as np +import sys +import os +import argparse +from pathlib import Path +from common import get_model_name_from_env_path # type: ignore[import-not-found] + +def calculate_nmse(reference, test): + mse = np.mean((test - reference) ** 2) + ref_var = np.var(reference) + if ref_var == 0: + nmse = float('inf') if mse > 0 else 0.0 + return mse, mse, ref_var + + nmse = mse / ref_var + + return nmse, mse, ref_var + +def load_logits(file_path): + if not os.path.exists(file_path): + raise FileNotFoundError(f"File not found: {file_path}") + + if file_path.suffix == '.npy': + return np.load(file_path) + elif file_path.suffix == '.bin': + return np.fromfile(file_path, dtype=np.float32) + else: + # Try to load as text file + try: + # If it has index format "0: value", extract just values + data = [] + with open(file_path, 'r') as f: + for line in f: + if ':' in line: + # Format: "index: value" + value = float(line.split(':')[1].strip()) + else: + # Just the value + value = float(line.strip()) + data.append(value) + return np.array(data, dtype=np.float32) + except: + return np.loadtxt(file_path, dtype=np.float32) + +def interpret_nmse(nmse): 
+ """Provide interpretation of NMSE value""" + if nmse == 0: + return "Perfect match", "š" + elif nmse < 1e-6: + return "Essentially identical", "ā
" + elif nmse < 1e-4: + return "Excellent match", "ā
" + elif nmse < 1e-3: + return "Very good match", "š" + elif nmse < 1e-2: + return "Good match", "š" + elif nmse < 0.1: + return "Acceptable match", "ā ļø" + elif nmse < 1.0: + return "Poor match", "ā" + else: + return "Very poor match (worse than noise)", "ā" + +def main(): + parser = argparse.ArgumentParser(description='Validate model logits') + parser.add_argument('-m', '--model-path', required=True, help='Path to the model directory') + args = parser.parse_args() + + model_name = get_model_name_from_env_path('MODEL_PATH') + data_dir = Path("data") + + pytorch_file = data_dir / f"pytorch-{model_name}.bin" + + llamacpp_model_name = get_model_name_from_env_path('CONVERTED_MODEL') + llamacpp_file = data_dir / f"llamacpp-{llamacpp_model_name}.bin" + + print(f"Model name: {model_name}") + print(f"PyTorch logits file: {pytorch_file}") + print(f"llama.cpp logits file: {llamacpp_file}") + + reference_file = pytorch_file + test_file = llamacpp_file + + print("š NMSE Check for Model Comparison") + print("=" * 50) + print(f"Reference (ground truth): {reference_file}") + print(f"Test (to evaluate): {test_file}") + print() + + try: + print("Loading reference logits...") + reference = load_logits(reference_file) + print(f" Shape: {reference.shape}, Type: {reference.dtype}") + + print("Loading test logits...") + test = load_logits(test_file) + print(f" Shape: {test.shape}, Type: {test.dtype}") + + # Check shapes match + if reference.shape != test.shape: + print(f"\nā Error: Shape mismatch!") + print(f" Reference: {reference.shape}") + print(f" Test: {test.shape}") + sys.exit(1) + + print(f"\nā
Shapes match: {reference.shape}") + + nmse, mse, ref_var = calculate_nmse(reference, test) + + # Additional metrics + max_abs_error = np.max(np.abs(test - reference)) + mean_abs_error = np.mean(np.abs(test - reference)) + + # Results + print(f"\nš METRICS") + print("=" * 30) + print(f"MSE (Mean Squared Error): {mse:.6e}") + print(f"Reference Variance: {ref_var:.6e}") + print(f"NMSE: {nmse:.6e}") + print(f"Max Absolute Error: {max_abs_error:.6f}") + print(f"Mean Absolute Error: {mean_abs_error:.6f}") + + # NMSE in dB (common in signal processing) + if nmse > 0: + nmse_db = 10 * np.log10(nmse) + print(f"NMSE (dB): {nmse_db:.2f} dB") + + # Interpretation + interpretation, emoji = interpret_nmse(nmse) + print(f"\nšÆ INTERPRETATION") + print("=" * 30) + print(f"{emoji} {interpretation}") + + # Detailed guidance + print(f"\nš GUIDANCE") + print("=" * 30) + if nmse < 1e-3: + print("ā
EXCELLENT: Your GGML conversion is working very well!") + print(" The differences are negligible for practical use.") + elif nmse < 1e-2: + print("š GOOD: Your GGML conversion is working well.") + print(" Small differences are likely due to precision/quantization.") + elif nmse < 0.1: + print("ā ļø ACCEPTABLE: Conversion is working but with some differences.") + print(" Check if you're using quantization (Q4, Q8, etc.)") + print(" Test generation quality to see if it's acceptable.") + else: + print("ā PROBLEMATIC: Large differences detected.") + print(" Check your conversion process for potential issues.") + print(" Verify you're using the same model weights.") + + # NMSE benchmarks + print(f"\nš NMSE BENCHMARKS") + print("=" * 30) + print("< 1e-6: Essentially identical") + print("< 1e-4: Excellent (typical for good conversions)") + print("< 1e-3: Very good") + print("< 1e-2: Good (acceptable for most use cases)") + print("< 0.1: Acceptable (may need verification)") + print("> 1.0: Poor (worse than random)") + + # Exit code based on NMSE + if nmse < 1e-2: + print(f"\nā
RESULT: PASS (NMSE = {nmse:.2e})") + sys.exit(0) + else: + print(f"\nā RESULT: NEEDS REVIEW (NMSE = {nmse:.2e})") + sys.exit(1) + + except Exception as e: + print(f"ā Error: {e}") + sys.exit(1) + +if __name__ == "__main__": + main() diff --git a/llama.cpp/examples/model-conversion/scripts/utils/common.py b/llama.cpp/examples/model-conversion/scripts/utils/common.py new file mode 100644 index 0000000..aa4bab2 --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/utils/common.py @@ -0,0 +1,299 @@ +#!/usr/bin/env python3 + +import os +import sys +import torch +import transformers +import json +import textwrap +import numpy as np +from pathlib import Path + + +def get_model_name_from_env_path(env_path_name): + model_path = os.getenv(env_path_name) + if not model_path: + print(f"Error: {env_path_name} environment variable not set") + sys.exit(1) + + if not os.path.exists(model_path): + print(f"Error: Model file not found: {model_path}") + sys.exit(1) + + name = os.path.basename(os.path.normpath(model_path)) + if name.endswith(".gguf"): + name = name[:-5] + + return name + + +def summarize(tensor: torch.Tensor, name: str, max_seq: int = 3, max_vals: int = 3): + """ + Print a tensor in llama.cpp debug style. + + Supports: + - 2D tensors (seq, hidden) + - 3D tensors (batch, seq, hidden) + - 4D tensors (batch, seq, heads, dim_per_head) via flattening heads Ć dim_per_head + + Shows first and last max_vals of each vector per sequence position. + """ + t = tensor.detach().to(torch.float32).cpu() + + # Determine dimensions + if t.ndim == 3: + _, s, _ = t.shape + elif t.ndim == 2: + _, s = 1, t.shape[0] + t = t.unsqueeze(0) + elif t.ndim == 4: + _, s, _, _ = t.shape + else: + print(f"Skipping tensor due to unsupported dimensions: {t.ndim}") + return + + ten_shape = t.shape + + print(f"ggml_debug: {name} = (f32) ... 
= {{{ten_shape}}}") + print(" [") + print(" [") + + # Determine indices for first and last sequences + first_indices = list(range(min(s, max_seq))) + last_indices = list(range(max(0, s - max_seq), s)) + + # Check if there's an overlap between first and last indices or if we're at the edge case of s = 2 * max_seq + has_overlap = bool(set(first_indices) & set(last_indices)) or (max_seq * 2 == s) + + # Combine indices + if has_overlap: + # If there's overlap, just use the combined unique indices + indices = sorted(list(set(first_indices + last_indices))) + separator_index = None + else: + # If no overlap, we'll add a separator between first and last sequences + indices = first_indices + last_indices + separator_index = len(first_indices) + + for i, si in enumerate(indices): + # Add separator if needed + if separator_index is not None and i == separator_index: + print(" ...") + + # Extract appropriate slice + vec = t[0, si] + if vec.ndim == 2: # 4D case: flatten heads Ć dim_per_head + flat = vec.flatten().tolist() + else: # 2D or 3D case + flat = vec.tolist() + + # First and last slices + first = flat[:max_vals] + last = flat[-max_vals:] if len(flat) >= max_vals else flat + first_str = ", ".join(f"{v:12.4f}" for v in first) + last_str = ", ".join(f"{v:12.4f}" for v in last) + + print(f" [{first_str}, ..., {last_str}]") + + print(" ],") + print(" ]") + print(f" sum = {t.sum().item():.6f}\n") + + +def debug_hook(name): + def fn(_m, input, output): + if isinstance(input, torch.Tensor): + summarize(input, name + "_in") + elif isinstance(input, (tuple, list)) and len(input) > 0 and isinstance(input[0], torch.Tensor): + summarize(input[0], name + "_in") + if isinstance(output, torch.Tensor): + summarize(output, name + "_out") + elif isinstance(output, (tuple, list)) and len(output) > 0 and isinstance(output[0], torch.Tensor): + summarize(output[0], name + "_out") + + return fn + + +def setup_rope_debug(model_module_path: str, function_name: str = "apply_rotary_pos_emb"): + """ + Apply monkey patch to dump RoPE activations for debugging. + + Args: + model_module_path: Path to the model module (e.g., "transformers.models.apertus.modeling_apertus") + function_name: Name of the RoPE function to patch (default: "apply_rotary_pos_emb") + + Example: + from utils.common import setup_rope_debug + setup_rope_debug("transformers.models.apertus.modeling_apertus") + """ + import importlib + + # Import the module and get the original function + module = importlib.import_module(model_module_path) + orig_rope = getattr(module, function_name) + + # Set torch print options for better debugging + torch.set_printoptions(threshold=float('inf')) + torch.set_printoptions(precision=6, sci_mode=False) + + def debug_rope(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + # log inputs + summarize(q, "RoPE.q_in") + summarize(k, "RoPE.k_in") + + # call original + q_out, k_out = orig_rope(q, k, cos, sin, position_ids, unsqueeze_dim) + + # log outputs + summarize(q_out, "RoPE.q_out") + summarize(k_out, "RoPE.k_out") + + return q_out, k_out + + # Patch it + setattr(module, function_name, debug_rope) + print(f"RoPE debug patching applied to {model_module_path}.{function_name}") + + +def save_output_data(data, tokens, prompt, model_name, type_suffix="", output_dir="data"): + """ + Save output data (logits/embeddings), tokens, and prompt to files. 
+ + Args: + data: numpy array of floats (logits or embeddings) + tokens: list or array of token IDs + prompt: string containing the input prompt + model_name: name of the model + type_suffix: optional suffix like "-embeddings" (default: "") + output_dir: directory to save files (default: "data") + + Creates the following files in output_dir: + - pytorch-{model_name}{type_suffix}.bin + - pytorch-{model_name}{type_suffix}.txt + - pytorch-{model_name}{type_suffix}-prompt.txt + - pytorch-{model_name}{type_suffix}-tokens.bin + """ + data_dir = Path(output_dir) + data_dir.mkdir(exist_ok=True) + base_path = data_dir / f"pytorch-{model_name}{type_suffix}" + + # Convert and flatten logits/embeddings + data = data.cpu().numpy() if isinstance(data, torch.Tensor) else np.asarray(data) + data = data.flatten() if data.ndim > 1 else data + + # Save logits/embedding files + data.astype(np.float32).tofile(f"{base_path}.bin") + print(f"Data saved to {base_path}.bin") + + with open(f"{base_path}.txt", "w") as f: + f.writelines(f"{i}: {value:.6f}\n" for i, value in enumerate(data)) + print(f"Data saved to {base_path}.txt") + + # Convert and flatten tokens + tokens = tokens.cpu().numpy() if isinstance(tokens, torch.Tensor) else np.asarray(tokens) + tokens = tokens.flatten() if tokens.ndim > 1 else tokens + + # Save token binary file + tokens.astype(np.int32).tofile(f"{base_path}-tokens.bin") + print(f"Tokens saved to {base_path}-tokens.bin") + + # Save prompt file + with open(f"{base_path}-prompt.txt", "w") as f: + f.write(f"prompt: {prompt}\n") + f.write(f"n_tokens: {len(tokens)}\n") + f.write(f"token ids: {', '.join(str(int(tid)) for tid in tokens)}\n") + print(f"Prompt saved to {base_path}-prompt.txt") + + +def compare_tokens(original, converted, type_suffix="", output_dir="data"): + data_dir = Path(output_dir) + + # Read tokens from both models + tokens1_file = data_dir / f"{original}{type_suffix}-tokens.bin" + tokens2_file = data_dir / f"{converted}{type_suffix}-tokens.bin" + + if not tokens1_file.exists(): + print(f"Error: Token file not found: {tokens1_file}") + return False + + if not tokens2_file.exists(): + print(f"Error: Token file not found: {tokens2_file}") + return False + + tokens1 = np.fromfile(tokens1_file, dtype=np.int32) + tokens2 = np.fromfile(tokens2_file, dtype=np.int32) + + print(f"\nComparing tokens between:") + print(f" Original : {original} ({len(tokens1)} tokens)") + print(f" Converted: {converted} ({len(tokens2)} tokens)") + + if len(tokens1) != len(tokens2): + print(f"\nā Token count mismatch: {len(tokens1)} vs {len(tokens2)}") + return False + + if np.array_equal(tokens1, tokens2): + print(f"\nā
All {len(tokens1)} tokens match!") + return True + + mismatches = np.where(tokens1 != tokens2)[0] + print(f"\nā Found {len(mismatches)} mismatched tokens:") + + num_to_show = min(len(mismatches), 10) + for idx in mismatches[:num_to_show]: + print(f" Position {idx}: {tokens1[idx]} vs {tokens2[idx]}") + + if len(mismatches) > num_to_show: + print(f" ... and {len(mismatches) - num_to_show} more mismatches") + + return False + + +def show_version_warning(current_version, model_version): + if not model_version: + return False + + try: + from packaging.version import parse, InvalidVersion + try: + return parse(current_version) < parse(model_version) + except InvalidVersion: + return current_version != model_version + except ImportError: + return current_version != model_version + +def get_model_transformers_version(model_path): + if not model_path: + return None + + config_path = Path(model_path) / "config.json" + if not config_path.is_file(): + return None + + try: + with open(config_path, "r", encoding="utf-8") as f: + config = json.load(f) + return config.get("transformers_version") + except (IOError, json.JSONDecodeError) as e: + print(f"Warning: Could not read or parse {config_path}: {e}", file=sys.stderr) + return None + +def exit_with_warning(message, model_path): + print(message) + + if model_path and transformers is not None: + model_transformers_version = get_model_transformers_version(model_path) + transformers_version = transformers.__version__ + if show_version_warning(transformers_version, model_transformers_version): + warning_message = f""" + ===================================================================== + Verification failure might be due to a transformers version mismatch: + + Current transformers version: {transformers_version} + Model's required version : {model_transformers_version} + + Consider installing the version specified by the model's config: + pip install transformers=={model_transformers_version} + ===================================================================== + """ + print(textwrap.dedent(warning_message)) + sys.exit(1) diff --git a/llama.cpp/examples/model-conversion/scripts/utils/compare_tokens.py b/llama.cpp/examples/model-conversion/scripts/utils/compare_tokens.py new file mode 100755 index 0000000..a286cb5 --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/utils/compare_tokens.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 + +import argparse +import sys +from common import compare_tokens # type: ignore + + +def parse_arguments(): + parser = argparse.ArgumentParser( + description='Compare tokens between two models', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + %(prog)s pytorch-gemma-3-270m-it llamacpp-gemma-3-270m-it-bf16 + """ + ) + parser.add_argument( + 'original', + help='Original model name' + ) + parser.add_argument( + 'converted', + help='Converted model name' + ) + parser.add_argument( + '-s', '--suffix', + default='', + help='Type suffix (e.g., "-embeddings")' + ) + parser.add_argument( + '-d', '--data-dir', + default='data', + help='Directory containing token files (default: data)' + ) + parser.add_argument( + '-v', '--verbose', + action='store_true', + help='Print prompts from both models' + ) + return parser.parse_args() + + +def main(): + args = parse_arguments() + + if args.verbose: + from pathlib import Path + data_dir = Path(args.data_dir) + + prompt1_file = data_dir / f"{args.original}{args.suffix}-prompt.txt" + prompt2_file = data_dir / f"{args.converted}{args.suffix}-prompt.txt" + 
+ if prompt1_file.exists(): + print(f"\nOriginal model prompt ({args.original}):") + print(f" {prompt1_file.read_text().strip()}") + + if prompt2_file.exists(): + print(f"\nConverted model prompt ({args.converted}):") + print(f" {prompt2_file.read_text().strip()}") + + print() + + result = compare_tokens( + args.original, + args.converted, + type_suffix=args.suffix, + output_dir=args.data_dir + ) + + # Enable the script to be used in shell scripts so that they can check + # the exit code for success/failure. + sys.exit(0 if result else 1) + + +if __name__ == "__main__": + main() diff --git a/llama.cpp/examples/model-conversion/scripts/utils/create-collection-add-model.sh b/llama.cpp/examples/model-conversion/scripts/utils/create-collection-add-model.sh new file mode 100644 index 0000000..485001b --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/utils/create-collection-add-model.sh @@ -0,0 +1,8 @@ + +#!/usr/bin/env bash + +COLLECTION_SLUG=$(python ./create_collection.py --return-slug) +echo "Created collection: $COLLECTION_SLUG" + +# Use it in the next command +python add_model_to_collection.py "$COLLECTION_SLUG" "username/my-model" diff --git a/llama.cpp/examples/model-conversion/scripts/utils/curl-embedding-server.sh b/llama.cpp/examples/model-conversion/scripts/utils/curl-embedding-server.sh new file mode 100755 index 0000000..7ed69e1 --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/utils/curl-embedding-server.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash +curl --request POST \ + --url http://localhost:8080/embedding \ + --header "Content-Type: application/json" \ + --data '{"input": "Hello world today"}' \ + --silent diff --git a/llama.cpp/examples/model-conversion/scripts/utils/hf-add-model-to-collection.py b/llama.cpp/examples/model-conversion/scripts/utils/hf-add-model-to-collection.py new file mode 100755 index 0000000..7e38af3 --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/utils/hf-add-model-to-collection.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 + +from huggingface_hub import HfApi +import argparse +import sys + +def add_model_to_collection(collection_slug, model_id, note=""): + """ + Add a model to an existing collection + + Args: + collection_slug: The slug of the collection (e.g., "username/collection-name-12345") + model_id: The model repository ID (e.g., "username/model-name") + note: Optional note about the model + + Returns: + True if successful, False if failed + """ + + # Initialize API + api = HfApi() + + try: + user_info = api.whoami() + print(f"ā
Authenticated as: {user_info['name']}") + + # Verify the model exists + print(f"š Checking if model exists: {model_id}") + try: + model_info = api.model_info(model_id) + except Exception as e: + print(f"ā Model not found or not accessible: {model_id}") + print(f"Error: {e}") + return False + + print(f"š Adding model to collection...") + api.add_collection_item( + collection_slug=collection_slug, + item_id=model_id, + item_type="model", + note=note + ) + + print(f"ā
Model added to collection successfully!") + print(f"š Collection URL: https://huggingface.co/collections/{collection_slug}") + + return True + + except Exception as e: + print(f"ā Error adding model to collection: {e}") + return False + +def main(): + # This script requires that the environment variable HF_TOKEN is set with your + # Hugging Face API token. + api = HfApi() + + parser = argparse.ArgumentParser(description='Add model to a Huggingface Collection') + parser.add_argument('--collection', '-c', help='The collection slug username/collection-hash', required=True) + parser.add_argument('--model', '-m', help='The model to add to the Collection', required=True) + parser.add_argument('--note', '-n', help='An optional note/description', required=False) + args = parser.parse_args() + + collection = args.collection + model = args.model + note = args.note + + success = add_model_to_collection( + collection_slug=collection, + model_id=model, + note=note + ) + + if success: + print("\nš Model added successfully!") + else: + print("\nā Failed to add model to collection") + sys.exit(1) +if __name__ == "__main__": + main() diff --git a/llama.cpp/examples/model-conversion/scripts/utils/hf-create-collection.py b/llama.cpp/examples/model-conversion/scripts/utils/hf-create-collection.py new file mode 100755 index 0000000..e0fa60a --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/utils/hf-create-collection.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 + +from huggingface_hub import HfApi +import argparse +import os +import sys + + +def create_collection(title, description, private=False, namespace=None, return_slug=False): + """ + Create a new collection on Hugging Face + + Args: + title: Collection title + description: Collection description + private: Whether the collection should be private (default: False) + namespace: Optional namespace (defaults to your username) + + Returns: + Collection object if successful, None if failed + """ + + # Check if HF_TOKEN is available + token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN") + if not token: + print("ā No HF_TOKEN or HUGGINGFACE_HUB_TOKEN found in environment variables") + print("Please set your Hugging Face token as an environment variable") + return None + + # Initialize API + api = HfApi() + + try: + # Test authentication first + user_info = api.whoami() + if not return_slug: + print(f"ā
Authenticated as: {user_info['name']}") + + # Create the collection + if not return_slug: + print(f"š Creating collection: '{title}'...") + collection = api.create_collection( + title=title, + description=description, + private=private, + namespace=namespace + ) + + if not return_slug: + print(f"ā
Collection created successfully!") + print(f"š Collection slug: {collection.slug}") + print(f"š Collection URL: https://huggingface.co/collections/{collection.slug}") + + return collection + + except Exception as e: + print(f"ā Error creating collection: {e}") + return None + +def main(): + # This script requires that the environment variable HF_TOKEN is set with your + # Hugging Face API token. + api = HfApi() + + parser = argparse.ArgumentParser(description='Create a Huggingface Collection') + parser.add_argument('--name', '-n', help='The name/title of the Collection', required=True) + parser.add_argument('--description', '-d', help='The description for the Collection', required=True) + parser.add_argument('--namespace', '-ns', help='The namespace to add the Collection to', required=True) + parser.add_argument('--private', '-p', help='Create a private Collection', action='store_true') # Fixed + parser.add_argument('--return-slug', '-s', help='Only output the collection slug', action='store_true') # Fixed + + args = parser.parse_args() + + name = args.name + description = args.description + private = args.private + namespace = args.namespace + return_slug = args.return_slug + + if not return_slug: + print("š Creating Hugging Face Collection") + print(f"Title: {name}") + print(f"Description: {description}") + print(f"Namespace: {namespace}") + print(f"Private: {private}") + + collection = create_collection( + title=name, + description=description, + private=private, + namespace=namespace, + return_slug=return_slug + ) + + if collection: + if return_slug: + print(collection.slug) + else: + print("\nš Collection created successfully!") + print(f"Use this slug to add models: {collection.slug}") + else: + print("\nā Failed to create collection") + sys.exit(1) + +if __name__ == "__main__": + main() diff --git a/llama.cpp/examples/model-conversion/scripts/utils/hf-create-model.py b/llama.cpp/examples/model-conversion/scripts/utils/hf-create-model.py new file mode 100755 index 0000000..ea99bd8 --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/utils/hf-create-model.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 + +from huggingface_hub import HfApi +import argparse + +# This script requires that the environment variable HF_TOKEN is set with your +# Hugging Face API token. 
+api = HfApi() + +def load_template_and_substitute(template_path, **kwargs): + try: + with open(template_path, 'r', encoding='utf-8') as f: + template_content = f.read() + + return template_content.format(**kwargs) + except FileNotFoundError: + print(f"Template file '{template_path}' not found!") + return None + except KeyError as e: + print(f"Missing template variable: {e}") + return None + +parser = argparse.ArgumentParser(description='Create a new Hugging Face model repository') +parser.add_argument('--model-name', '-m', help='Name for the model', required=True) +parser.add_argument('--namespace', '-ns', help='Namespace to add the model to', required=True) +parser.add_argument('--org-base-model', '-b', help='Original Base model name', default="") +parser.add_argument('--no-card', action='store_true', help='Skip creating model card') +parser.add_argument('--private', '-p', action='store_true', help='Create private model') +parser.add_argument('--embedding', '-e', action='store_true', help='Use embedding model card template') +parser.add_argument('--dry-run', '-d', action='store_true', help='Print repository info and template without creating repository') + +args = parser.parse_args() + +repo_id = f"{args.namespace}/{args.model_name}-GGUF" +print("Repository ID: ", repo_id) + +repo_url = None +if not args.dry_run: + repo_url = api.create_repo( + repo_id=repo_id, + repo_type="model", + private=args.private, + exist_ok=False + ) + +if not args.no_card: + if args.embedding: + template_path = "scripts/embedding/modelcard.template" + else: + template_path = "scripts/causal/modelcard.template" + + print("Template path: ", template_path) + + model_card_content = load_template_and_substitute( + template_path, + model_name=args.model_name, + namespace=args.namespace, + base_model=args.org_base_model, + ) + + if args.dry_run: + print("\nTemplate Content:\n") + print(model_card_content) + else: + if model_card_content: + api.upload_file( + path_or_fileobj=model_card_content.encode('utf-8'), + path_in_repo="README.md", + repo_id=repo_id + ) + print("Model card created successfully.") + else: + print("Failed to create model card.") + +if not args.dry_run and repo_url: + print(f"Repository created: {repo_url}") + + diff --git a/llama.cpp/examples/model-conversion/scripts/utils/hf-upload-gguf-model.py b/llama.cpp/examples/model-conversion/scripts/utils/hf-upload-gguf-model.py new file mode 100755 index 0000000..15ccb11 --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/utils/hf-upload-gguf-model.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 + +from huggingface_hub import HfApi +import argparse +import os + +def upload_gguf_file(local_file_path, repo_id, filename_in_repo=None): + """ + Upload a GGUF file to a Hugging Face model repository + + Args: + local_file_path: Path to your local GGUF file + repo_id: Your repository ID (e.g., "username/model-name") + filename_in_repo: Optional custom name for the file in the repo + """ + + if not os.path.exists(local_file_path): + print(f"ā File not found: {local_file_path}") + return False + + if filename_in_repo is None: + filename_in_repo = os.path.basename(local_file_path) + + if filename_in_repo is None or filename_in_repo == "": + filename_in_repo = os.path.basename(local_file_path) + + print(f"š¤ Uploading {local_file_path} to {repo_id}/{filename_in_repo}") + + api = HfApi() + + try: + api.upload_file( + path_or_fileobj=local_file_path, + path_in_repo=filename_in_repo, + repo_id=repo_id, + repo_type="model", + commit_message=f"Upload 
{filename_in_repo}" + ) + + print("ā
Upload successful!") + print(f"š File available at: https://huggingface.co/{repo_id}/blob/main/{filename_in_repo}") + return True + + except Exception as e: + print(f"ā Upload failed: {e}") + return False + +# This script requires that the environment variable HF_TOKEN is set with your +# Hugging Face API token. +api = HfApi() + +parser = argparse.ArgumentParser(description='Upload a GGUF model to a Huggingface model repository') +parser.add_argument('--gguf-model-path', '-m', help='The GGUF model file to upload', required=True) +parser.add_argument('--repo-id', '-r', help='The repository to upload to', required=True) +parser.add_argument('--name', '-o', help='The name in the model repository', required=False) +args = parser.parse_args() + +upload_gguf_file(args.gguf_model_path, args.repo_id, args.name) diff --git a/llama.cpp/examples/model-conversion/scripts/utils/inspect-converted-model.sh b/llama.cpp/examples/model-conversion/scripts/utils/inspect-converted-model.sh new file mode 100755 index 0000000..32d8482 --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/utils/inspect-converted-model.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +# First try command line argument, then environment variable, then file +CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}" + +# Final check if we have a model path +if [ -z "$CONVERTED_MODEL" ]; then + echo "Error: Model path must be provided either as:" >&2 + echo " 1. Command line argument" >&2 + echo " 2. CONVERTED_MODEL environment variable" >&2 + exit 1 +fi + +../../gguf-py/gguf/scripts/gguf_dump.py $CONVERTED_MODEL diff --git a/llama.cpp/examples/model-conversion/scripts/utils/inspect-org-model.py b/llama.cpp/examples/model-conversion/scripts/utils/inspect-org-model.py new file mode 100755 index 0000000..bc6f45a --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/utils/inspect-org-model.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 + +import argparse +import os +import json +from safetensors import safe_open +from collections import defaultdict + +parser = argparse.ArgumentParser(description='Process model with specified path') +parser.add_argument('--model-path', '-m', help='Path to the model') +args = parser.parse_args() + +model_path = os.environ.get('MODEL_PATH', args.model_path) +if model_path is None: + parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable") + +# Check if there's an index file (multi-file model) +index_path = os.path.join(model_path, "model.safetensors.index.json") +single_file_path = os.path.join(model_path, "model.safetensors") + +if os.path.exists(index_path): + # Multi-file model + print("Multi-file model detected") + + with open(index_path, 'r') as f: + index_data = json.load(f) + + # Get the weight map (tensor_name -> file_name) + weight_map = index_data.get("weight_map", {}) + + # Group tensors by file for efficient processing + file_tensors = defaultdict(list) + for tensor_name, file_name in weight_map.items(): + file_tensors[file_name].append(tensor_name) + + print("Tensors in model:") + + # Process each shard file + for file_name, tensor_names in file_tensors.items(): + file_path = os.path.join(model_path, file_name) + print(f"\n--- From {file_name} ---") + + with safe_open(file_path, framework="pt") as f: + for tensor_name in sorted(tensor_names): + tensor = f.get_tensor(tensor_name) + print(f"- {tensor_name} : shape = {tensor.shape}, dtype = {tensor.dtype}") + +elif os.path.exists(single_file_path): + # Single file model (original behavior) + 
print("Single-file model detected") + + with safe_open(single_file_path, framework="pt") as f: + keys = f.keys() + print("Tensors in model:") + for key in sorted(keys): + tensor = f.get_tensor(key) + print(f"- {key} : shape = {tensor.shape}, dtype = {tensor.dtype}") + +else: + print(f"Error: Neither 'model.safetensors.index.json' nor 'model.safetensors' found in {model_path}") + print("Available files:") + if os.path.exists(model_path): + for item in sorted(os.listdir(model_path)): + print(f" {item}") + else: + print(f" Directory {model_path} does not exist") + exit(1) diff --git a/llama.cpp/examples/model-conversion/scripts/utils/perplexity-gen.sh b/llama.cpp/examples/model-conversion/scripts/utils/perplexity-gen.sh new file mode 100755 index 0000000..ef4b650 --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/utils/perplexity-gen.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash + +set -e + +CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}" +BUILD_DIR="${2:-"$BUILD_DIR"}" + +# Final check if we have a model path +if [ -z "$CONVERTED_MODEL" ]; then + echo "Error: Model path must be provided either as:" >&2 + echo " 1. Command line argument" >&2 + echo " 2. CONVERTED_MODEL environment variable" >&2 + exit 1 +fi + +# Check if data/wikitext-2-raw directory exists +if [ ! -d "ppl/wikitext-2-raw" ]; then + echo "ppl/wikitext-2-raw directory does not exist. Downloading..." >&2 + mkdir -p ppl + pushd ppl + ./../../../scripts/get-wikitext-2.sh + popd +fi + +mkdir -p ppl +OUTPUTFILE="ppl/$(basename $CONVERTED_MODEL).kld" +echo "Model: $CONVERTED_MODEL" + +if [ -z "$BUILD_DIR" ]; then + BUILD_DIR="../../build" +fi + +cmake --build $BUILD_DIR --target llama-perplexity -j8 + +${BUILD_DIR}/bin/llama-perplexity -m $CONVERTED_MODEL \ + -f ppl/wikitext-2-raw/wiki.test.raw \ + --kl-divergence-base $OUTPUTFILE + +echo "Generated logits in $OUTPUTFILE" + diff --git a/llama.cpp/examples/model-conversion/scripts/utils/perplexity-run-simple.sh b/llama.cpp/examples/model-conversion/scripts/utils/perplexity-run-simple.sh new file mode 100755 index 0000000..20ee965 --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/utils/perplexity-run-simple.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +set -e + +QUANTIZED_MODEL="${1:-"$QUANTIZED_MODEL"}" +BUILD_DIR="${2:-"$BUILD_DIR"}" + +if [ -z "$QUANTIZED_MODEL" ]; then + echo "Error: Model path must be provided either as:" >&2 + echo " 1. Command line argument" >&2 + echo " 2. QUANTIZED_MODEL environment variable" >&2 + exit 1 +fi + +# Check if data/wikitext-2-raw directory exists +if [ ! -d "ppl/wikitext-2-raw" ]; then + echo "ppl/wikitext-2-raw directory does not exist. Downloading..." >&2 + mkdir -p ppl + pushd ppl + ./../../../scripts/get-wikitext-2.sh + popd +fi + +if [ -z "$BUILD_DIR" ]; then + BUILD_DIR="../../build" +fi + +cmake --build $BUILD_DIR --target llama-perplexity -j8 + +${BUILD_DIR}/bin/llama-perplexity -m $QUANTIZED_MODEL -f ppl/wikitext-2-raw/wiki.test.raw + + diff --git a/llama.cpp/examples/model-conversion/scripts/utils/perplexity-run.sh b/llama.cpp/examples/model-conversion/scripts/utils/perplexity-run.sh new file mode 100755 index 0000000..c11f32c --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/utils/perplexity-run.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +set -e + +QUANTIZED_MODEL="${1:-"$QUANTIZED_MODEL"}" +LOGITS_FILE="${2:-"$LOGITS_FILE"}" +BUILD_DIR="${3:-"$BUILD_DIR"}" + +if [ -z "$QUANTIZED_MODEL" ]; then + echo "Error: Model path must be provided either as:" >&2 + echo " 1. Command line argument" >&2 + echo " 2. 
QUANTIZED_MODEL environment variable" >&2 + exit 1 +fi + +if [ ! -f ${LOGITS_FILE} ]; then + echo "Error: logits file '${LOGITS_FILE} was not found" + echo "Did you run the perplexity-gen.sh script?" + exit 1 +fi + +if [ -z "$BUILD_DIR" ]; then + BUILD_DIR="../../build" +fi + +echo "Model: $QUANTIZED_MODEL" +echo "Data file: $LOGITS_FILE" + +cmake --build $BUILD_DIR --target llama-perplexity -j8 + +${BUILD_DIR}/bin/llama-perplexity -m $QUANTIZED_MODEL \ + --kl-divergence-base $LOGITS_FILE \ + --kl-divergence diff --git a/llama.cpp/examples/model-conversion/scripts/utils/quantize.sh b/llama.cpp/examples/model-conversion/scripts/utils/quantize.sh new file mode 100755 index 0000000..4c21a13 --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/utils/quantize.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash + +set -e + +CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}" +QUANTIZED_TYPE="${2:-"$QUANTIZED_TYPE"}" +TOKEN_EMBD_TYPE="${3:-"${TOKEN_EMBD_TYPE}"}" +OUTPUT_TYPE="${4:-"${OUTPUT_TYPE}"}" +BUILD_DIR="${5:-"$BUILD_DIR"}" +QUANTIZED_MODEL=$CONVERTED_MODEL + +# Final check if we have a model path +if [ -z "$CONVERTED_MODEL" ]; then + echo "Error: Model path must be provided either as:" >&2 + echo " 1. Command line argument" >&2 + echo " 2. CONVERTED_MODEL environment variable" >&2 + exit 1 +fi + +if [ -z "$QUANTIZED_TYPE" ]; then + echo "Error: QUANTIZED_TYPE is required" >&2 + exit 1 +fi + +echo $CONVERTED_MODEL + +# Process the quantized model filename +if [[ "$QUANTIZED_MODEL" == *.gguf ]]; then + # Remove .gguf suffix, add quantized type, then add .gguf back + BASE_NAME="${QUANTIZED_MODEL%.gguf}" + QUANTIZED_MODEL="${BASE_NAME}-${QUANTIZED_TYPE}.gguf" +else + echo "Error: QUANTIZED_MODEL must end with .gguf extension" >&2 + exit 1 +fi + +if [ -z "$BUILD_DIR" ]; then + BUILD_DIR="../../build" +fi + +cmake --build $BUILD_DIR --target llama-quantize -j8 + +echo $TOKEN_EMBD_TYPE +echo $OUTPUT_TYPE + +CMD_ARGS=("${BUILD_DIR}/bin/llama-quantize") +[[ -n "$TOKEN_EMBD_TYPE" ]] && CMD_ARGS+=("--token-embedding-type" "$TOKEN_EMBD_TYPE") +[[ -n "$OUTPUT_TYPE" ]] && CMD_ARGS+=("--output-tensor-type" "$OUTPUT_TYPE") +CMD_ARGS+=("$CONVERTED_MODEL" "$QUANTIZED_MODEL" "$QUANTIZED_TYPE") + +"${CMD_ARGS[@]}" + +echo "Quantized model saved to: $QUANTIZED_MODEL" diff --git a/llama.cpp/examples/model-conversion/scripts/utils/run-embedding-server.sh b/llama.cpp/examples/model-conversion/scripts/utils/run-embedding-server.sh new file mode 100755 index 0000000..9f5fc2c --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/utils/run-embedding-server.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +set -e +# +# First try command line argument, then environment variable, then file +CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}" +BUILD_DIR="${2:-"$BUILD_DIR"}" + +# Final check if we have a model path +if [ -z "$CONVERTED_MODEL" ]; then + echo "Error: Model path must be provided either as:" >&2 + echo " 1. Command line argument" >&2 + echo " 2. 
CONVERTED_MODEL environment variable" >&2 + exit 1 +fi + +if [ -z "$BUILD_DIR" ]; then + BUILD_DIR="../../build" +fi + +echo $CONVERTED_MODEL + +cmake --build $BUILD_DIR --target llama-server + +${BUILD_DIR}/bin/llama-server -m $CONVERTED_MODEL \ + --embedding \ + --pooling none diff --git a/llama.cpp/examples/model-conversion/scripts/utils/semantic_check.py b/llama.cpp/examples/model-conversion/scripts/utils/semantic_check.py new file mode 100644 index 0000000..73e20ea --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/utils/semantic_check.py @@ -0,0 +1,242 @@ +#!/usr/bin/env python3 + +import numpy as np +import argparse +import os +import importlib +from pathlib import Path + +from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel +from common import compare_tokens, exit_with_warning # type: ignore[import-not-found] + +unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME') + +def cosine_similarity(a, b=None): + a = np.asarray(a) + if b is None: + b = a + else: + b = np.asarray(b) + + if a.ndim == 1: + a = a.reshape(1, -1) + if b.ndim == 1: + b = b.reshape(1, -1) + + a_norms = np.linalg.norm(a, axis=1, keepdims=True) + b_norms = np.linalg.norm(b, axis=1, keepdims=True) + + a_norms = np.where(a_norms == 0, 1e-8, a_norms) + b_norms = np.where(b_norms == 0, 1e-8, b_norms) + + a_normalized = a / a_norms + b_normalized = b / b_norms + + # Compute cosine similarity + return np.dot(a_normalized, b_normalized.T) + +def load_embeddings_from_file(filename, n_tokens, n_embd): + embeddings = np.fromfile(filename, dtype=np.float32) + # Check if this is pooled (single embedding) or per-token embeddings + if len(embeddings) == n_embd: + return embeddings.reshape(1, n_embd) + else: + return embeddings.reshape(n_tokens, n_embd) + +def test_single_prompt_similarity(python_emb, cpp_emb, tokens, prompt): + np.set_printoptions(suppress=True, precision=6) + print("pytorch embeddings:"); + print(python_emb) + print("llama.cpp embeddings:"); + print(cpp_emb) + print(f"\n=== Prompt: '{prompt}' ===") + print(f"Tokens: {tokens}") + print(f"Embeddings shape: Python {python_emb.shape}, llama.cpp {cpp_emb.shape}") + + n_tokens = len(tokens) + is_pooled = python_emb.shape[0] == 1 + + if is_pooled: + print(f"\n[Pooled Embeddings Mode - comparing single sentence embeddings]") + + # 1. Direct embedding comparison for pooled embeddings + print(f"\n1. Raw Embedding Magnitude Comparison:") + py_mag = np.linalg.norm(python_emb[0]) + cpp_mag = np.linalg.norm(cpp_emb[0]) + ratio = py_mag / cpp_mag if cpp_mag > 0 else float('inf') + print(f" Pooled embedding: Python={py_mag:.3f}, llama.cpp={cpp_mag:.3f}, ratio={ratio:.3f}") + + # 2. Cross-model similarity for pooled embeddings + print(f"\n2. Cross-Model Pooled Embedding Similarity:") + sim = cosine_similarity([python_emb[0]], [cpp_emb[0]])[0][0] + print(f" Cosine similarity: {sim:.6f}") + + return { + 'cross_model_similarities': [sim], + 'similarity_matrix_diff': np.array([[0.0]]), + 'max_diff': 0.0, + 'mean_diff': 0.0, + 'rms_diff': 0.0 + } + else: + # Original per-token comparison logic + # 1. Direct embedding comparison + print(f"\n1. Raw Embedding Magnitude Comparison:") + # Check if the distance of each token embedding from the origin and compare + # if the vectors are on the same "sphere". This does not tell us about + # direction (meaning of the token embedding), just magnitude. 
+ for i in range(n_tokens): + py_mag = np.linalg.norm(python_emb[i]) # calculate standard euclidean norm for Python embeddings + cpp_mag = np.linalg.norm(cpp_emb[i]) # calculate standard euclidean norm for llama.cpp embeddings + ratio = py_mag / cpp_mag if cpp_mag > 0 else float('inf') + print(f" Token {i} ({tokens[i]}): Python={py_mag:.3f}, llama.cpp={cpp_mag:.3f}, ratio={ratio:.3f}") + + # 2. Cosine similarity between tokens within each model + # Here we check the direction of token embeddings to see if the have the + # same meaning (similarity). This is done by calculating cosine similarity + # of a pair of token embeddings within each model. + print(f"\n2. Within-Model Token Similarities:") + print(" Python model:") + for i in range(n_tokens): + for j in range(i+1, n_tokens): + sim = cosine_similarity([python_emb[i]], [python_emb[j]])[0][0] + print(f" {tokens[i]} ā {tokens[j]}: {sim:.4f}") + + print(" llama.cpp model:") + for i in range(n_tokens): + for j in range(i+1, n_tokens): + sim = cosine_similarity([cpp_emb[i]], [cpp_emb[j]])[0][0] + print(f" {tokens[i]} ā {tokens[j]}: {sim:.4f}") + + # 3. Cross-model similarity (same token position) + print(f"\n3. Cross-Model Same-Token Similarities:") + for i in range(n_tokens): + sim = cosine_similarity([python_emb[i]], [cpp_emb[i]])[0][0] + print(f" Token {i} ({tokens[i]}): {sim:.4f}") + + # 4. Similarity matrix comparison + print(f"\n4. Similarity Matrix Differences:") + py_sim_matrix = cosine_similarity(python_emb) + cpp_sim_matrix = cosine_similarity(cpp_emb) + diff_matrix = np.abs(py_sim_matrix - cpp_sim_matrix) + + print(f" Max difference: {np.max(diff_matrix):.4f}") + print(f" Mean difference: {np.mean(diff_matrix):.4f}") + print(f" RMS difference: {np.sqrt(np.mean(diff_matrix**2)):.4f}") + + return { + 'cross_model_similarities': [cosine_similarity([python_emb[i]], [cpp_emb[i]])[0][0] for i in range(n_tokens)], + 'similarity_matrix_diff': diff_matrix, + 'max_diff': np.max(diff_matrix), + 'mean_diff': np.mean(diff_matrix), + 'rms_diff': np.sqrt(np.mean(diff_matrix**2)) + } + +def read_prompt_from_file(file_path): + try: + with open(file_path, 'r', encoding='utf-8') as f: + return f.read().strip() + except FileNotFoundError: + print(f"Error: Prompts file '{file_path}' not found") + exit(1) + except Exception as e: + print(f"Error reading prompts file: {e}") + exit(1) + +def main(): + parser = argparse.ArgumentParser(description='Test semantic similarity between Python and llama.cpp embeddings') + parser.add_argument('--model-path', '-m', required=True, help='Path to the original Python model') + parser.add_argument('--python-embeddings', '-pe', help='Path to pytorch embeddings "logits" binary file') + parser.add_argument('--cpp-embeddings', '-ce', help='Path to llama.cpp embeddings "logits" binary file') + parser.add_argument('--causal', '-c', default=False, help='if the model is causal (default: false)', action='store_true') + parser.add_argument('--prompt', '-p', default='Hello world today', help='Test prompt') + parser.add_argument('--prompts-file', '-pf', help='Path to file containing prompts') + + args = parser.parse_args() + + if args.prompts_file: + prompt = read_prompt_from_file(args.prompts_file) + else: + prompt = args.prompt + + python_emb_path = Path(args.python_embeddings) + cpp_emb_path = Path(args.cpp_embeddings) + + # Extract base names (e.g., "pytorch-model-name-embeddings.bin" -> "pytorch-model-name") + python_model_name = python_emb_path.stem.replace("-embeddings", "") + cpp_model_name = 
cpp_emb_path.stem.replace("-embeddings", "") + + print("Semantic Similarity Test Between Python and llama.cpp Embedding Models") + print("=" * 70) + + # First verify tokens match before comparing embeddings + print("\nš Token Comparison Check") + print("=" * 70) + data_dir = python_emb_path.parent + if not compare_tokens(python_model_name, cpp_model_name, type_suffix="-embeddings", output_dir=str(data_dir)): + exit_with_warning("\nā Token mismatch detected", args.model_path) + print() + + # Single prompt detailed comparison + print(f"\nTesting with prompt: '{prompt}'") + + # Load the python model to get configuration information and also to load the tokenizer. + print("Loading model and tokenizer using AutoTokenizer:", args.model_path) + tokenizer = AutoTokenizer.from_pretrained(args.model_path) + config = AutoConfig.from_pretrained(args.model_path, trust_remote_code=True) + + if unreleased_model_name: + model_name_lower = unreleased_model_name.lower() + unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}" + if args.causal: + class_name = f"{unreleased_model_name}ForCausalLM" + else: + class_name = f"{unreleased_model_name}Model" + print(f"Model class: {class_name}") + print(f"Importing unreleased model module: {unreleased_module_path}") + + try: + model_class = getattr(importlib.import_module(unreleased_module_path), class_name) + model = model_class.from_pretrained(args.model_path) + except (ImportError, AttributeError) as e: + print(f"Failed to import or load model: {e}") + exit(1) + else: + if args.causal: + model = AutoModelForCausalLM.from_pretrained(args.model_path, trust_remote_code=True) + else: + model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True) + + encoded = tokenizer(prompt, return_tensors="pt") + tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0]) + n_tokens = len(tokens) + print(f"n_tokens: {n_tokens}"); + print(f"hidden_size: {model.config.hidden_size}") + + # Load binary embeddings from data directory. + llamacpp_embeddings = load_embeddings_from_file(args.cpp_embeddings, n_tokens, model.config.hidden_size) + python_embeddings = load_embeddings_from_file(args.python_embeddings, n_tokens, model.config.hidden_size) + + # Run comparison + results = test_single_prompt_similarity(python_embeddings, llamacpp_embeddings, tokens, prompt) + + # Summary + print(f"\n=== SUMMARY ===") + avg_cross_sim = np.mean(results['cross_model_similarities']) + print(f"Average cross-model similarity: {avg_cross_sim:.4f}") + print(f"Similarity matrix RMS difference: {results['rms_diff']:.4f}") + + # Quality assessment + if avg_cross_sim > 0.95: + print("ā
EXCELLENT: Models are highly similar") + elif avg_cross_sim > 0.90: + print("ā
VERY GOOD: Models are very similar") + elif avg_cross_sim > 0.80: + print("ā ļø GOOD: Models are reasonably similar") + elif avg_cross_sim > 0.70: + print("ā ļø FAIR: Models have some differences") + else: + exit_with_warning("ā POOR: Models are significantly different", args.model_path) + +if __name__ == "__main__": + main() diff --git a/llama.cpp/examples/model-conversion/scripts/utils/tensor-info.py b/llama.cpp/examples/model-conversion/scripts/utils/tensor-info.py new file mode 100755 index 0000000..12a3430 --- /dev/null +++ b/llama.cpp/examples/model-conversion/scripts/utils/tensor-info.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 + +import argparse +import json +import os +import re +import sys +from pathlib import Path +from typing import Optional +from safetensors import safe_open + + +MODEL_SAFETENSORS_FILE = "model.safetensors" +MODEL_SAFETENSORS_INDEX = "model.safetensors.index.json" + + +def get_weight_map(model_path: Path) -> Optional[dict[str, str]]: + index_file = model_path / MODEL_SAFETENSORS_INDEX + + if index_file.exists(): + with open(index_file, 'r') as f: + index = json.load(f) + return index.get("weight_map", {}) + + return None + + +def get_all_tensor_names(model_path: Path) -> list[str]: + weight_map = get_weight_map(model_path) + + if weight_map is not None: + return list(weight_map.keys()) + + single_file = model_path / MODEL_SAFETENSORS_FILE + if single_file.exists(): + try: + with safe_open(single_file, framework="pt", device="cpu") as f: + return list(f.keys()) + except Exception as e: + print(f"Error reading {single_file}: {e}") + sys.exit(1) + + print(f"Error: No safetensors files found in {model_path}") + sys.exit(1) + + +def find_tensor_file(model_path: Path, tensor_name: str) -> Optional[str]: + weight_map = get_weight_map(model_path) + + if weight_map is not None: + return weight_map.get(tensor_name) + + single_file = model_path / MODEL_SAFETENSORS_FILE + if single_file.exists(): + return single_file.name + + return None + + +def normalize_tensor_name(tensor_name: str) -> str: + normalized = re.sub(r'\.\d+\.', '.#.', tensor_name) + normalized = re.sub(r'\.\d+$', '.#', normalized) + return normalized + + +def list_all_tensors(model_path: Path, unique: bool = False): + tensor_names = get_all_tensor_names(model_path) + + if unique: + seen = set() + for tensor_name in sorted(tensor_names): + normalized = normalize_tensor_name(tensor_name) + if normalized not in seen: + seen.add(normalized) + print(normalized) + else: + for tensor_name in sorted(tensor_names): + print(tensor_name) + + +def print_tensor_info(model_path: Path, tensor_name: str): + tensor_file = find_tensor_file(model_path, tensor_name) + + if tensor_file is None: + print(f"Error: Could not find tensor '{tensor_name}' in model index") + print(f"Model path: {model_path}") + sys.exit(1) + + file_path = model_path / tensor_file + + try: + with safe_open(file_path, framework="pt", device="cpu") as f: + if tensor_name in f.keys(): + tensor_slice = f.get_slice(tensor_name) + shape = tensor_slice.get_shape() + print(f"Tensor: {tensor_name}") + print(f"File: {tensor_file}") + print(f"Shape: {shape}") + else: + print(f"Error: Tensor '{tensor_name}' not found in {tensor_file}") + sys.exit(1) + + except FileNotFoundError: + print(f"Error: The file '{file_path}' was not found.") + sys.exit(1) + except Exception as e: + print(f"An error occurred: {e}") + sys.exit(1) + + +def main(): + parser = argparse.ArgumentParser( + description="Print tensor information from a safetensors model" + ) + 
parser.add_argument( + "tensor_name", + nargs="?", # optional (if --list is used for example) + help="Name of the tensor to inspect" + ) + parser.add_argument( + "-m", "--model-path", + type=Path, + help="Path to the model directory (default: MODEL_PATH environment variable)" + ) + parser.add_argument( + "-l", "--list", + action="store_true", + help="List unique tensor patterns in the model (layer numbers replaced with #)" + ) + + args = parser.parse_args() + + model_path = args.model_path + if model_path is None: + model_path_str = os.environ.get("MODEL_PATH") + if model_path_str is None: + print("Error: --model-path not provided and MODEL_PATH environment variable not set") + sys.exit(1) + model_path = Path(model_path_str) + + if not model_path.exists(): + print(f"Error: Model path does not exist: {model_path}") + sys.exit(1) + + if not model_path.is_dir(): + print(f"Error: Model path is not a directory: {model_path}") + sys.exit(1) + + if args.list: + list_all_tensors(model_path, unique=True) + else: + if args.tensor_name is None: + print("Error: tensor_name is required when not using --list") + sys.exit(1) + print_tensor_info(model_path, args.tensor_name) + + +if __name__ == "__main__": + main() |
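The semantic check performed by semantic_check.py above reduces to two measurements: the cosine similarity between corresponding PyTorch and llama.cpp token embeddings, and the RMS difference between the two models' within-model similarity matrices. The following is a minimal sketch of that calculation on synthetic arrays; the shapes, noise level, and values are illustrative only and are not taken from the scripts.

import numpy as np

def cosine_matrix(a, b):
    # Row-normalize both matrices so entry (i, j) is the cosine similarity
    # between row i of a and row j of b.
    a = a / np.maximum(np.linalg.norm(a, axis=1, keepdims=True), 1e-8)
    b = b / np.maximum(np.linalg.norm(b, axis=1, keepdims=True), 1e-8)
    return a @ b.T

# Stand-ins for per-token embeddings from the two backends (n_tokens x n_embd);
# the real scripts load these from the pytorch-*.bin and llamacpp-*.bin files.
rng = np.random.default_rng(0)
python_emb = rng.normal(size=(3, 8)).astype(np.float32)
cpp_emb = python_emb + rng.normal(scale=0.01, size=(3, 8)).astype(np.float32)

# Cross-model similarity per token position (diagonal of the cross matrix).
cross_sim = np.diag(cosine_matrix(python_emb, cpp_emb))
print("per-token cross-model similarity:", np.round(cross_sim, 4))

# Difference in within-model similarity structure, summarized as an RMS value.
diff = np.abs(cosine_matrix(python_emb, python_emb) - cosine_matrix(cpp_emb, cpp_emb))
print("similarity matrix RMS difference:", float(np.sqrt(np.mean(diff ** 2))))

With nearly identical inputs the per-token similarities land close to 1.0 and the RMS difference close to 0, which is the regime the script reports as EXCELLENT (average cross-model similarity above 0.95).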

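The comparison utilities exchange embeddings and logits as flat, headerless streams of 32-bit floats; load_embeddings_from_file above simply reshapes such a file to (1, n_embd) for pooled output or (n_tokens, n_embd) for per-token output. Below is a small round-trip sketch of that layout, with made-up sizes and a hypothetical temporary file name.

import os
import tempfile

import numpy as np

n_tokens, n_embd = 4, 16  # illustrative sizes, not tied to any real model

# Write per-token embeddings as a raw, headerless float32 stream -- the layout
# the comparison scripts read back with np.fromfile.
emb = np.arange(n_tokens * n_embd, dtype=np.float32).reshape(n_tokens, n_embd)
path = os.path.join(tempfile.gettempdir(), "example-embeddings.bin")  # hypothetical file
emb.tofile(path)

# Recover the layout from the known token count and hidden size, as the loader does:
# a pooled file holds exactly n_embd values, a per-token file holds n_tokens * n_embd.
flat = np.fromfile(path, dtype=np.float32)
restored = flat.reshape(1, n_embd) if flat.size == n_embd else flat.reshape(n_tokens, n_embd)
assert np.array_equal(emb, restored)
print(restored.shape)  # (4, 16)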