examples : add model conversion tool/example (#15455)

* examples : add model conversion tool/example

This commit adds an "example/tool" that is intended to help in the
process of converting models to GGUF. Currently it supports normal
causal models and embedding models. The readme contains instructions and
commands to guide you through the process.

The motivation for this is to have a structured and repeatable process for
model conversions and hopefully with time improve upon it to make the
process easier and more reliable. We have started to use this for new
model conversions internally and will continue doing so and improve it
as we go along. Perhaps with time this should be placed in a different
directory than the examples directory, but for now it seems like a good
place to keep it while we are still developing it.

* squash! examples : add model conversion tool/example

Remove dependency on scikit-learn in model conversion example.

* squash! examples : add model conversion tool/example

Update the transformers dependency to use a non-dev version. Also import
`AutoModelForCausalLM` instead of `AutoModel` to ensure compatibility
with the latest version.

* squash! examples : add model conversion tool/example

Remove the logits requirements file from the all requirements file.
This commit is contained in:
Daniel Bevenius
2025-08-21 12:16:54 +02:00
committed by GitHub
parent b108e42904
commit 2758fa10da
33 changed files with 2230 additions and 0 deletions

View File

@@ -0,0 +1,174 @@
#!/usr/bin/env python3
import numpy as np
import sys
import os
import argparse
from pathlib import Path
def calculate_nmse(reference, test):
    """Compute the normalized mean squared error between two arrays.

    NMSE is the mean squared error of ``test`` against ``reference`` divided
    by the variance of ``reference``.

    Args:
        reference: Array of reference ("ground truth") values.
        test: Array of values to evaluate against ``reference``.

    Returns:
        Tuple ``(nmse, mse, ref_var)``. When the reference has zero variance
        the ratio is undefined, so ``nmse`` is ``0.0`` for an exact match and
        ``inf`` otherwise.
    """
    mse = np.mean((test - reference) ** 2)
    ref_var = np.var(reference)

    if ref_var == 0:
        # Avoid dividing by zero: a constant reference only "matches" when
        # the error is exactly zero.
        nmse = float('inf') if mse > 0 else 0.0
        return nmse, mse, ref_var

    return mse / ref_var, mse, ref_var
def load_logits(file_path):
    """Load a flat array of logits from disk.

    The format is chosen from the file extension:
      * ``.npy`` is read with ``np.load``.
      * ``.bin`` is read as raw float32 values.
      * anything else is parsed as text, one value per line, optionally
        prefixed with an index (``"index: value"``).

    Args:
        file_path: Path to the logits file (``str`` or ``pathlib.Path``).

    Returns:
        NumPy array with the loaded values.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    # Accept plain strings as well; the suffix checks below need a Path.
    file_path = Path(file_path)
    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    if file_path.suffix == '.npy':
        return np.load(file_path)
    if file_path.suffix == '.bin':
        return np.fromfile(file_path, dtype=np.float32)

    # Try to load as text file
    try:
        data = []
        with open(file_path, 'r') as f:
            for line in f:
                if ':' in line:
                    # Format: "index: value" - keep just the value
                    value = float(line.split(':')[1].strip())
                else:
                    # Just the value
                    value = float(line.strip())
                data.append(value)
        return np.array(data, dtype=np.float32)
    except ValueError:
        # A line was not a plain number; fall back to NumPy's text parser.
        # Only parse failures are handled here so that interrupts and I/O
        # errors are not silently swallowed.
        return np.loadtxt(file_path, dtype=np.float32)
def interpret_nmse(nmse):
    """Map an NMSE value to a ``(description, emoji)`` pair."""
    if nmse == 0:
        return "Perfect match", "🎉"

    # (exclusive upper bound, description, emoji), checked in ascending order.
    grades = (
        (1e-6, "Essentially identical", ""),
        (1e-4, "Excellent match", ""),
        (1e-3, "Very good match", "👍"),
        (1e-2, "Good match", "👍"),
        (0.1, "Acceptable match", "⚠️"),
        (1.0, "Poor match", ""),
    )
    for upper_bound, description, emoji in grades:
        if nmse < upper_bound:
            return description, emoji

    # Nothing matched: the value is >= 1.0 (or not comparable, e.g. NaN).
    return "Very poor match (worse than noise)", ""
def main():
    """Compare PyTorch and llama.cpp logits for a model and report the NMSE.

    Reads ``data/pytorch-<model>.bin`` (reference) and
    ``data/llamacpp-<model>.bin`` (test), where ``<model>`` is derived from
    ``--model-path``, prints the error metrics with an interpretation, and
    exits with status 0 when NMSE < 1e-2 and status 1 otherwise (including
    on a shape mismatch or any error while loading).
    """
    parser = argparse.ArgumentParser(description='Validate model logits')
    parser.add_argument('-m', '--model-path', required=True, help='Path to the model directory')
    args = parser.parse_args()

    # normpath drops a trailing path separator; without it "models/foo/"
    # has an empty basename and the logits file names come out wrong.
    model_name = os.path.splitext(os.path.basename(os.path.normpath(args.model_path)))[0]
    data_dir = Path("data")

    pytorch_file = data_dir / f"pytorch-{model_name}.bin"
    llamacpp_file = data_dir / f"llamacpp-{model_name}.bin"

    print(f"Model name: {model_name}")
    print(f"PyTorch logits file: {pytorch_file}")
    print(f"llama.cpp logits file: {llamacpp_file}")

    reference_file = pytorch_file
    test_file = llamacpp_file

    print("📊 NMSE Check for Model Comparison")
    print("=" * 50)
    print(f"Reference (ground truth): {reference_file}")
    print(f"Test (to evaluate): {test_file}")
    print()

    try:
        print("Loading reference logits...")
        reference = load_logits(reference_file)
        print(f" Shape: {reference.shape}, Type: {reference.dtype}")

        print("Loading test logits...")
        test = load_logits(test_file)
        print(f" Shape: {test.shape}, Type: {test.dtype}")

        # Check shapes match
        if reference.shape != test.shape:
            print("\n❌ Error: Shape mismatch!")
            print(f" Reference: {reference.shape}")
            print(f" Test: {test.shape}")
            sys.exit(1)

        print(f"\n✅ Shapes match: {reference.shape}")

        nmse, mse, ref_var = calculate_nmse(reference, test)

        # Additional metrics
        max_abs_error = np.max(np.abs(test - reference))
        mean_abs_error = np.mean(np.abs(test - reference))

        # Results
        print("\n📈 METRICS")
        print("=" * 30)
        print(f"MSE (Mean Squared Error): {mse:.6e}")
        print(f"Reference Variance: {ref_var:.6e}")
        print(f"NMSE: {nmse:.6e}")
        print(f"Max Absolute Error: {max_abs_error:.6f}")
        print(f"Mean Absolute Error: {mean_abs_error:.6f}")

        # NMSE in dB (common in signal processing); undefined for NMSE == 0
        if nmse > 0:
            nmse_db = 10 * np.log10(nmse)
            print(f"NMSE (dB): {nmse_db:.2f} dB")

        # Interpretation
        interpretation, emoji = interpret_nmse(nmse)
        print("\n🎯 INTERPRETATION")
        print("=" * 30)
        print(f"{emoji} {interpretation}")

        # Detailed guidance
        print("\n📋 GUIDANCE")
        print("=" * 30)
        if nmse < 1e-3:
            print("✅ EXCELLENT: Your GGML conversion is working very well!")
            print(" The differences are negligible for practical use.")
        elif nmse < 1e-2:
            print("👍 GOOD: Your GGML conversion is working well.")
            print(" Small differences are likely due to precision/quantization.")
        elif nmse < 0.1:
            print("⚠️ ACCEPTABLE: Conversion is working but with some differences.")
            print(" Check if you're using quantization (Q4, Q8, etc.)")
            print(" Test generation quality to see if it's acceptable.")
        else:
            print("❌ PROBLEMATIC: Large differences detected.")
            print(" Check your conversion process for potential issues.")
            print(" Verify you're using the same model weights.")

        # NMSE benchmarks
        print("\n📚 NMSE BENCHMARKS")
        print("=" * 30)
        print("< 1e-6: Essentially identical")
        print("< 1e-4: Excellent (typical for good conversions)")
        print("< 1e-3: Very good")
        print("< 1e-2: Good (acceptable for most use cases)")
        print("< 0.1: Acceptable (may need verification)")
        print("> 1.0: Poor (worse than random)")

        # Exit code based on NMSE
        if nmse < 1e-2:
            print(f"\n✅ RESULT: PASS (NMSE = {nmse:.2e})")
            sys.exit(0)
        else:
            print(f"\n❌ RESULT: NEEDS REVIEW (NMSE = {nmse:.2e})")
            sys.exit(1)

    except Exception as e:
        # sys.exit() raises SystemExit, which is not an Exception subclass,
        # so the exit calls above pass through this handler untouched.
        print(f"❌ Error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,6 @@
# Create a collection and capture its slug. create_collection.py requires
# --name, --description and --namespace; --return-slug makes it print only
# the slug on success. Replace the placeholder values with your own.
COLLECTION_SLUG=$(python ./create_collection.py \
    --name "My Collection" \
    --description "My converted models" \
    --namespace "username" \
    --return-slug) || exit 1
echo "Created collection: $COLLECTION_SLUG"
# Use it in the next command (both options are required by the script)
python add_model_to_collection.py --collection "$COLLECTION_SLUG" --model "username/my-model"

View File

@@ -0,0 +1,80 @@
#!/usr/bin/env python3
from huggingface_hub import HfApi
import argparse
import sys
def add_model_to_collection(collection_slug, model_id, note=""):
    """
    Attach a model repository to an existing Hugging Face collection

    Args:
        collection_slug: Slug identifying the collection (e.g., "username/collection-name-12345")
        model_id: Repository ID of the model (e.g., "username/model-name")
        note: Optional note stored with the collection item

    Returns:
        True when the model was added, False on any failure
    """
    hub = HfApi()

    try:
        whoami = hub.whoami()
        print(f"✅ Authenticated as: {whoami['name']}")

        # Look the model up first so a bad ID is reported as such instead of
        # as a generic collection error.
        print(f"🔍 Checking if model exists: {model_id}")
        try:
            hub.model_info(model_id)
        except Exception as lookup_error:
            print(f"❌ Model not found or not accessible: {model_id}")
            print(f"Error: {lookup_error}")
            return False

        print("📚 Adding model to collection...")
        hub.add_collection_item(
            collection_slug=collection_slug,
            item_id=model_id,
            item_type="model",
            note=note,
        )

        print("✅ Model added to collection successfully!")
        print(f"🔗 Collection URL: https://huggingface.co/collections/{collection_slug}")
        return True
    except Exception as add_error:
        print(f"❌ Error adding model to collection: {add_error}")
        return False
def main():
    """Parse the command line and add one model to a collection.

    Exits with status 1 when the model could not be added.
    """
    # This script requires that the environment variable HF_TOKEN is set with your
    # Hugging Face API token.
    # No client is created here: add_model_to_collection() builds its own.
    parser = argparse.ArgumentParser(description='Add model to a Huggingface Collection')
    parser.add_argument('--collection', '-c', help='The collection slug username/collection-hash', required=True)
    parser.add_argument('--model', '-m', help='The model to add to the Collection', required=True)
    parser.add_argument('--note', '-n', help='An optional note/description', required=False)
    args = parser.parse_args()

    success = add_model_to_collection(
        collection_slug=args.collection,
        model_id=args.model,
        note=args.note,
    )

    if success:
        print("\n🎉 Model added successfully!")
    else:
        print("\n❌ Failed to add model to collection")
        sys.exit(1)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,106 @@
#!/usr/bin/env python3
from huggingface_hub import HfApi
import argparse
import os
import sys
def create_collection(title, description, private=False, namespace=None, return_slug=False):
    """
    Create a new collection on Hugging Face

    Args:
        title: Collection title
        description: Collection description
        private: Whether the collection should be private (default: False)
        namespace: Optional namespace (defaults to your username)
        return_slug: When True, print no progress messages to stdout so the
            caller can emit just the slug (default: False)

    Returns:
        Collection object if successful, None if failed
    """
    # Check if HF_TOKEN is available
    token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
    if not token:
        # Errors go to stderr so they never end up in a captured slug.
        print("❌ No HF_TOKEN or HUGGINGFACE_HUB_TOKEN found in environment variables", file=sys.stderr)
        print("Please set your Hugging Face token as an environment variable", file=sys.stderr)
        return None

    # Pass the token explicitly so the value found above is the one used,
    # regardless of which of the two variables supplied it.
    api = HfApi(token=token)

    try:
        # Test authentication first
        user_info = api.whoami()
        if not return_slug:
            print(f"✅ Authenticated as: {user_info['name']}")

        # Create the collection
        if not return_slug:
            print(f"📚 Creating collection: '{title}'...")
        collection = api.create_collection(
            title=title,
            description=description,
            private=private,
            namespace=namespace,
        )

        if not return_slug:
            print("✅ Collection created successfully!")
            print(f"📋 Collection slug: {collection.slug}")
            print(f"🔗 Collection URL: https://huggingface.co/collections/{collection.slug}")

        return collection

    except Exception as e:
        print(f"❌ Error creating collection: {e}", file=sys.stderr)
        return None
def main():
    """Parse the command line and create a Hugging Face collection.

    With ``--return-slug`` the only stdout output on success is the
    collection slug, so the script can be used in shell command
    substitution. Exits with status 1 when the collection could not be
    created.
    """
    # This script requires that the environment variable HF_TOKEN is set with your
    # Hugging Face API token.
    # No client is created here: create_collection() builds its own.
    parser = argparse.ArgumentParser(description='Create a Huggingface Collection')
    parser.add_argument('--name', '-n', help='The name/title of the Collection', required=True)
    parser.add_argument('--description', '-d', help='The description for the Collection', required=True)
    parser.add_argument('--namespace', '-ns', help='The namespace to add the Collection to', required=True)
    parser.add_argument('--private', '-p', help='Create a private Collection', action='store_true')
    parser.add_argument('--return-slug', '-s', help='Only output the collection slug', action='store_true')
    args = parser.parse_args()

    name = args.name
    description = args.description
    private = args.private
    namespace = args.namespace
    return_slug = args.return_slug

    if not return_slug:
        print("🚀 Creating Hugging Face Collection")
        print(f"Title: {name}")
        print(f"Description: {description}")
        print(f"Namespace: {namespace}")
        print(f"Private: {private}")

    collection = create_collection(
        title=name,
        description=description,
        private=private,
        namespace=namespace,
        return_slug=return_slug,
    )

    if collection:
        if return_slug:
            print(collection.slug)
        else:
            print("\n🎉 Collection created successfully!")
            print(f"Use this slug to add models: {collection.slug}")
    else:
        # stderr keeps the failure text out of a captured slug.
        print("\n❌ Failed to create collection", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,63 @@
#!/usr/bin/env python3
from huggingface_hub import HfApi
import argparse
# Module-level Hub client used by the create_repo and upload_file calls
# further down in this script.
# This script requires that the environment variable HF_TOKEN is set with your
# Hugging Face API token.
api = HfApi()
def load_template_and_substitute(template_path, **kwargs):
    """Read a template file and fill in its ``{name}`` placeholders.

    Args:
        template_path: Path to a UTF-8 text file in ``str.format`` syntax.
        **kwargs: Values for the template's named placeholders.

    Returns:
        The rendered text, or None (after printing the reason) when the
        template file is missing, references a variable that was not
        supplied, or is not a valid format string.
    """
    try:
        with open(template_path, 'r', encoding='utf-8') as f:
            template_content = f.read()
        return template_content.format(**kwargs)
    except FileNotFoundError:
        print(f"Template file '{template_path}' not found!")
        return None
    except KeyError as e:
        print(f"Missing template variable: {e}")
        return None
    except (ValueError, IndexError) as e:
        # str.format raises these for unbalanced braces or positional
        # fields; report them like the other template problems instead of
        # crashing the caller.
        print(f"Invalid template '{template_path}': {e}")
        return None
# Command-line interface.
parser = argparse.ArgumentParser(description='Create a new Hugging Face model repository')
parser.add_argument('--model-name', '-m', required=True, help='Name for the model')
parser.add_argument('--namespace', '-ns', required=True, help='Namespace to add the model to')
parser.add_argument('--org-base-model', '-b', default="", help='Original Base model name')
parser.add_argument('--no-card', help='Skip creating model card', action='store_true')
parser.add_argument('--private', '-p', help='Create private model', action='store_true')
args = parser.parse_args()

# The repository name is the model name with a "-GGUF" suffix.
repo_id = f"{args.namespace}/{args.model_name}-GGUF"
print("Repository ID: ", repo_id)

repo_url = api.create_repo(
    repo_id=repo_id,
    repo_type="model",
    private=args.private,
    exist_ok=False,
)

if not args.no_card:
    # The template path is relative to the current working directory.
    template_path = "scripts/readme.md.template"
    model_card_content = load_template_and_substitute(
        template_path,
        model_name=args.model_name,
        namespace=args.namespace,
        base_model=args.org_base_model,
    )

    if not model_card_content:
        print("Failed to create model card.")
    else:
        # Upload the rendered card as the repository's README.md.
        api.upload_file(
            path_or_fileobj=model_card_content.encode('utf-8'),
            path_in_repo="README.md",
            repo_id=repo_id,
        )
        print("Model card created successfully.")

print(f"Repository created: {repo_url}")

View File

@@ -0,0 +1,58 @@
#!/usr/bin/env python3
from huggingface_hub import HfApi
import argparse
import os
def upload_gguf_file(local_file_path, repo_id, filename_in_repo=None):
    """
    Push a local GGUF file to a Hugging Face model repository

    Args:
        local_file_path: Location of the GGUF file on disk
        repo_id: Target repository ID (e.g., "username/model-name")
        filename_in_repo: Name to store the file under; when None or empty
            the local file's base name is used

    Returns:
        True if the upload succeeded, False if the local file is missing or
        the upload raised an error
    """
    if not os.path.exists(local_file_path):
        print(f"❌ File not found: {local_file_path}")
        return False

    # None and "" both mean "keep the local file name".
    target_name = filename_in_repo or os.path.basename(local_file_path)

    print(f"📤 Uploading {local_file_path} to {repo_id}/{target_name}")

    hub = HfApi()

    try:
        hub.upload_file(
            path_or_fileobj=local_file_path,
            path_in_repo=target_name,
            repo_id=repo_id,
            repo_type="model",
            commit_message=f"Upload {target_name}",
        )
        print("✅ Upload successful!")
        print(f"🔗 File available at: https://huggingface.co/{repo_id}/blob/main/{target_name}")
        return True
    except Exception as upload_error:
        print(f"❌ Upload failed: {upload_error}")
        return False
# This script requires that the environment variable HF_TOKEN is set with your
# Hugging Face API token.
# No client is created here: upload_gguf_file() builds its own.
parser = argparse.ArgumentParser(description='Upload a GGUF model to a Huggingface model repository')
parser.add_argument('--gguf-model-path', '-m', help='The GGUF model file to upload', required=True)
parser.add_argument('--repo-id', '-r', help='The repository to upload to', required=True)
parser.add_argument('--name', '-o', help='The name in the model repository', required=False)
args = parser.parse_args()

# Report failure through the exit status so calling shells and Makefiles
# can stop on a failed upload.
if not upload_gguf_file(args.gguf_model_path, args.repo_id, args.name):
    raise SystemExit(1)

View File

@@ -0,0 +1,14 @@
#!/bin/bash

# Run gguf_dump.py on a converted model.
# The model path is taken from the first command line argument, falling back
# to the CONVERTED_MODEL environment variable.
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"

# Final check if we have a model path
if [ -z "$CONVERTED_MODEL" ]; then
    echo "Error: Model path must be provided either as:" >&2
    echo " 1. Command line argument" >&2
    echo " 2. CONVERTED_MODEL environment variable" >&2
    exit 1
fi

# Quoted so a path containing spaces is passed as a single argument.
../../gguf-py/gguf/scripts/gguf_dump.py "$CONVERTED_MODEL"

View File

@@ -0,0 +1,67 @@
#!/usr/bin/env python3
import argparse
import os
import json
from safetensors import safe_open
from collections import defaultdict
parser = argparse.ArgumentParser(description='Process model with specified path')
parser.add_argument('--model-path', '-m', help='Path to the model')
args = parser.parse_args()

# An explicit --model-path wins; the MODEL_PATH environment variable is only
# a fallback (same argument-then-environment order as the shell scripts).
model_path = args.model_path or os.environ.get('MODEL_PATH')

if not model_path:
    parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable")

# Check if there's an index file (multi-file model)
index_path = os.path.join(model_path, "model.safetensors.index.json")
single_file_path = os.path.join(model_path, "model.safetensors")

if os.path.exists(index_path):
    # Multi-file model
    print("Multi-file model detected")

    with open(index_path, 'r') as f:
        index_data = json.load(f)

    # Get the weight map (tensor_name -> file_name)
    weight_map = index_data.get("weight_map", {})

    # Group tensors by file so each shard is opened only once
    file_tensors = defaultdict(list)
    for tensor_name, file_name in weight_map.items():
        file_tensors[file_name].append(tensor_name)

    print("Tensors in model:")

    # Process each shard file
    for file_name, tensor_names in file_tensors.items():
        file_path = os.path.join(model_path, file_name)
        print(f"\n--- From {file_name} ---")

        with safe_open(file_path, framework="pt") as f:
            for tensor_name in sorted(tensor_names):
                tensor = f.get_tensor(tensor_name)
                print(f"- {tensor_name} : shape = {tensor.shape}, dtype = {tensor.dtype}")

elif os.path.exists(single_file_path):
    # Single file model (original behavior)
    print("Single-file model detected")

    with safe_open(single_file_path, framework="pt") as f:
        keys = f.keys()
        print("Tensors in model:")
        for key in sorted(keys):
            tensor = f.get_tensor(key)
            print(f"- {key} : shape = {tensor.shape}, dtype = {tensor.dtype}")

else:
    print(f"Error: Neither 'model.safetensors.index.json' nor 'model.safetensors' found in {model_path}")
    print("Available files:")
    if os.path.exists(model_path):
        for item in sorted(os.listdir(model_path)):
            print(f" {item}")
    else:
        print(f" Directory {model_path} does not exist")
    # SystemExit instead of the interactive-only exit() helper.
    raise SystemExit(1)

View File

@@ -0,0 +1,35 @@
#!/bin/bash

set -e

# Generate base logits for a converted model with llama-perplexity.
# The model path is taken from the first command line argument, falling back
# to the CONVERTED_MODEL environment variable.
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"

# Final check if we have a model path
if [ -z "$CONVERTED_MODEL" ]; then
    echo "Error: Model path must be provided either as:" >&2
    echo " 1. Command line argument" >&2
    echo " 2. CONVERTED_MODEL environment variable" >&2
    exit 1
fi

# Check if ppl/wikitext-2-raw directory exists
if [ ! -d "ppl/wikitext-2-raw" ]; then
    echo "ppl/wikitext-2-raw directory does not exist. Downloading..." >&2
    mkdir -p ppl
    pushd ppl
    ./../../../scripts/get-wikitext-2.sh
    popd
fi

mkdir -p ppl
# Expansions are quoted so model paths containing spaces stay one argument.
OUTPUTFILE="ppl/$(basename "$CONVERTED_MODEL").kld"
echo "Model: $CONVERTED_MODEL"

cmake --build ../../build --target llama-perplexity -j8

../.././build/bin/llama-perplexity -m "$CONVERTED_MODEL" \
    -f ppl/wikitext-2-raw/wiki.test.raw \
    --kl-divergence-base "$OUTPUTFILE"

echo "Generated logits in $OUTPUTFILE"

View File

@@ -0,0 +1,27 @@
#!/bin/bash

set -e

# Run llama-perplexity on a quantized model against the wikitext-2 test set.
# The model path is taken from the first command line argument, falling back
# to the QUANTIZED_MODEL environment variable.
QUANTIZED_MODEL="${1:-"$QUANTIZED_MODEL"}"

if [ -z "$QUANTIZED_MODEL" ]; then
    echo "Error: Model path must be provided either as:" >&2
    echo " 1. Command line argument" >&2
    echo " 2. QUANTIZED_MODEL environment variable" >&2
    exit 1
fi

# Check if ppl/wikitext-2-raw directory exists
if [ ! -d "ppl/wikitext-2-raw" ]; then
    echo "ppl/wikitext-2-raw directory does not exist. Downloading..." >&2
    mkdir -p ppl
    pushd ppl
    ./../../../scripts/get-wikitext-2.sh
    popd
fi

cmake --build ../../build --target llama-perplexity -j8

# Quoted so a model path containing spaces is passed as a single argument.
../.././build/bin/llama-perplexity -m "$QUANTIZED_MODEL" -f ppl/wikitext-2-raw/wiki.test.raw

View File

@@ -0,0 +1,28 @@
#!/bin/bash

set -e

# Arguments (each falls back to the environment variable of the same name):
#   $1 - quantized model to evaluate
#   $2 - base logits file (see the perplexity-gen.sh hint below)
QUANTIZED_MODEL="${1:-"$QUANTIZED_MODEL"}"
# Must read $2: reading $1 here would make the logits file the model path.
LOGITS_FILE="${2:-"$LOGITS_FILE"}"

if [ -z "$QUANTIZED_MODEL" ]; then
    echo "Error: Model path must be provided either as:" >&2
    echo " 1. Command line argument" >&2
    echo " 2. QUANTIZED_MODEL environment variable" >&2
    exit 1
fi

# Quoted so that an empty LOGITS_FILE fails this test instead of turning it
# into the always-false one-operand form `[ ! -f ]`.
if [ ! -f "${LOGITS_FILE}" ]; then
    echo "Error: logits file '${LOGITS_FILE}' was not found" >&2
    echo "Did you run the perplexity-gen.sh script?" >&2
    exit 1
fi

echo "Model: $QUANTIZED_MODEL"
echo "Data file: $LOGITS_FILE"

cmake --build ../../build --target llama-perplexity -j8

../.././build/bin/llama-perplexity -m "$QUANTIZED_MODEL" \
    --kl-divergence-base "$LOGITS_FILE" \
    --kl-divergence

View File

@@ -0,0 +1,34 @@
#!/bin/bash

set -e

# Arguments (each falls back to the environment variable of the same name):
#   $1 - converted GGUF model to quantize
#   $2 - quantization type passed to llama-quantize
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
QUANTIZED_TYPE="${2:-"$QUANTIZED_TYPE"}"
QUANTIZED_MODEL=$CONVERTED_MODEL

# Final check if we have a model path
if [ -z "$CONVERTED_MODEL" ]; then
    echo "Error: Model path must be provided either as:" >&2
    echo " 1. Command line argument" >&2
    echo " 2. CONVERTED_MODEL environment variable" >&2
    exit 1
fi

# Without a type the output would be named "<model>-.gguf" and the
# llama-quantize call below would be missing its last argument.
if [ -z "$QUANTIZED_TYPE" ]; then
    echo "Error: Quantization type must be provided either as:" >&2
    echo " 1. Second command line argument" >&2
    echo " 2. QUANTIZED_TYPE environment variable" >&2
    exit 1
fi

echo "$CONVERTED_MODEL"

# Process the quantized model filename
if [[ "$QUANTIZED_MODEL" == *.gguf ]]; then
    # Remove .gguf suffix, add quantized type, then add .gguf back
    BASE_NAME="${QUANTIZED_MODEL%.gguf}"
    QUANTIZED_MODEL="${BASE_NAME}-${QUANTIZED_TYPE}.gguf"
else
    # The name being checked is derived from CONVERTED_MODEL.
    echo "Error: CONVERTED_MODEL must end with .gguf extension" >&2
    exit 1
fi

cmake --build ../../build --target llama-quantize -j8

# Quoted so paths containing spaces are passed as single arguments.
../../build/bin/llama-quantize "$CONVERTED_MODEL" "$QUANTIZED_MODEL" "$QUANTIZED_TYPE"

echo "Quantized model saved to: $QUANTIZED_MODEL"

View File

@@ -0,0 +1,22 @@
#!/bin/bash

set -e

# Start llama-server in embedding mode for a converted model.
# The model path is taken from the first command line argument, falling back
# to the CONVERTED_MODEL environment variable.
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"

# Final check if we have a model path
if [ -z "$CONVERTED_MODEL" ]; then
    echo "Error: Model path must be provided either as:" >&2
    echo " 1. Command line argument" >&2
    echo " 2. CONVERTED_MODEL environment variable" >&2
    exit 1
fi

echo "$CONVERTED_MODEL"

cmake --build ../../build --target llama-server

# Quoted so a model path containing spaces is passed as a single argument.
../../build/bin/llama-server -m "$CONVERTED_MODEL" \
    --embedding \
    --pooling none

View File

@@ -0,0 +1,179 @@
#!/usr/bin/env python3
import numpy as np
import argparse
import os
import importlib
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModel
# Optional model name taken from the environment. When set, main() imports the
# model class from the matching transformers.models.<name> module instead of
# going through the Auto* classes.
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
def cosine_similarity(a, b=None):
    """Return the matrix of cosine similarities between the rows of a and b.

    1-D inputs are treated as a single row. When ``b`` is omitted the rows
    of ``a`` are compared with each other. Zero-length rows get a norm of
    1e-8 so that no division by zero occurs.
    """
    def _unit_rows(vectors):
        # Coerce to a 2-D array and scale every row to unit length.
        rows = np.asarray(vectors)
        if rows.ndim == 1:
            rows = rows.reshape(1, -1)
        lengths = np.linalg.norm(rows, axis=1, keepdims=True)
        lengths = np.where(lengths == 0, 1e-8, lengths)
        return rows / lengths

    lhs = _unit_rows(a)
    rhs = lhs if b is None else _unit_rows(b)

    # Dot products of unit vectors are the cosine similarities.
    return np.dot(lhs, rhs.T)
def load_embeddings_from_file(filename, n_tokens, n_embd):
    """Read raw float32 values from filename as an (n_tokens, n_embd) array."""
    flat_values = np.fromfile(filename, dtype=np.float32)
    return np.reshape(flat_values, (n_tokens, n_embd))
def test_single_prompt_similarity(python_emb, cpp_emb, tokens, prompt):
    """Print a per-token comparison of two embedding matrices for one prompt.

    Args:
        python_emb: Per-token embeddings from the PyTorch model (indexed by
            token position).
        cpp_emb: Per-token embeddings from llama.cpp, same layout.
        tokens: Token strings of the prompt; their count sets how many rows
            are compared.
        prompt: The prompt text, used only in the printed report.

    Returns:
        dict with the keys 'cross_model_similarities' (one value per token),
        'similarity_matrix_diff', 'max_diff', 'mean_diff' and 'rms_diff'.
    """
    np.set_printoptions(suppress=True, precision=6)
    print("pytorch embeddings:")
    print(python_emb)
    print("llama.cpp embeddings:")
    print(cpp_emb)

    print(f"\n=== Prompt: '{prompt}' ===")
    print(f"Tokens: {tokens}")
    print(f"Embeddings shape: Python {python_emb.shape}, llama.cpp {cpp_emb.shape}")

    n_tokens = len(tokens)
    # Every unordered pair of distinct token positions, in row-major order.
    token_pairs = [(i, j) for i in range(n_tokens) for j in range(i + 1, n_tokens)]

    # 1. Direct embedding comparison
    # Compare the distance of each token embedding from the origin, i.e.
    # whether the vectors sit on the same "sphere". This says nothing about
    # direction (meaning of the token embedding), only about magnitude.
    print("\n1. Raw Embedding Magnitude Comparison:")
    for idx, tok in enumerate(tokens):
        py_mag = np.linalg.norm(python_emb[idx])
        cpp_mag = np.linalg.norm(cpp_emb[idx])
        if cpp_mag > 0:
            ratio = py_mag / cpp_mag
        else:
            ratio = float('inf')
        print(f" Token {idx} ({tok}): Python={py_mag:.3f}, llama.cpp={cpp_mag:.3f}, ratio={ratio:.3f}")

    # 2. Cosine similarity between tokens within each model
    # Direction check: similarity of each pair of token embeddings, computed
    # separately inside each model.
    print("\n2. Within-Model Token Similarities:")
    for heading, emb in ((" Python model:", python_emb), (" llama.cpp model:", cpp_emb)):
        print(heading)
        for i, j in token_pairs:
            sim = cosine_similarity([emb[i]], [emb[j]])[0][0]
            print(f" {tokens[i]}{tokens[j]}: {sim:.4f}")

    # 3. Cross-model similarity (same token position)
    print("\n3. Cross-Model Same-Token Similarities:")
    cross_sims = []
    for idx, tok in enumerate(tokens):
        sim = cosine_similarity([python_emb[idx]], [cpp_emb[idx]])[0][0]
        cross_sims.append(sim)
        print(f" Token {idx} ({tok}): {sim:.4f}")

    # 4. Similarity matrix comparison
    print("\n4. Similarity Matrix Differences:")
    diff_matrix = np.abs(cosine_similarity(python_emb) - cosine_similarity(cpp_emb))
    max_diff = np.max(diff_matrix)
    mean_diff = np.mean(diff_matrix)
    rms_diff = np.sqrt(np.mean(diff_matrix**2))

    print(f" Max difference: {max_diff:.4f}")
    print(f" Mean difference: {mean_diff:.4f}")
    print(f" RMS difference: {rms_diff:.4f}")

    return {
        'cross_model_similarities': cross_sims,
        'similarity_matrix_diff': diff_matrix,
        'max_diff': max_diff,
        'mean_diff': mean_diff,
        'rms_diff': rms_diff,
    }
def main():
    """Compare PyTorch and llama.cpp embeddings for a single prompt.

    Loads the tokenizer and model from ``--model-path`` to obtain the
    prompt's tokens and the model's hidden size, reads both embedding files
    as (n_tokens, hidden_size) float32 arrays, prints the detailed
    comparison and finishes with a quality rating based on the average
    cross-model cosine similarity.
    """
    parser = argparse.ArgumentParser(description='Test semantic similarity between Python and llama.cpp embeddings')
    parser.add_argument('--model-path', '-m', required=True, help='Path to the original Python model')
    # Both files are always read below, so a missing option is reported by
    # argparse instead of failing later inside np.fromfile.
    parser.add_argument('--python-embeddings', '-pe', required=True, help='Path to pytorch embeddings "logits" binary file')
    parser.add_argument('--cpp-embeddings', '-ce', required=True, help='Path to llama.cpp embeddings "logits" binary file')
    parser.add_argument('--causal', '-c', default=False, help='if the model is causal (default: false)', action='store_true')
    parser.add_argument('--prompt', '-p', default='Hello world today', help='Test prompt')
    args = parser.parse_args()

    print("Semantic Similarity Test Between Python and llama.cpp Embedding Models")
    print("=" * 70)

    # Single prompt detailed comparison
    print(f"\nTesting with prompt: '{args.prompt}'")

    # Load the python model to get configuration information and also to load the tokenizer.
    print("Loading model and tokenizer using AutoTokenizer:", args.model_path)
    tokenizer = AutoTokenizer.from_pretrained(args.model_path)

    if unreleased_model_name:
        # Import the model class straight from its transformers module.
        model_name_lower = unreleased_model_name.lower()
        unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
        if args.causal:
            class_name = f"{unreleased_model_name}ForCausalLM"
        else:
            class_name = f"{unreleased_model_name}Model"
        print(f"Model class: {class_name}")
        print(f"Importing unreleased model module: {unreleased_module_path}")

        try:
            model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
            model = model_class.from_pretrained(args.model_path)
        except (ImportError, AttributeError) as e:
            print(f"Failed to import or load model: {e}")
            # SystemExit instead of the interactive-only exit() helper.
            raise SystemExit(1)
    else:
        if args.causal:
            model = AutoModelForCausalLM.from_pretrained(args.model_path)
        else:
            model = AutoModel.from_pretrained(args.model_path)

    encoded = tokenizer(args.prompt, return_tensors="pt")
    tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
    n_tokens = len(tokens)
    print(f"n_tokens: {n_tokens}")
    print(f"hidden_size: {model.config.hidden_size}")

    # Load binary embeddings from data directory.
    llamacpp_embeddings = load_embeddings_from_file(args.cpp_embeddings, n_tokens, model.config.hidden_size)
    python_embeddings = load_embeddings_from_file(args.python_embeddings, n_tokens, model.config.hidden_size)

    # Run comparison
    results = test_single_prompt_similarity(python_embeddings, llamacpp_embeddings, tokens, args.prompt)

    # Summary
    print("\n=== SUMMARY ===")
    avg_cross_sim = np.mean(results['cross_model_similarities'])
    print(f"Average cross-model similarity: {avg_cross_sim:.4f}")
    print(f"Similarity matrix RMS difference: {results['rms_diff']:.4f}")

    # Quality assessment
    if avg_cross_sim > 0.95:
        print("✅ EXCELLENT: Models are highly similar")
    elif avg_cross_sim > 0.90:
        print("✅ VERY GOOD: Models are very similar")
    elif avg_cross_sim > 0.80:
        print("⚠️ GOOD: Models are reasonably similar")
    elif avg_cross_sim > 0.70:
        print("⚠️ FAIR: Models have some differences")
    else:
        print("❌ POOR: Models are significantly different")


if __name__ == "__main__":
    main()