model-conversion : add support for SentenceTransformers (#16387)

* model-conversion : add support for SentenceTransformers

This commit adds support for models that use SentenceTransformer layers.

The motivation for this is that if converted model includes any of the
numbered layers specified in the original models repository then these
changes enable these models to be used and verified. Currently the
model-conversion only support the base model output without any of
the additional transformation layers.

Usage:
Convert the model that also includes the SentenceTransformer layers:
```console
(venv) $ export EMBEDDING_MODEL_PATH="~/google/embeddinggemma-300M"
(venv) make embedding-convert-model
```

Verify the produced embeddings from the converted model against the
original model embeddings:
```console
(venv) make embedding-verify-logits-st
```

The original model can be run using SentenceTransformer:
```console
(venv) make embedding-run-original-model-st
```

Run the converted model using "SentenceTransformer" layers whic
enables pooling and normalization:
```console
(venv) make embedding-run-converted-model-st
```

* add model-conversion example requirements

* add support for -st flag in embedding model conversion

This commit add support for the -st flag in the embedding model
conversion script. This will enable models to be converted using
sentence transformers dense layers.
This commit is contained in:
Daniel Bevenius
2025-10-09 14:35:22 +02:00
committed by GitHub
parent 2c0d875ae6
commit 56b4795842
9 changed files with 297 additions and 140 deletions

View File

@@ -2,6 +2,21 @@
set -e
# Parse command line arguments
SENTENCE_TRANSFORMERS=""
while [[ $# -gt 0 ]]; do
case $1 in
-st|--sentence-transformers)
SENTENCE_TRANSFORMERS="--sentence-transformers-dense-modules"
shift
;;
*)
echo "Unknown option: $1"
exit 1
;;
esac
done
MODEL_NAME="${MODEL_NAME:-$(basename "$EMBEDDING_MODEL_PATH")}"
OUTPUT_DIR="${OUTPUT_DIR:-../../models}"
TYPE="${OUTTYPE:-f16}"
@@ -15,7 +30,8 @@ echo "Converted model path:: ${CONVERTED_MODEL}"
python ../../convert_hf_to_gguf.py --verbose \
${EMBEDDING_MODEL_PATH} \
--outfile ${CONVERTED_MODEL} \
--outtype ${TYPE}
--outtype ${TYPE} \
${SENTENCE_TRANSFORMERS}
echo ""
echo "The environment variable CONVERTED_EMBEDDING MODEL can be set to this path using:"

View File

@@ -5,6 +5,7 @@ set -e
# Parse command line arguments
CONVERTED_MODEL=""
PROMPTS_FILE=""
USE_POOLING=""
while [[ $# -gt 0 ]]; do
case $1 in
@@ -12,6 +13,10 @@ while [[ $# -gt 0 ]]; do
PROMPTS_FILE="$2"
shift 2
;;
--pooling)
USE_POOLING="1"
shift
;;
*)
if [ -z "$CONVERTED_MODEL" ]; then
CONVERTED_MODEL="$1"
@@ -47,4 +52,8 @@ echo $CONVERTED_MODEL
cmake --build ../../build --target llama-logits -j8
# TODO: update logits.cpp to accept a --file/-f option for the prompt
../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode "$PROMPT"
if [ -n "$USE_POOLING" ]; then
../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode -pooling "$PROMPT"
else
../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode "$PROMPT"
fi

View File

@@ -14,6 +14,8 @@ unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
parser = argparse.ArgumentParser(description='Process model with specified path')
parser.add_argument('--model-path', '-m', help='Path to the model')
parser.add_argument('--prompts-file', '-p', help='Path to file containing prompts (one per line)')
parser.add_argument('--use-sentence-transformers', action='store_true',
help='Use SentenceTransformer to apply all numbered layers (01_Pooling, 02_Dense, 03_Dense, 04_Normalize)')
args = parser.parse_args()
def read_prompt_from_file(file_path):
@@ -31,41 +33,52 @@ model_path = os.environ.get('EMBEDDING_MODEL_PATH', args.model_path)
if model_path is None:
parser.error("Model path must be specified either via --model-path argument or EMBEDDING_MODEL_PATH environment variable")
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Determine if we should use SentenceTransformer
use_sentence_transformers = args.use_sentence_transformers or os.environ.get('USE_SENTENCE_TRANSFORMERS', '').lower() in ('1', 'true', 'yes')
config = AutoConfig.from_pretrained(model_path)
# This can be used to override the sliding window size for manual testing. This
# can be useful to verify the sliding window attention mask in the original model
# and compare it with the converted .gguf model.
if hasattr(config, 'sliding_window'):
original_sliding_window = config.sliding_window
#original_sliding_window = 6
print(f"Modified sliding window: {original_sliding_window} -> {config.sliding_window}")
print(f"Using unreleased model: {unreleased_model_name}")
if unreleased_model_name:
model_name_lower = unreleased_model_name.lower()
unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
class_name = f"{unreleased_model_name}Model"
print(f"Importing unreleased model module: {unreleased_module_path}")
try:
model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
model = model_class.from_pretrained(model_path, config=config)
except (ImportError, AttributeError) as e:
print(f"Failed to import or load model: {e}")
exit(1)
if use_sentence_transformers:
from sentence_transformers import SentenceTransformer
print("Using SentenceTransformer to apply all numbered layers")
model = SentenceTransformer(model_path)
tokenizer = model.tokenizer
config = model[0].auto_model.config # type: ignore
else:
model = AutoModel.from_pretrained(model_path, config=config)
print(f"Model class: {type(model)}")
print(f"Model file: {type(model).__module__}")
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path)
# This can be used to override the sliding window size for manual testing. This
# can be useful to verify the sliding window attention mask in the original model
# and compare it with the converted .gguf model.
if hasattr(config, 'sliding_window'):
original_sliding_window = config.sliding_window
#original_sliding_window = 6
print(f"Modified sliding window: {original_sliding_window} -> {config.sliding_window}")
print(f"Using unreleased model: {unreleased_model_name}")
if unreleased_model_name:
model_name_lower = unreleased_model_name.lower()
unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
class_name = f"{unreleased_model_name}Model"
print(f"Importing unreleased model module: {unreleased_module_path}")
try:
model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
model = model_class.from_pretrained(model_path, config=config)
except (ImportError, AttributeError) as e:
print(f"Failed to import or load model: {e}")
exit(1)
else:
model = AutoModel.from_pretrained(model_path, config=config)
print(f"Model class: {type(model)}")
print(f"Model file: {type(model).__module__}")
# Verify the model is using the correct sliding window
if hasattr(model.config, 'sliding_window'):
print(f"Model's sliding_window: {model.config.sliding_window}")
else:
print("Model config does not have sliding_window attribute")
if not use_sentence_transformers:
if hasattr(model.config, 'sliding_window'): # type: ignore
print(f"Model's sliding_window: {model.config.sliding_window}") # type: ignore
else:
print("Model config does not have sliding_window attribute")
model_name = os.path.basename(model_path)
@@ -75,34 +88,56 @@ if args.prompts_file:
else:
texts = ["Hello world today"]
encoded = tokenizer(
texts,
padding=True,
truncation=True,
return_tensors="pt"
)
tokens = encoded['input_ids'][0]
token_strings = tokenizer.convert_ids_to_tokens(tokens)
for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
print(f"{token_id:6d} -> '{token_str}'")
with torch.no_grad():
outputs = model(**encoded)
hidden_states = outputs.last_hidden_state # Shape: [batch_size, seq_len, hidden_size]
if use_sentence_transformers:
embeddings = model.encode(texts, convert_to_numpy=True)
all_embeddings = embeddings # Shape: [batch_size, hidden_size]
# Extract embeddings for each token (matching LLAMA_POOLING_TYPE_NONE behavior)
all_embeddings = hidden_states[0].cpu().numpy() # Shape: [seq_len, hidden_size]
encoded = tokenizer(
texts,
padding=True,
truncation=True,
return_tensors="pt"
)
tokens = encoded['input_ids'][0]
token_strings = tokenizer.convert_ids_to_tokens(tokens)
for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
print(f"{token_id:6d} -> '{token_str}'")
print(f"Hidden states shape: {hidden_states.shape}")
print(f"All embeddings shape: {all_embeddings.shape}")
print(f"Embedding dimension: {all_embeddings.shape[1]}")
print(f"Embeddings shape (after all SentenceTransformer layers): {all_embeddings.shape}")
print(f"Embedding dimension: {all_embeddings.shape[1] if len(all_embeddings.shape) > 1 else all_embeddings.shape[0]}") # type: ignore
else:
# Standard approach: use base model output only
encoded = tokenizer(
texts,
padding=True,
truncation=True,
return_tensors="pt"
)
# Print embeddings exactly like embedding.cpp does for LLAMA_POOLING_TYPE_NONE
n_embd = all_embeddings.shape[1]
n_embd_count = all_embeddings.shape[0]
tokens = encoded['input_ids'][0]
token_strings = tokenizer.convert_ids_to_tokens(tokens)
for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
print(f"{token_id:6d} -> '{token_str}'")
print() # Empty line to match C++ output
outputs = model(**encoded)
hidden_states = outputs.last_hidden_state # Shape: [batch_size, seq_len, hidden_size]
all_embeddings = hidden_states[0].cpu().numpy() # Shape: [seq_len, hidden_size]
print(f"Hidden states shape: {hidden_states.shape}")
print(f"All embeddings shape: {all_embeddings.shape}")
print(f"Embedding dimension: {all_embeddings.shape[1]}")
if len(all_embeddings.shape) == 1:
n_embd = all_embeddings.shape[0] # type: ignore
n_embd_count = 1
all_embeddings = all_embeddings.reshape(1, -1)
else:
n_embd = all_embeddings.shape[1] # type: ignore
n_embd_count = all_embeddings.shape[0] # type: ignore
print()
for j in range(n_embd_count):
embedding = all_embeddings[j]
@@ -120,29 +155,23 @@ with torch.no_grad():
print() # New line
print() # Final empty line to match C++ output
print()
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)
bin_filename = data_dir / f"pytorch-{model_name}-embeddings.bin"
txt_filename = data_dir / f"pytorch-{model_name}-embeddings.txt"
# Save all embeddings flattened (matching what embedding.cpp would save if it did)
flattened_embeddings = all_embeddings.flatten()
flattened_embeddings.astype(np.float32).tofile(bin_filename)
with open(txt_filename, "w") as f:
f.write(f"# Model class: {model_name}\n")
f.write(f"# Tokens: {token_strings}\n")
f.write(f"# Shape: {all_embeddings.shape}\n")
f.write(f"# n_embd_count: {n_embd_count}, n_embd: {n_embd}\n\n")
idx = 0
for j in range(n_embd_count):
f.write(f"# Token {j} ({token_strings[j]}):\n")
for i, value in enumerate(all_embeddings[j]):
f.write(f"{j}_{i}: {value:.6f}\n")
f.write("\n")
print(f"Total values: {len(flattened_embeddings)} ({n_embd_count} tokens × {n_embd} dimensions)")
for value in all_embeddings[j]:
f.write(f"{idx}: {value:.6f}\n")
idx += 1
print(f"Total values: {len(flattened_embeddings)} ({n_embd_count} embeddings × {n_embd} dimensions)")
print("")
print(f"Saved bin embeddings to: {bin_filename}")
print(f"Saved txt embeddings to: {txt_filename}")

View File

@@ -35,7 +35,11 @@ def cosine_similarity(a, b=None):
def load_embeddings_from_file(filename, n_tokens, n_embd):
embeddings = np.fromfile(filename, dtype=np.float32)
return embeddings.reshape(n_tokens, n_embd)
# Check if this is pooled (single embedding) or per-token embeddings
if len(embeddings) == n_embd:
return embeddings.reshape(1, n_embd)
else:
return embeddings.reshape(n_tokens, n_embd)
def test_single_prompt_similarity(python_emb, cpp_emb, tokens, prompt):
np.set_printoptions(suppress=True, precision=6)
@@ -48,58 +52,83 @@ def test_single_prompt_similarity(python_emb, cpp_emb, tokens, prompt):
print(f"Embeddings shape: Python {python_emb.shape}, llama.cpp {cpp_emb.shape}")
n_tokens = len(tokens)
is_pooled = python_emb.shape[0] == 1
# 1. Direct embedding comparison
print(f"\n1. Raw Embedding Magnitude Comparison:")
# Check if the distance of each token embedding from the origin and compare
# if the vectors are on the same "sphere". This does not tell us about
# direction (meaning of the token embedding), just magnitude.
for i in range(n_tokens):
py_mag = np.linalg.norm(python_emb[i]) # calculate standard euclidean norm for Python embeddings
cpp_mag = np.linalg.norm(cpp_emb[i]) # calculate standard euclidean norm for llama.cpp embeddings
if is_pooled:
print(f"\n[Pooled Embeddings Mode - comparing single sentence embeddings]")
# 1. Direct embedding comparison for pooled embeddings
print(f"\n1. Raw Embedding Magnitude Comparison:")
py_mag = np.linalg.norm(python_emb[0])
cpp_mag = np.linalg.norm(cpp_emb[0])
ratio = py_mag / cpp_mag if cpp_mag > 0 else float('inf')
print(f" Token {i} ({tokens[i]}): Python={py_mag:.3f}, llama.cpp={cpp_mag:.3f}, ratio={ratio:.3f}")
print(f" Pooled embedding: Python={py_mag:.3f}, llama.cpp={cpp_mag:.3f}, ratio={ratio:.3f}")
# 2. Cosine similarity between tokens within each model
# Here we check the direction of token embeddings to see if the have the
# same meaning (similarity). This is done by calculating cosine similarity
# of a pair of token embeddings within each model.
print(f"\n2. Within-Model Token Similarities:")
print(" Python model:")
for i in range(n_tokens):
for j in range(i+1, n_tokens):
sim = cosine_similarity([python_emb[i]], [python_emb[j]])[0][0]
print(f" {tokens[i]}{tokens[j]}: {sim:.4f}")
# 2. Cross-model similarity for pooled embeddings
print(f"\n2. Cross-Model Pooled Embedding Similarity:")
sim = cosine_similarity([python_emb[0]], [cpp_emb[0]])[0][0]
print(f" Cosine similarity: {sim:.6f}")
print(" llama.cpp model:")
for i in range(n_tokens):
for j in range(i+1, n_tokens):
sim = cosine_similarity([cpp_emb[i]], [cpp_emb[j]])[0][0]
print(f" {tokens[i]}{tokens[j]}: {sim:.4f}")
return {
'cross_model_similarities': [sim],
'similarity_matrix_diff': np.array([[0.0]]),
'max_diff': 0.0,
'mean_diff': 0.0,
'rms_diff': 0.0
}
else:
# Original per-token comparison logic
# 1. Direct embedding comparison
print(f"\n1. Raw Embedding Magnitude Comparison:")
# Check if the distance of each token embedding from the origin and compare
# if the vectors are on the same "sphere". This does not tell us about
# direction (meaning of the token embedding), just magnitude.
for i in range(n_tokens):
py_mag = np.linalg.norm(python_emb[i]) # calculate standard euclidean norm for Python embeddings
cpp_mag = np.linalg.norm(cpp_emb[i]) # calculate standard euclidean norm for llama.cpp embeddings
ratio = py_mag / cpp_mag if cpp_mag > 0 else float('inf')
print(f" Token {i} ({tokens[i]}): Python={py_mag:.3f}, llama.cpp={cpp_mag:.3f}, ratio={ratio:.3f}")
# 3. Cross-model similarity (same token position)
print(f"\n3. Cross-Model Same-Token Similarities:")
for i in range(n_tokens):
sim = cosine_similarity([python_emb[i]], [cpp_emb[i]])[0][0]
print(f" Token {i} ({tokens[i]}): {sim:.4f}")
# 2. Cosine similarity between tokens within each model
# Here we check the direction of token embeddings to see if the have the
# same meaning (similarity). This is done by calculating cosine similarity
# of a pair of token embeddings within each model.
print(f"\n2. Within-Model Token Similarities:")
print(" Python model:")
for i in range(n_tokens):
for j in range(i+1, n_tokens):
sim = cosine_similarity([python_emb[i]], [python_emb[j]])[0][0]
print(f" {tokens[i]}{tokens[j]}: {sim:.4f}")
# 4. Similarity matrix comparison
print(f"\n4. Similarity Matrix Differences:")
py_sim_matrix = cosine_similarity(python_emb)
cpp_sim_matrix = cosine_similarity(cpp_emb)
diff_matrix = np.abs(py_sim_matrix - cpp_sim_matrix)
print(" llama.cpp model:")
for i in range(n_tokens):
for j in range(i+1, n_tokens):
sim = cosine_similarity([cpp_emb[i]], [cpp_emb[j]])[0][0]
print(f" {tokens[i]}{tokens[j]}: {sim:.4f}")
print(f" Max difference: {np.max(diff_matrix):.4f}")
print(f" Mean difference: {np.mean(diff_matrix):.4f}")
print(f" RMS difference: {np.sqrt(np.mean(diff_matrix**2)):.4f}")
# 3. Cross-model similarity (same token position)
print(f"\n3. Cross-Model Same-Token Similarities:")
for i in range(n_tokens):
sim = cosine_similarity([python_emb[i]], [cpp_emb[i]])[0][0]
print(f" Token {i} ({tokens[i]}): {sim:.4f}")
return {
'cross_model_similarities': [cosine_similarity([python_emb[i]], [cpp_emb[i]])[0][0] for i in range(n_tokens)],
'similarity_matrix_diff': diff_matrix,
'max_diff': np.max(diff_matrix),
'mean_diff': np.mean(diff_matrix),
'rms_diff': np.sqrt(np.mean(diff_matrix**2))
}
# 4. Similarity matrix comparison
print(f"\n4. Similarity Matrix Differences:")
py_sim_matrix = cosine_similarity(python_emb)
cpp_sim_matrix = cosine_similarity(cpp_emb)
diff_matrix = np.abs(py_sim_matrix - cpp_sim_matrix)
print(f" Max difference: {np.max(diff_matrix):.4f}")
print(f" Mean difference: {np.mean(diff_matrix):.4f}")
print(f" RMS difference: {np.sqrt(np.mean(diff_matrix**2)):.4f}")
return {
'cross_model_similarities': [cosine_similarity([python_emb[i]], [cpp_emb[i]])[0][0] for i in range(n_tokens)],
'similarity_matrix_diff': diff_matrix,
'max_diff': np.max(diff_matrix),
'mean_diff': np.mean(diff_matrix),
'rms_diff': np.sqrt(np.mean(diff_matrix**2))
}
def read_prompt_from_file(file_path):
try: