#!/usr/bin/env bash set -e # Parse command line arguments MODEL_PATH="" MODEL_NAME="" PROMPTS_FILE="" # First argument is always model path if [ $# -gt 0 ] && [[ "$1" != --* ]]; then MODEL_PATH="$1" shift fi # Parse remaining arguments while [[ $# -gt 0 ]]; do case $1 in --prompts-file|-pf) PROMPTS_FILE="$2" shift 2 ;; *) # If MODEL_NAME not set and this isn't a flag, use as model name if [ -z "$MODEL_NAME" ] && [[ "$1" != --* ]]; then MODEL_NAME="$1" fi shift ;; esac done # Set defaults MODEL_PATH="${MODEL_PATH:-"$EMBEDDING_MODEL_PATH"}" MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}" if [ -t 0 ]; then CPP_EMBEDDINGS="data/llamacpp-${MODEL_NAME}-embeddings.bin" else # Process piped JSON data and convert to binary (matching logits.cpp format) TEMP_FILE=$(mktemp /tmp/tmp.XXXXXX.binn) python3 -c " import json import sys import struct data = json.load(sys.stdin) # Flatten all embeddings completely flattened = [] for item in data: embedding = item['embedding'] for token_embedding in embedding: flattened.extend(token_embedding) print(f'Total embedding values: {len(flattened)}', file=sys.stderr) # Write as binary floats - matches logitc.cpp fwrite format with open('$TEMP_FILE', 'wb') as f: for value in flattened: f.write(struct.pack('f', value)) " CPP_EMBEDDINGS="$TEMP_FILE" trap "rm -f $TEMP_FILE" EXIT fi # Build the semantic_check.py command SEMANTIC_CMD="python scripts/utils/semantic_check.py --model-path $MODEL_PATH \ --python-embeddings data/pytorch-${MODEL_NAME}-embeddings.bin \ --cpp-embeddings $CPP_EMBEDDINGS" # Add prompts file if specified, otherwise use default prompt if [ -n "$PROMPTS_FILE" ]; then SEMANTIC_CMD="$SEMANTIC_CMD --prompts-file \"$PROMPTS_FILE\"" else SEMANTIC_CMD="$SEMANTIC_CMD --prompt \"Hello world today\"" fi # Execute the command eval $SEMANTIC_CMD