	examples : add compare-mlx
Author: Georgi Gerganov

examples/compare-mlx/.gitignore (vendored, new file, +2)
							| @@ -0,0 +1,2 @@ | ||||
| *.txt | ||||
| */ | ||||
							
								
								
									
examples/compare-mlx/compare-mlx.sh (executable file, +706)
							| @@ -0,0 +1,706 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # a script to compare MLX and GGUF models | ||||
| # | ||||
| # usage: | ||||
| #   ./compare-mlx.sh --raw-path wiki.test.raw --no-keep | ||||
| # | ||||
| # TODOs | ||||
| # - add QAT evals | ||||
|  | ||||
| # check if LLAMA_HOME_DIR is set | ||||
| if [[ -z "$LLAMA_HOME_DIR" ]]; then | ||||
|     lcpp_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")"/../../ && pwd) | ||||
| else | ||||
|     lcpp_dir="${LLAMA_HOME_DIR}" | ||||
| fi | ||||
|  | ||||
| echo "Using llama.cpp directory: ${lcpp_dir}" | ||||
|  | ||||
| # check for convert_hf_to_gguf.py | ||||
| if [[ ! -f "${lcpp_dir}/convert_hf_to_gguf.py" ]]; then | ||||
|     echo "convert_hf_to_gguf.py not found in ${lcpp_dir}" | ||||
|     echo "Set a LLAMA_HOME_DIR environment variable to point to your llama.cpp directory" | ||||
|     exit 1 | ||||
| fi | ||||
|  | ||||
| set -x | ||||
|  | ||||
| # sanity checks that all Python dependencies are installed | ||||
| if ! python -c "import mlx.core"; then | ||||
|     echo "MLX not found. Please install MLX" | ||||
|     exit 1 | ||||
| fi | ||||
|  | ||||
| if ! python "${lcpp_dir}/convert_hf_to_gguf.py" --help; then | ||||
|     echo "convert_hf_to_gguf.py not working. Please install llama.cpp python requirements" | ||||
|     exit 1 | ||||
| fi | ||||
|  | ||||
| # by default use the system binaries (for example from brew) | ||||
| llama_perplexity="llama-perplexity" | ||||
|  | ||||
| if [[ ! -z "$LLAMA_PERPLEXITY" ]]; then | ||||
|     llama_perplexity="$LLAMA_PERPLEXITY" | ||||
| fi | ||||
|  | ||||
| echo "Using llama-perplexity: ${llama_perplexity}" | ||||
|  | ||||
| if ! command -v "$llama_perplexity" &> /dev/null; then | ||||
|     echo "llama-perplexity not found. Please install it." | ||||
|     exit 1 | ||||
| fi | ||||
|  | ||||
| llama_quantize="llama-quantize" | ||||
|  | ||||
| if [[ ! -z "$LLAMA_QUANTIZE" ]]; then | ||||
|     llama_quantize="$LLAMA_QUANTIZE" | ||||
| fi | ||||
|  | ||||
| echo "Using llama-quantize: ${llama_quantize}" | ||||
|  | ||||
| if ! command -v "$llama_quantize" &> /dev/null; then | ||||
|     echo "llama-quantize not found. Please install it." | ||||
|     exit 1 | ||||
| fi | ||||
|  | ||||
| llama_batched_bench="llama-batched-bench" | ||||
|  | ||||
| if [[ ! -z "$LLAMA_BATCHED_BENCH" ]]; then | ||||
|     llama_batched_bench="$LLAMA_BATCHED_BENCH" | ||||
| fi | ||||
|  | ||||
| echo "Using llama-batched-bench: ${llama_batched_bench}" | ||||
|  | ||||
| if ! command -v "$llama_batched_bench" &> /dev/null; then | ||||
|     echo "llama-batched-bench not found. Please install it." | ||||
|     exit 1 | ||||
| fi | ||||
|  | ||||
| # get the size in GiB | ||||
| get_size() { | ||||
|     local path="$1" | ||||
|     # du -s reports 512-byte blocks (BSD/macOS default) | ||||
|     local blocks=$(du -s "$path" | awk '{print $1}') | ||||
|     local res=$(echo "scale=3; ($blocks*512)/1024/1024/1024" | bc) | ||||
|     echo "$res" | ||||
| } | ||||
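| # usage: get_size <dir> - used below to record each model's size into a *-size.txt file | ||||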
|  | ||||
| # parameters: | ||||
| # --no-compute      : do not compute anything, just summarize the existing results | ||||
| # --no-ppl          : do not compute ppl | ||||
| # --no-perf         : do not compute performance (speed) metrics | ||||
| # --no-keep         : delete intermediate model files | ||||
| # --num-samples     : number of text samples to evaluate (default: 512) | ||||
| # --sequence-length : sequence length of the samples in tokens (default: 512) | ||||
| # --raw-path        : file with raw text (such as wikitext) | ||||
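| # | ||||
| # example invocation (file name is illustrative): | ||||
| #   ./compare-mlx.sh --raw-path wiki.test.raw --num-samples 256 --sequence-length 1024 --no-perf | ||||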
|  | ||||
| # extra arguments | ||||
| args_lcpp="-t 1" | ||||
|  | ||||
| num_samples=512 | ||||
| sequence_length=512 | ||||
| raw_path="" | ||||
| no_compute=false | ||||
| no_ppl=false | ||||
| no_perf=false | ||||
| no_keep=false | ||||
|  | ||||
| while [[ $# -gt 0 ]]; do | ||||
|     case $1 in | ||||
|         --no-compute) | ||||
|             no_compute=true | ||||
|             shift | ||||
|             ;; | ||||
|         --no-ppl) | ||||
|             no_ppl=true | ||||
|             shift | ||||
|             ;; | ||||
|         --no-perf) | ||||
|             no_perf=true | ||||
|             shift | ||||
|             ;; | ||||
|         --no-keep) | ||||
|             no_keep=true | ||||
|             shift | ||||
|             ;; | ||||
|         --num-samples) | ||||
|             num_samples="$2" | ||||
|             shift 2 | ||||
|             ;; | ||||
|         --sequence-length) | ||||
|             sequence_length="$2" | ||||
|             shift 2 | ||||
|             ;; | ||||
|         --raw-path) | ||||
|             raw_path="$2" | ||||
|             shift 2 | ||||
|             ;; | ||||
|         *) | ||||
|             echo "Unknown parameter: $1" | ||||
|             exit 1 | ||||
|             ;; | ||||
|     esac | ||||
| done | ||||
|  | ||||
| if [[ -z "$raw_path" ]]; then | ||||
|     echo "No raw path provided" | ||||
|     echo "Recommended to use the test set of WikiText from here: https://github.com/ggml-org/llama.cpp/blob/master/scripts/get-wikitext-2.sh" | ||||
|     exit 1 | ||||
| fi | ||||
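|  | ||||
| # one way to obtain the recommended test set (assuming the llama.cpp helper script; | ||||
| # the extracted file name may differ): | ||||
| #   "${lcpp_dir}"/scripts/get-wikitext-2.sh | ||||
| #   ./compare-mlx.sh --raw-path wikitext-2-raw/wiki.test.raw | ||||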
|  | ||||
| eval_model() { | ||||
|     org="$1" | ||||
|     mid="$2" | ||||
|  | ||||
|     echo "Evaluating ${org}/${mid}" | ||||
|  | ||||
|     huggingface-cli download ${org}/${mid} --local-dir ${org}/${mid} | ||||
|  | ||||
|     # generate and process MLX models | ||||
|  | ||||
|     if [[ "$no_compute" == true ]]; then | ||||
|         echo "Skipping computation" | ||||
|     else | ||||
|         rm -rfv ./${mid}-f32-mlx | ||||
|         mlx_lm.convert --hf ./${org}/${mid} --mlx-path ./${mid}-f32-mlx --dtype float32 | ||||
|         get_size ./${mid}-f32-mlx > ./${mid}-f32-mlx-size.txt | ||||
|  | ||||
|         if [[ "$no_ppl" == false ]]; then | ||||
|             python ./mlx-ppl.py --model ./${mid}-f32-mlx --raw-path "$raw_path" --num-samples "$num_samples" --sequence-length "$sequence_length" 2>&1 | tee ./${mid}-f32-mlx-ppl.txt | ||||
|         fi | ||||
|  | ||||
|         # no need for F32 perf benchmarks | ||||
|         #if [[ "$no_perf" == false ]]; then | ||||
|         #    mlx_lm.benchmark --model ./${mid}-f32-mlx -p 2048  -g 128 --num-trials 1 2>&1 | tee ./${mid}-f32-mlx-perf-2048.txt | ||||
|         #    mlx_lm.benchmark --model ./${mid}-f32-mlx -p 4096  -g 128 --num-trials 1 2>&1 | tee ./${mid}-f32-mlx-perf-4096.txt | ||||
|         #    mlx_lm.benchmark --model ./${mid}-f32-mlx -p 8192  -g 128 --num-trials 1 2>&1 | tee ./${mid}-f32-mlx-perf-8192.txt | ||||
|         #    mlx_lm.benchmark --model ./${mid}-f32-mlx -p 16384 -g 128 --num-trials 1 2>&1 | tee ./${mid}-f32-mlx-perf-16384.txt | ||||
|         #    mlx_lm.benchmark --model ./${mid}-f32-mlx -p 32768 -g 128 --num-trials 1 2>&1 | tee ./${mid}-f32-mlx-perf-32768.txt | ||||
|         #fi | ||||
|  | ||||
|         if [[ "$no_keep" == true ]]; then | ||||
|             echo "Deleting intermediate model files" | ||||
|             rm -rfv ./${mid}-f32-mlx | ||||
|         fi | ||||
|  | ||||
|         rm -rfv ./${mid}-bf16-mlx | ||||
|         mlx_lm.convert --hf ./${org}/${mid} --mlx-path ./${mid}-bf16-mlx --dtype bfloat16 | ||||
|         get_size ./${mid}-bf16-mlx > ./${mid}-bf16-mlx-size.txt | ||||
|  | ||||
|         if [[ "$no_ppl" == false ]]; then | ||||
|             python ./mlx-ppl.py --model ./${mid}-bf16-mlx --raw-path "$raw_path" --num-samples "$num_samples" --sequence-length "$sequence_length" 2>&1 | tee ./${mid}-bf16-mlx-ppl.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_perf" == false ]]; then | ||||
|             mlx_lm.benchmark --model ./${mid}-bf16-mlx -p 2048  -g 128 --num-trials 1 2>&1 | tee ./${mid}-bf16-mlx-perf-2048.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-bf16-mlx -p 4096  -g 128 --num-trials 1 2>&1 | tee ./${mid}-bf16-mlx-perf-4096.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-bf16-mlx -p 8192  -g 128 --num-trials 1 2>&1 | tee ./${mid}-bf16-mlx-perf-8192.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-bf16-mlx -p 16384 -g 128 --num-trials 1 2>&1 | tee ./${mid}-bf16-mlx-perf-16384.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-bf16-mlx -p 32768 -g 128 --num-trials 1 2>&1 | tee ./${mid}-bf16-mlx-perf-32768.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_keep" == true ]]; then | ||||
|             echo "Deleting intermediate model files" | ||||
|             rm -rfv ./${mid}-bf16-mlx | ||||
|         fi | ||||
|  | ||||
|         rm -rfv ./${mid}-f16-mlx | ||||
|         mlx_lm.convert --hf ./${org}/${mid} --mlx-path ./${mid}-f16-mlx --dtype float16 | ||||
|         get_size ./${mid}-f16-mlx > ./${mid}-f16-mlx-size.txt | ||||
|  | ||||
|         if [[ "$no_ppl" == false ]]; then | ||||
|             python ./mlx-ppl.py --model ./${mid}-f16-mlx --raw-path "$raw_path" --num-samples "$num_samples" --sequence-length "$sequence_length" 2>&1 | tee ./${mid}-f16-mlx-ppl.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_perf" == false ]]; then | ||||
|             mlx_lm.benchmark --model ./${mid}-f16-mlx -p 2048  -g 128 --num-trials 1 2>&1 | tee ./${mid}-f16-mlx-perf-2048.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-f16-mlx -p 4096  -g 128 --num-trials 1 2>&1 | tee ./${mid}-f16-mlx-perf-4096.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-f16-mlx -p 8192  -g 128 --num-trials 1 2>&1 | tee ./${mid}-f16-mlx-perf-8192.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-f16-mlx -p 16384 -g 128 --num-trials 1 2>&1 | tee ./${mid}-f16-mlx-perf-16384.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-f16-mlx -p 32768 -g 128 --num-trials 1 2>&1 | tee ./${mid}-f16-mlx-perf-32768.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_keep" == true ]]; then | ||||
|             echo "Deleting intermediate model files" | ||||
|             rm -rfv ./${mid}-f16-mlx | ||||
|         fi | ||||
|  | ||||
|         rm -rfv ./${mid}-q8-mlx | ||||
|         mlx_lm.convert --hf ./${org}/${mid} --mlx-path ./${mid}-q8-mlx --quantize --q-bits 8 | ||||
|         get_size ./${mid}-q8-mlx > ./${mid}-q8-mlx-size.txt | ||||
|  | ||||
|         if [[ "$no_ppl" == false ]]; then | ||||
|             python ./mlx-ppl.py --model ./${mid}-q8-mlx --raw-path "$raw_path" --num-samples "$num_samples" --sequence-length "$sequence_length" 2>&1 | tee ./${mid}-q8-mlx-ppl.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_perf" == false ]]; then | ||||
|             mlx_lm.benchmark --model ./${mid}-q8-mlx -p 2048  -g 128 --num-trials 1 2>&1 | tee ./${mid}-q8-mlx-perf-2048.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-q8-mlx -p 4096  -g 128 --num-trials 1 2>&1 | tee ./${mid}-q8-mlx-perf-4096.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-q8-mlx -p 8192  -g 128 --num-trials 1 2>&1 | tee ./${mid}-q8-mlx-perf-8192.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-q8-mlx -p 16384 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q8-mlx-perf-16384.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-q8-mlx -p 32768 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q8-mlx-perf-32768.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_keep" == true ]]; then | ||||
|             echo "Deleting intermediate model files" | ||||
|             rm -rfv ./${mid}-q8-mlx | ||||
|         fi | ||||
|  | ||||
|         rm -rfv ./${mid}-q6-mlx | ||||
|         mlx_lm.convert --hf ./${org}/${mid} --mlx-path ./${mid}-q6-mlx --quantize --q-bits 6 | ||||
|         get_size ./${mid}-q6-mlx > ./${mid}-q6-mlx-size.txt | ||||
|  | ||||
|         if [[ "$no_ppl" == false ]]; then | ||||
|             python ./mlx-ppl.py --model ./${mid}-q6-mlx --raw-path "$raw_path" --num-samples "$num_samples" --sequence-length "$sequence_length" 2>&1 | tee ./${mid}-q6-mlx-ppl.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_perf" == false ]]; then | ||||
|             mlx_lm.benchmark --model ./${mid}-q6-mlx -p 2048  -g 128 --num-trials 1 2>&1 | tee ./${mid}-q6-mlx-perf-2048.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-q6-mlx -p 4096  -g 128 --num-trials 1 2>&1 | tee ./${mid}-q6-mlx-perf-4096.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-q6-mlx -p 8192  -g 128 --num-trials 1 2>&1 | tee ./${mid}-q6-mlx-perf-8192.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-q6-mlx -p 16384 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q6-mlx-perf-16384.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-q6-mlx -p 32768 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q6-mlx-perf-32768.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_keep" == true ]]; then | ||||
|             echo "Deleting intermediate model files" | ||||
|             rm -rfv ./${mid}-q6-mlx | ||||
|         fi | ||||
|  | ||||
|         rm -rfv ./${mid}-q5-mlx | ||||
|         mlx_lm.convert --hf ./${org}/${mid} --mlx-path ./${mid}-q5-mlx --quantize --q-bits 5 | ||||
|         get_size ./${mid}-q5-mlx > ./${mid}-q5-mlx-size.txt | ||||
|  | ||||
|         if [[ "$no_ppl" == false ]]; then | ||||
|             python ./mlx-ppl.py --model ./${mid}-q5-mlx --raw-path "$raw_path" --num-samples "$num_samples" --sequence-length "$sequence_length" 2>&1 | tee ./${mid}-q5-mlx-ppl.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_perf" == false ]]; then | ||||
|             mlx_lm.benchmark --model ./${mid}-q5-mlx -p 2048  -g 128 --num-trials 1 2>&1 | tee ./${mid}-q5-mlx-perf-2048.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-q5-mlx -p 4096  -g 128 --num-trials 1 2>&1 | tee ./${mid}-q5-mlx-perf-4096.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-q5-mlx -p 8192  -g 128 --num-trials 1 2>&1 | tee ./${mid}-q5-mlx-perf-8192.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-q5-mlx -p 16384 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q5-mlx-perf-16384.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-q5-mlx -p 32768 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q5-mlx-perf-32768.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_keep" == true ]]; then | ||||
|             echo "Deleting intermediate model files" | ||||
|             rm -rfv ./${mid}-q5-mlx | ||||
|         fi | ||||
|  | ||||
|         # I think this is something similar to q4_k | ||||
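|         # (mlx-lm's mixed_4_6 predicate reportedly quantizes most tensors to 4 bits | ||||
|         # while keeping a subset at 6 bits, hence the loose analogy) | ||||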
|         rm -rfv ./${mid}-q4p-mlx | ||||
|         mlx_lm.convert --hf ./${org}/${mid} --mlx-path ./${mid}-q4p-mlx --quantize --quant-predicate mixed_4_6 | ||||
|         get_size ./${mid}-q4p-mlx > ./${mid}-q4p-mlx-size.txt | ||||
|  | ||||
|         if [[ "$no_ppl" == false ]]; then | ||||
|             python ./mlx-ppl.py --model ./${mid}-q4p-mlx --raw-path "$raw_path" --num-samples "$num_samples" --sequence-length "$sequence_length" 2>&1 | tee ./${mid}-q4p-mlx-ppl.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_perf" == false ]]; then | ||||
|             mlx_lm.benchmark --model ./${mid}-q4p-mlx -p 2048  -g 128 --num-trials 1 2>&1 | tee ./${mid}-q4p-mlx-perf-2048.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-q4p-mlx -p 4096  -g 128 --num-trials 1 2>&1 | tee ./${mid}-q4p-mlx-perf-4096.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-q4p-mlx -p 8192  -g 128 --num-trials 1 2>&1 | tee ./${mid}-q4p-mlx-perf-8192.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-q4p-mlx -p 16384 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q4p-mlx-perf-16384.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-q4p-mlx -p 32768 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q4p-mlx-perf-32768.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_keep" == true ]]; then | ||||
|             echo "Deleting intermediate model files" | ||||
|             rm -rfv ./${mid}-q4p-mlx | ||||
|         fi | ||||
|  | ||||
|         rm -rfv ./${mid}-q4-mlx | ||||
|         mlx_lm.convert --hf ./${org}/${mid} --mlx-path ./${mid}-q4-mlx --quantize --q-bits 4 | ||||
|         get_size ./${mid}-q4-mlx > ./${mid}-q4-mlx-size.txt | ||||
|  | ||||
|         if [[ "$no_ppl" == false ]]; then | ||||
|             python ./mlx-ppl.py --model ./${mid}-q4-mlx --raw-path "$raw_path" --num-samples "$num_samples" --sequence-length "$sequence_length" 2>&1 | tee ./${mid}-q4-mlx-ppl.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_perf" == false ]]; then | ||||
|             mlx_lm.benchmark --model ./${mid}-q4-mlx -p 2048  -g 128 --num-trials 1 2>&1 | tee ./${mid}-q4-mlx-perf-2048.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-q4-mlx -p 4096  -g 128 --num-trials 1 2>&1 | tee ./${mid}-q4-mlx-perf-4096.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-q4-mlx -p 8192  -g 128 --num-trials 1 2>&1 | tee ./${mid}-q4-mlx-perf-8192.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-q4-mlx -p 16384 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q4-mlx-perf-16384.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-q4-mlx -p 32768 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q4-mlx-perf-32768.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_keep" == true ]]; then | ||||
|             echo "Deleting intermediate model files" | ||||
|             rm -rfv ./${mid}-q4-mlx | ||||
|         fi | ||||
|  | ||||
|         rm -rfv ./${mid}-q3-mlx | ||||
|         mlx_lm.convert --hf ./${org}/${mid} --mlx-path ./${mid}-q3-mlx --quantize --q-bits 3 | ||||
|         get_size ./${mid}-q3-mlx > ./${mid}-q3-mlx-size.txt | ||||
|  | ||||
|         if [[ "$no_ppl" == false ]]; then | ||||
|             python ./mlx-ppl.py --model ./${mid}-q3-mlx --raw-path "$raw_path" --num-samples "$num_samples" --sequence-length "$sequence_length" 2>&1 | tee ./${mid}-q3-mlx-ppl.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_perf" == false ]]; then | ||||
|             mlx_lm.benchmark --model ./${mid}-q3-mlx -p 2048  -g 128 --num-trials 1 2>&1 | tee ./${mid}-q3-mlx-perf-2048.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-q3-mlx -p 4096  -g 128 --num-trials 1 2>&1 | tee ./${mid}-q3-mlx-perf-4096.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-q3-mlx -p 8192  -g 128 --num-trials 1 2>&1 | tee ./${mid}-q3-mlx-perf-8192.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-q3-mlx -p 16384 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q3-mlx-perf-16384.txt | ||||
|             mlx_lm.benchmark --model ./${mid}-q3-mlx -p 32768 -g 128 --num-trials 1 2>&1 | tee ./${mid}-q3-mlx-perf-32768.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_keep" == true ]]; then | ||||
|             echo "Deleting intermediate model files" | ||||
|             rm -rfv ./${mid}-q3-mlx | ||||
|         fi | ||||
|     fi | ||||
|  | ||||
|     # generate and process llama.cpp GGUF models | ||||
|  | ||||
|     if [[ "$no_compute" == true ]]; then | ||||
|         echo "Skipping computation" | ||||
|     else | ||||
|         # the F32 model is the reference - we generate all other models from it | ||||
|         mkdir -p ./${mid}-f32-gguf | ||||
|         python ${lcpp_dir}/convert_hf_to_gguf.py ./${org}/${mid} --outtype f32 --outfile ./${mid}-f32-gguf/model.gguf | ||||
|         get_size ./${mid}-f32-gguf > ./${mid}-f32-gguf-size.txt | ||||
|  | ||||
|         if [[ "$no_ppl" == false ]]; then | ||||
|             ${llama_perplexity} $args_lcpp -m ./${mid}-f32-gguf/model.gguf -f "$raw_path" --chunks "${num_samples}" -c "${sequence_length}" 2>&1 | tee ./${mid}-f32-gguf-ppl.txt | ||||
|         fi | ||||
|  | ||||
|         # no need for F32 perf benchmarks | ||||
|         #if [[ "$no_perf" == false ]]; then | ||||
|         #    ${llama_batched_bench} $args_lcpp -m ./${mid}-f32-gguf/model.gguf -c 33768 -b 2048 -ub 2048 -npp 2048,4096,8192,16384,32768 -ntg 128 -npl 1 2>&1 | tee ./${mid}-f32-gguf-perf.txt | ||||
|         #fi | ||||
|  | ||||
|         # this requires llama.cpp to be built explicitly with BF16 support | ||||
|         rm -rfv ./${mid}-bf16-gguf && mkdir -p ./${mid}-bf16-gguf | ||||
|         ${llama_quantize} ./${mid}-f32-gguf/model.gguf ./${mid}-bf16-gguf/model.gguf bf16 | ||||
|         get_size ./${mid}-bf16-gguf > ./${mid}-bf16-gguf-size.txt | ||||
|  | ||||
|         if [[ "$no_ppl" == false ]]; then | ||||
|             ${llama_perplexity} $args_lcpp -m ./${mid}-bf16-gguf/model.gguf -f "$raw_path" --chunks "${num_samples}" -c "${sequence_length}" 2>&1 | tee ./${mid}-bf16-gguf-ppl.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_perf" == false ]]; then | ||||
|             ${llama_batched_bench} $args_lcpp -m ./${mid}-bf16-gguf/model.gguf -c 33768 -b 2048 -ub 2048 -npp 2048,4096,8192,16384,32768 -ntg 128 -npl 1 2>&1 | tee ./${mid}-bf16-gguf-perf.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_keep" == true ]]; then | ||||
|             echo "Deleting intermediate model files" | ||||
|             rm -rfv ./${mid}-bf16-gguf | ||||
|         fi | ||||
|  | ||||
|         rm -rfv ./${mid}-f16-gguf && mkdir -p ./${mid}-f16-gguf | ||||
|         ${llama_quantize} ./${mid}-f32-gguf/model.gguf ./${mid}-f16-gguf/model.gguf f16 | ||||
|         get_size ./${mid}-f16-gguf > ./${mid}-f16-gguf-size.txt | ||||
|  | ||||
|         if [[ "$no_ppl" == false ]]; then | ||||
|             ${llama_perplexity} $args_lcpp -m ./${mid}-f16-gguf/model.gguf -f "$raw_path" --chunks "${num_samples}" -c "${sequence_length}" 2>&1 | tee ./${mid}-f16-gguf-ppl.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_perf" == false ]]; then | ||||
|             ${llama_batched_bench} $args_lcpp -m ./${mid}-f16-gguf/model.gguf -c 33768 -b 2048 -ub 2048 -npp 2048,4096,8192,16384,32768 -ntg 128 -npl 1 2>&1 | tee ./${mid}-f16-gguf-perf.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_keep" == true ]]; then | ||||
|             echo "Deleting intermediate model files" | ||||
|             rm -rfv ./${mid}-f16-gguf | ||||
|         fi | ||||
|  | ||||
|         rm -rfv ./${mid}-q8-gguf && mkdir -p ./${mid}-q8-gguf | ||||
|         ${llama_quantize} ./${mid}-f32-gguf/model.gguf ./${mid}-q8-gguf/model.gguf q8_0 | ||||
|         get_size ./${mid}-q8-gguf > ./${mid}-q8-gguf-size.txt | ||||
|  | ||||
|         if [[ "$no_ppl" == false ]]; then | ||||
|             ${llama_perplexity} $args_lcpp -m ./${mid}-q8-gguf/model.gguf -f "$raw_path" --chunks "${num_samples}" -c "${sequence_length}" 2>&1 | tee ./${mid}-q8-gguf-ppl.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_perf" == false ]]; then | ||||
|             ${llama_batched_bench} $args_lcpp -m ./${mid}-q8-gguf/model.gguf -c 33768 -b 2048 -ub 2048 -npp 2048,4096,8192,16384,32768 -ntg 128 -npl 1 2>&1 | tee ./${mid}-q8-gguf-perf.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_keep" == true ]]; then | ||||
|             echo "Deleting intermediate model files" | ||||
|             rm -rfv ./${mid}-q8-gguf | ||||
|         fi | ||||
|  | ||||
|         rm -rfv ./${mid}-q6-gguf && mkdir -p ./${mid}-q6-gguf | ||||
|         ${llama_quantize}  ./${mid}-f32-gguf/model.gguf ./${mid}-q6-gguf/model.gguf q6_k | ||||
|         get_size ./${mid}-q6-gguf > ./${mid}-q6-gguf-size.txt | ||||
|  | ||||
|         if [[ "$no_ppl" == false ]]; then | ||||
|             ${llama_perplexity} $args_lcpp -m ./${mid}-q6-gguf/model.gguf -f "$raw_path" --chunks "${num_samples}" -c "${sequence_length}" 2>&1 | tee ./${mid}-q6-gguf-ppl.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_perf" == false ]]; then | ||||
|             ${llama_batched_bench} $args_lcpp -m ./${mid}-q6-gguf/model.gguf -c 33768 -b 2048 -ub 2048 -npp 2048,4096,8192,16384,32768 -ntg 128 -npl 1 2>&1 | tee ./${mid}-q6-gguf-perf.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_keep" == true ]]; then | ||||
|             echo "Deleting intermediate model files" | ||||
|             rm -rfv ./${mid}-q6-gguf | ||||
|         fi | ||||
|  | ||||
|         rm -rfv ./${mid}-q5-gguf && mkdir -p ./${mid}-q5-gguf | ||||
|         ${llama_quantize}  ./${mid}-f32-gguf/model.gguf ./${mid}-q5-gguf/model.gguf q5_k_s | ||||
|         get_size ./${mid}-q5-gguf > ./${mid}-q5-gguf-size.txt | ||||
|  | ||||
|         if [[ "$no_ppl" == false ]]; then | ||||
|             ${llama_perplexity} $args_lcpp -m ./${mid}-q5-gguf/model.gguf -f "$raw_path" --chunks "${num_samples}" -c "${sequence_length}" 2>&1 | tee ./${mid}-q5-gguf-ppl.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_perf" == false ]]; then | ||||
|             ${llama_batched_bench} $args_lcpp -m ./${mid}-q5-gguf/model.gguf -c 33768 -b 2048 -ub 2048 -npp 2048,4096,8192,16384,32768 -ntg 128 -npl 1 2>&1 | tee ./${mid}-q5-gguf-perf.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_keep" == true ]]; then | ||||
|             echo "Deleting intermediate model files" | ||||
|             rm -rfv ./${mid}-q5-gguf | ||||
|         fi | ||||
|  | ||||
|         rm -rfv ./${mid}-q4p-gguf && mkdir -p ./${mid}-q4p-gguf | ||||
|         ${llama_quantize}  ./${mid}-f32-gguf/model.gguf ./${mid}-q4p-gguf/model.gguf q4_k | ||||
|         get_size ./${mid}-q4p-gguf > ./${mid}-q4p-gguf-size.txt | ||||
|  | ||||
|         if [[ "$no_ppl" == false ]]; then | ||||
|             ${llama_perplexity} $args_lcpp -m ./${mid}-q4p-gguf/model.gguf -f "$raw_path" --chunks "${num_samples}" -c "${sequence_length}" 2>&1 | tee ./${mid}-q4p-gguf-ppl.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_perf" == false ]]; then | ||||
|             ${llama_batched_bench} $args_lcpp -m ./${mid}-q4p-gguf/model.gguf -c 33768 -b 2048 -ub 2048 -npp 2048,4096,8192,16384,32768 -ntg 128 -npl 1 2>&1 | tee ./${mid}-q4p-gguf-perf.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_keep" == true ]]; then | ||||
|             echo "Deleting intermediate model files" | ||||
|             rm -rfv ./${mid}-q4p-gguf | ||||
|         fi | ||||
|  | ||||
|         # note: we use --pure here to match the MLX quantization of the embeddings | ||||
|         rm -rfv ./${mid}-q4-gguf && mkdir -p ./${mid}-q4-gguf | ||||
|         ${llama_quantize} --pure ./${mid}-f32-gguf/model.gguf ./${mid}-q4-gguf/model.gguf q4_0 | ||||
|         get_size ./${mid}-q4-gguf > ./${mid}-q4-gguf-size.txt | ||||
|  | ||||
|         if [[ "$no_ppl" == false ]]; then | ||||
|             ${llama_perplexity} $args_lcpp -m ./${mid}-q4-gguf/model.gguf -f "$raw_path" --chunks "${num_samples}" -c "${sequence_length}" 2>&1 | tee ./${mid}-q4-gguf-ppl.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_perf" == false ]]; then | ||||
|             ${llama_batched_bench} $args_lcpp -m ./${mid}-q4-gguf/model.gguf -c 33768 -b 2048 -ub 2048 -npp 2048,4096,8192,16384,32768 -ntg 128 -npl 1 2>&1 | tee ./${mid}-q4-gguf-perf.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_keep" == true ]]; then | ||||
|             echo "Deleting intermediate model files" | ||||
|             rm -rfv ./${mid}-q4-gguf | ||||
|         fi | ||||
|  | ||||
|         rm -rfv ./${mid}-q3-gguf && mkdir -p ./${mid}-q3-gguf | ||||
|         ${llama_quantize}  ./${mid}-f32-gguf/model.gguf ./${mid}-q3-gguf/model.gguf q3_k_s | ||||
|         get_size ./${mid}-q3-gguf > ./${mid}-q3-gguf-size.txt | ||||
|  | ||||
|         if [[ "$no_ppl" == false ]]; then | ||||
|             ${llama_perplexity} $args_lcpp -m ./${mid}-q3-gguf/model.gguf -f "$raw_path" --chunks "${num_samples}" -c "${sequence_length}" 2>&1 | tee ./${mid}-q3-gguf-ppl.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_perf" == false ]]; then | ||||
|             ${llama_batched_bench} $args_lcpp -m ./${mid}-q3-gguf/model.gguf -c 33768 -b 2048 -ub 2048 -npp 2048,4096,8192,16384,32768 -ntg 128 -npl 1 2>&1 | tee ./${mid}-q3-gguf-perf.txt | ||||
|         fi | ||||
|  | ||||
|         if [[ "$no_keep" == true ]]; then | ||||
|             echo "Deleting intermediate model files" | ||||
|             rm -rfv ./${mid}-q3-gguf | ||||
|         fi | ||||
|  | ||||
|         # remove the f32 model at the end | ||||
|         if [[ "$no_keep" == true ]]; then | ||||
|             rm -rfv ./${mid}-f32-gguf | ||||
|         fi | ||||
|     fi | ||||
|  | ||||
|     set +x | ||||
|  | ||||
|     # analyze results | ||||
|  | ||||
|     types=("f32" "bf16" "f16" "q8" "q6" "q5" "q4p" "q4" "q3") | ||||
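|     # labels map to the MLX conversions above and, on the GGUF side, to | ||||
|     # bf16 / f16 / q8_0 / q6_k / q5_k_s / q4_k (q4p) / q4_0 (q4) / q3_k_s | ||||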
|  | ||||
|     mlx_ppls=() | ||||
|     mlx_ppl_deltas=() | ||||
|     mlx_sizes=() | ||||
|     mlx_pps2k=() | ||||
|     mlx_tgs2k=() | ||||
|     mlx_pps4k=() | ||||
|     mlx_tgs4k=() | ||||
|     mlx_pps8k=() | ||||
|     mlx_tgs8k=() | ||||
|     mlx_pps16k=() | ||||
|     mlx_tgs16k=() | ||||
|     mlx_pps32k=() | ||||
|     mlx_tgs32k=() | ||||
|  | ||||
|     # mlx: | ||||
|     for t in ${types[*]}; do | ||||
|         cur_ppl="N/A" | ||||
|         cur_ppl_delta="N/A" | ||||
|         cur_size="N/A" | ||||
|         cur_pp2k="N/A" | ||||
|         cur_tg2k="N/A" | ||||
|         cur_pp4k="N/A" | ||||
|         cur_tg4k="N/A" | ||||
|         cur_pp8k="N/A" | ||||
|         cur_tg8k="N/A" | ||||
|         cur_pp16k="N/A" | ||||
|         cur_tg16k="N/A" | ||||
|         cur_pp32k="N/A" | ||||
|         cur_tg32k="N/A" | ||||
|  | ||||
|         if [[ -f ./${mid}-${t}-mlx-ppl.txt ]]; then | ||||
|             cur_ppl=$(grep -o 'Perplexity: [0-9.]*' ./${mid}-${t}-mlx-ppl.txt | cut -d' ' -f2) | ||||
|             cur_ppl_delta=$(grep -o 'Perplexity: [0-9.]* ± [0-9.]*' ./${mid}-${t}-mlx-ppl.txt | cut -d' ' -f4) | ||||
|             cur_size=$(cat ./${mid}-${t}-mlx-size.txt) | ||||
|             cur_pp2k=$(grep -o 'Averages.*prompt_tps=[0-9.]*' ./${mid}-${t}-mlx-perf-2048.txt | cut -d'=' -f2) | ||||
|             cur_tg2k=$(grep -o 'Averages.*generation_tps=[0-9.]*' ./${mid}-${t}-mlx-perf-2048.txt | cut -d'=' -f3) | ||||
|             cur_pp4k=$(grep -o 'Averages.*prompt_tps=[0-9.]*' ./${mid}-${t}-mlx-perf-4096.txt | cut -d'=' -f2) | ||||
|             cur_tg4k=$(grep -o 'Averages.*generation_tps=[0-9.]*' ./${mid}-${t}-mlx-perf-4096.txt | cut -d'=' -f3) | ||||
|             cur_pp8k=$(grep -o 'Averages.*prompt_tps=[0-9.]*' ./${mid}-${t}-mlx-perf-8192.txt | cut -d'=' -f2) | ||||
|             cur_tg8k=$(grep -o 'Averages.*generation_tps=[0-9.]*' ./${mid}-${t}-mlx-perf-8192.txt | cut -d'=' -f3) | ||||
|             cur_pp16k=$(grep -o 'Averages.*prompt_tps=[0-9.]*' ./${mid}-${t}-mlx-perf-16384.txt | cut -d'=' -f2) | ||||
|             cur_tg16k=$(grep -o 'Averages.*generation_tps=[0-9.]*' ./${mid}-${t}-mlx-perf-16384.txt | cut -d'=' -f3) | ||||
|             cur_pp32k=$(grep -o 'Averages.*prompt_tps=[0-9.]*' ./${mid}-${t}-mlx-perf-32768.txt | cut -d'=' -f2) | ||||
|             cur_tg32k=$(grep -o 'Averages.*generation_tps=[0-9.]*' ./${mid}-${t}-mlx-perf-32768.txt | cut -d'=' -f3) | ||||
|         fi | ||||
|  | ||||
|         mlx_ppls+=("${cur_ppl}") | ||||
|         mlx_ppl_deltas+=("${cur_ppl_delta}") | ||||
|         mlx_sizes+=("${cur_size}") | ||||
|         mlx_pps2k+=("${cur_pp2k}") | ||||
|         mlx_tgs2k+=("${cur_tg2k}") | ||||
|         mlx_pps4k+=("${cur_pp4k}") | ||||
|         mlx_tgs4k+=("${cur_tg4k}") | ||||
|         mlx_pps8k+=("${cur_pp8k}") | ||||
|         mlx_tgs8k+=("${cur_tg8k}") | ||||
|         mlx_pps16k+=("${cur_pp16k}") | ||||
|         mlx_tgs16k+=("${cur_tg16k}") | ||||
|         mlx_pps32k+=("${cur_pp32k}") | ||||
|         mlx_tgs32k+=("${cur_tg32k}") | ||||
|     done | ||||
|  | ||||
|     gguf_ppls=() | ||||
|     gguf_ppl_deltas=() | ||||
|     gguf_sizes=() | ||||
|     gguf_pps2k=() | ||||
|     gguf_tgs2k=() | ||||
|     gguf_pps4k=() | ||||
|     gguf_tgs4k=() | ||||
|     gguf_pps8k=() | ||||
|     gguf_tgs8k=() | ||||
|     gguf_pps16k=() | ||||
|     gguf_tgs16k=() | ||||
|     gguf_pps32k=() | ||||
|     gguf_tgs32k=() | ||||
|  | ||||
|     # gguf: | ||||
|     for t in ${types[*]}; do | ||||
|         cur_ppl="N/A" | ||||
|         cur_ppl_delta="N/A" | ||||
|         cur_size="N/A" | ||||
|         cur_pp2k="N/A" | ||||
|         cur_tg2k="N/A" | ||||
|         cur_pp4k="N/A" | ||||
|         cur_tg4k="N/A" | ||||
|         cur_pp8k="N/A" | ||||
|         cur_tg8k="N/A" | ||||
|         cur_pp16k="N/A" | ||||
|         cur_tg16k="N/A" | ||||
|         cur_pp32k="N/A" | ||||
|         cur_tg32k="N/A" | ||||
|  | ||||
|         if [[ -f ./${mid}-${t}-gguf-ppl.txt ]]; then | ||||
|             cur_ppl=$(grep -o 'Final estimate: PPL = [0-9.]*' ./${mid}-${t}-gguf-ppl.txt | sed -e "s/.*Final//" | cut -d' ' -f5) | ||||
|             cur_ppl_delta=$(grep -o 'Final estimate: PPL = [0-9.]* +/- [0-9.]*' ./${mid}-${t}-gguf-ppl.txt | sed -e "s/.*Final//" | cut -d' ' -f7) | ||||
|             cur_size=$(cat ./${mid}-${t}-gguf-size.txt) | ||||
|             cur_pp2k=$(grep  -o '|  2048 |.*' ./${mid}-${t}-gguf-perf.txt | awk '{print $12}') | ||||
|             cur_tg2k=$(grep  -o '|  2048 |.*' ./${mid}-${t}-gguf-perf.txt | awk '{print $16}') | ||||
|             cur_pp4k=$(grep  -o '|  4096 |.*' ./${mid}-${t}-gguf-perf.txt | awk '{print $12}') | ||||
|             cur_tg4k=$(grep  -o '|  4096 |.*' ./${mid}-${t}-gguf-perf.txt | awk '{print $16}') | ||||
|             cur_pp8k=$(grep  -o '|  8192 |.*' ./${mid}-${t}-gguf-perf.txt | awk '{print $12}') | ||||
|             cur_tg8k=$(grep  -o '|  8192 |.*' ./${mid}-${t}-gguf-perf.txt | awk '{print $16}') | ||||
|             cur_pp16k=$(grep -o '| 16384 |.*' ./${mid}-${t}-gguf-perf.txt | awk '{print $12}') | ||||
|             cur_tg16k=$(grep -o '| 16384 |.*' ./${mid}-${t}-gguf-perf.txt | awk '{print $16}') | ||||
|             cur_pp32k=$(grep -o '| 32768 |.*' ./${mid}-${t}-gguf-perf.txt | awk '{print $12}') | ||||
|             cur_tg32k=$(grep -o '| 32768 |.*' ./${mid}-${t}-gguf-perf.txt | awk '{print $16}') | ||||
|         fi | ||||
|  | ||||
|         gguf_ppls+=("${cur_ppl}") | ||||
|         gguf_ppl_deltas+=("${cur_ppl_delta}") | ||||
|         gguf_sizes+=("${cur_size}") | ||||
|         gguf_pps2k+=("${cur_pp2k}") | ||||
|         gguf_tgs2k+=("${cur_tg2k}") | ||||
|         gguf_pps4k+=("${cur_pp4k}") | ||||
|         gguf_tgs4k+=("${cur_tg4k}") | ||||
|         gguf_pps8k+=("${cur_pp8k}") | ||||
|         gguf_tgs8k+=("${cur_tg8k}") | ||||
|         gguf_pps16k+=("${cur_pp16k}") | ||||
|         gguf_tgs16k+=("${cur_tg16k}") | ||||
|         gguf_pps32k+=("${cur_pp32k}") | ||||
|         gguf_tgs32k+=("${cur_tg32k}") | ||||
|     done | ||||
|  | ||||
|     res="${mid}-results.txt" | ||||
|     echo "Results for ${org}/${mid} saved to ${res}" | ||||
|  | ||||
|     printf "\n" | tee ${res} | ||||
|     printf "Model ID:        ${org}/${mid}\n" | tee -a ${res} | ||||
|     #printf "Samples:         ${num_samples}\n" | tee -a ${res} | ||||
|     #printf "Sequence Length: ${sequence_length}\n" | tee -a ${res} | ||||
|     printf "\n" | tee -a ${res} | ||||
|     printf "| Type  | MLX PPL             | GGUF PPL               | MLX Size | GGUF Size | MLX PP  2K | GGUF PP  2K | MLX TG  2K | GGUF TG  2K | MLX PP  4K | GGUF PP  4K | MLX TG  4K | GGUF TG  4K | MLX PP  8K | GGUF PP  8K | MLX TG  8K | GGUF TG  8K | MLX PP 16K | GGUF PP 16K | MLX TG 16K | GGUF TG 16K | MLX PP 32K | GGUF PP 32K | MLX TG 32K | GGUF TG 32K |\n" | tee -a ${res} | ||||
|     printf "|-------|---------------------|------------------------|----------|-----------| ---------- | ----------- | ---------- | ----------- | ---------- | ----------- | ---------- | ----------- | ---------- | ----------- | ---------- | ----------- | ---------- | ----------- | ---------- | ----------- | ---------- | ----------- | ---------- | ----------- |\n" | tee -a ${res} | ||||
|  | ||||
|     for i in "${!types[@]}"; do | ||||
|         printf "| %-5s | %10s ± %6s | %10s ± %9s | %8s | %9s | %10s | %11s | %10s | %11s | %10s | %11s | %10s | %11s | %10s | %11s | %10s | %11s | %10s | %11s | %10s | %11s | %10s | %11s | %10s | %11s |\n" \ | ||||
|             "${types[i]}" \ | ||||
|             "${mlx_ppls[i]}" \ | ||||
|             "${mlx_ppl_deltas[i]}" \ | ||||
|             "${gguf_ppls[i]}" \ | ||||
|             "${gguf_ppl_deltas[i]}" \ | ||||
|             "${mlx_sizes[i]}" \ | ||||
|             "${gguf_sizes[i]}" \ | ||||
|             "${mlx_pps2k[i]}" \ | ||||
|             "${gguf_pps2k[i]}" \ | ||||
|             "${mlx_tgs2k[i]}" \ | ||||
|             "${gguf_tgs2k[i]}" \ | ||||
|             "${mlx_pps4k[i]}" \ | ||||
|             "${gguf_pps4k[i]}" \ | ||||
|             "${mlx_tgs4k[i]}" \ | ||||
|             "${gguf_tgs4k[i]}" \ | ||||
|             "${mlx_pps8k[i]}" \ | ||||
|             "${gguf_pps8k[i]}" \ | ||||
|             "${mlx_tgs8k[i]}" \ | ||||
|             "${gguf_tgs8k[i]}" \ | ||||
|             "${mlx_pps16k[i]}" \ | ||||
|             "${gguf_pps16k[i]}" \ | ||||
|             "${mlx_tgs16k[i]}" \ | ||||
|             "${gguf_tgs16k[i]}" \ | ||||
|             "${mlx_pps32k[i]}" \ | ||||
|             "${gguf_pps32k[i]}" \ | ||||
|             "${mlx_tgs32k[i]}" \ | ||||
|             "${gguf_tgs32k[i]}" | tee -a ${res} | ||||
|     done | ||||
| } | ||||
|  | ||||
| eval_model "meta-llama" "Llama-3.2-1B" | ||||
| eval_model "meta-llama" "Llama-3.2-3B" | ||||
| eval_model "meta-llama" "Llama-3.1-8B" | ||||
|  | ||||
| eval_model "google" "gemma-3-270m" | ||||
| eval_model "google" "gemma-3-1b-pt" | ||||
| eval_model "google" "gemma-3-4b-pt" | ||||
|  | ||||
| # the mlx-ppl.py script does not work with these models - not sure why | ||||
| #eval_model "google" "gemma-3n-E2B" | ||||
| #eval_model "google" "gemma-3n-E4B" | ||||
|  | ||||
| eval_model "Qwen" "Qwen3-0.6B-Base" | ||||
| eval_model "Qwen" "Qwen3-1.7B-Base" | ||||
| eval_model "Qwen" "Qwen3-4B-Base" | ||||
| eval_model "Qwen" "Qwen3-8B-Base" | ||||
| eval_model "Qwen" "Qwen3-30B-A3B-Base" | ||||
							
								
								
									
examples/compare-mlx/inspect_model.py (new file, +120)
							| @@ -0,0 +1,120 @@ | ||||
| #!/usr/bin/env python3 | ||||
| # generated by Claude | ||||
| """ | ||||
| Script to inspect SafeTensors model files and print tensor information. | ||||
| """ | ||||
|  | ||||
| import json | ||||
| from safetensors import safe_open | ||||
| import os | ||||
| from pathlib import Path | ||||
|  | ||||
| def inspect_safetensors_model(model_dir="."): | ||||
|     """Inspect all SafeTensors files in the model directory.""" | ||||
|  | ||||
|     # First, let's read the index file to see the file structure | ||||
|     index_file = Path(model_dir) / "model.safetensors.index.json" | ||||
|  | ||||
|     if index_file.exists(): | ||||
|         with open(index_file, 'r') as f: | ||||
|             index_data = json.load(f) | ||||
|  | ||||
|         print("=== Model Structure ===") | ||||
|         print(f"Total parameters: {index_data.get('metadata', {}).get('total_size', 'Unknown')}") | ||||
|         print() | ||||
|  | ||||
|         # Get all safetensor files | ||||
|         safetensor_files = set(index_data.get('weight_map', {}).values()) | ||||
|     else: | ||||
|         # If no index file, look for safetensor files directly | ||||
|         safetensor_files = [f for f in os.listdir(model_dir) if f.endswith('.safetensors')] | ||||
|  | ||||
|     # Sort files for consistent output | ||||
|     safetensor_files = sorted(safetensor_files) | ||||
|  | ||||
|     print("=== Tensor Information ===") | ||||
|     print(f"{'Tensor Name':<50} {'Shape':<25} {'Data Type':<15} {'File'}") | ||||
|     print("-" * 110) | ||||
|  | ||||
|     total_tensors = 0 | ||||
|  | ||||
|     for filename in safetensor_files: | ||||
|         filepath = Path(model_dir) / filename | ||||
|         if not filepath.exists(): | ||||
|             continue | ||||
|  | ||||
|         print(f"\n--- {filename} ---") | ||||
|  | ||||
|         # Open and inspect the safetensor file | ||||
|         with safe_open(filepath, framework="pt") as f:  # Use PyTorch framework for better dtype support | ||||
|             tensor_names = f.keys() | ||||
|  | ||||
|             for tensor_name in sorted(tensor_names): | ||||
|                 # Get tensor metadata without loading the full tensor | ||||
|                 tensor_slice = f.get_slice(tensor_name) | ||||
|                 shape = tensor_slice.get_shape() | ||||
|                 dtype = tensor_slice.get_dtype() | ||||
|  | ||||
|                 shape_str = str(tuple(shape)) | ||||
|                 dtype_str = str(dtype) | ||||
|  | ||||
|                 print(f"{tensor_name:<50} {shape_str:<25} {dtype_str:<15} {filename}") | ||||
|                 total_tensors += 1 | ||||
|  | ||||
|     print(f"\nTotal tensors found: {total_tensors}") | ||||
|  | ||||
| def main(): | ||||
|     import argparse | ||||
|  | ||||
|     parser = argparse.ArgumentParser(description="Inspect SafeTensors model files") | ||||
|     parser.add_argument("--model-dir", "-d", default=".", | ||||
|                        help="Directory containing the model files (default: current directory)") | ||||
|     parser.add_argument("--summary", "-s", action="store_true", | ||||
|                        help="Show only summary statistics") | ||||
|  | ||||
|     args = parser.parse_args() | ||||
|  | ||||
|     if args.summary: | ||||
|         print_summary_only(args.model_dir) | ||||
|     else: | ||||
|         inspect_safetensors_model(args.model_dir) | ||||
|  | ||||
| def print_summary_only(model_dir="."): | ||||
|     """Print only summary statistics.""" | ||||
|     safetensor_files = [f for f in os.listdir(model_dir) if f.endswith('.safetensors')] | ||||
|  | ||||
|     total_tensors = 0 | ||||
|     dtype_counts = {} | ||||
|     total_params = 0 | ||||
|  | ||||
|     for filename in sorted(safetensor_files): | ||||
|         filepath = Path(model_dir) / filename | ||||
|         if not filepath.exists(): | ||||
|             continue | ||||
|  | ||||
|         with safe_open(filepath, framework="pt") as f:  # Use PyTorch framework | ||||
|             for tensor_name in f.keys(): | ||||
|                 tensor_slice = f.get_slice(tensor_name) | ||||
|                 shape = tensor_slice.get_shape() | ||||
|                 dtype = tensor_slice.get_dtype() | ||||
|  | ||||
|                 total_tensors += 1 | ||||
|  | ||||
|                 dtype_str = str(dtype) | ||||
|                 dtype_counts[dtype_str] = dtype_counts.get(dtype_str, 0) + 1 | ||||
|  | ||||
|                 # Calculate parameter count | ||||
|                 param_count = 1 | ||||
|                 for dim in shape: | ||||
|                     param_count *= dim | ||||
|                 total_params += param_count | ||||
|  | ||||
|     print("=== Model Summary ===") | ||||
|     print(f"Total tensors: {total_tensors}") | ||||
|     print(f"Total parameters: {total_params:,}") | ||||
|     print(f"Data type distribution:") | ||||
|     for dtype, count in sorted(dtype_counts.items()): | ||||
|         print(f"  {dtype}: {count} tensors") | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     main() | ||||
							
								
								
									
examples/compare-mlx/mlx-ppl.py (new file, +305)
							| @@ -0,0 +1,305 @@ | ||||
| # Copyright © 2025 Apple Inc. | ||||
| # modified: https://github.com/ml-explore/mlx-lm/blob/60320dc2347d45dc3ca08be90e5255fb9424bb09/mlx_lm/perplexity.py | ||||
| """ | ||||
| Evaluate perplexity (PPL) of pre-trained MLX models in the same way as llama.cpp's llama-perplexity. | ||||
| """ | ||||
|  | ||||
| import argparse | ||||
| import math | ||||
| import os | ||||
| import time | ||||
| import types | ||||
|  | ||||
| import mlx.core as mx | ||||
| import mlx.nn as nn | ||||
| import numpy as np | ||||
|  | ||||
| from mlx_lm.tuner.datasets import load_dataset | ||||
| from mlx_lm.tuner.utils import get_total_parameters | ||||
| from mlx_lm.utils import load | ||||
|  | ||||
|  | ||||
| def load_data( | ||||
|     tokenizer, | ||||
|     data_path: str, | ||||
|     num_samples: int, | ||||
|     sequence_length: int, | ||||
| ): | ||||
|     """ | ||||
|     Load a Hugging‑Face dataset (via mlx‑lm’s dataset utilities) and convert it | ||||
|     into a token tensor of shape (N, sequence_length). | ||||
|     """ | ||||
|     args = types.SimpleNamespace( | ||||
|         hf_dataset={ | ||||
|             "path": data_path, | ||||
|             "train_split": "train", | ||||
|             "valid_split": "train[:1]", | ||||
|         }, | ||||
|         train=True, | ||||
|         test=False, | ||||
|     ) | ||||
|     dataset = load_dataset(args, tokenizer)[0] | ||||
|  | ||||
|     perm = np.random.permutation(len(dataset)).tolist() | ||||
|  | ||||
|     num_tokens = sequence_length * num_samples if num_samples > 0 else float("inf") | ||||
|     data = [] | ||||
|     i = 0 | ||||
|     while len(data) < num_tokens: | ||||
|         tokens, _ = dataset.process(dataset[perm[i]]) | ||||
|         i += 1 | ||||
|         data.extend(tokens) | ||||
|  | ||||
|     # Convert to MX array, truncate to a multiple of `sequence_length` | ||||
|     data = mx.array(data[: (len(data) // sequence_length) * sequence_length]) | ||||
|     data = data.reshape(-1, sequence_length) | ||||
|     if num_samples > 0: | ||||
|         data = data[:num_samples] | ||||
|     return data | ||||
|  | ||||
|  | ||||
| def _tokenize_text(tokenizer, text: str): | ||||
|     """ | ||||
|     Helper that tokenises a string using the MLX‑LM tokenizer. | ||||
|     Supports the common `encode` method or a callable tokenizer. | ||||
|     """ | ||||
|     # Most mlx‑lm tokenizers expose an `encode` method. | ||||
|     if hasattr(tokenizer, "encode"): | ||||
|         tokens = tokenizer.encode(text) | ||||
|     elif callable(tokenizer): | ||||
|         tokens = tokenizer(text) | ||||
|     else: | ||||
|         raise AttributeError( | ||||
|             "Tokenizer does not have an `encode` method nor is it callable." | ||||
|         ) | ||||
|     # Normalise the output to a Python list of ints. | ||||
|     if isinstance(tokens, mx.array): | ||||
|         tokens = tokens.tolist() | ||||
|     return tokens | ||||
|  | ||||
|  | ||||
| # load a raw text file and tokenize it | ||||
| # generated with gpt-oss-120b | ||||
| def load_raw_data( | ||||
|     tokenizer, | ||||
|     raw_path: str, | ||||
|     num_samples: int, | ||||
|     sequence_length: int, | ||||
| ): | ||||
|     """ | ||||
|     Load a raw text file, tokenize it, and reshape into a (N, sequence_length) | ||||
|     tensor suitable for perplexity evaluation. | ||||
|     """ | ||||
|     if not os.path.isfile(raw_path): | ||||
|         raise FileNotFoundError(f"Raw text file not found: {raw_path}") | ||||
|  | ||||
|     # Read the whole file (UTF‑8).  Users can supply any plain‑text corpus. | ||||
|     with open(raw_path, "r", encoding="utf-8") as fp: | ||||
|         raw_text = fp.read() | ||||
|  | ||||
|     # Tokenise the complete text. | ||||
|     token_list = _tokenize_text(tokenizer, raw_text) | ||||
|  | ||||
|     if len(token_list) == 0: | ||||
|         raise ValueError("Tokenisation of the raw file produced no tokens.") | ||||
|  | ||||
|     # Convert to MX array (int32 is sufficient for token IDs). | ||||
|     token_array = mx.array(token_list, dtype=mx.int32) | ||||
|  | ||||
|     # Trim to a length that is an exact multiple of `sequence_length`. | ||||
|     total_len = (token_array.shape[0] // sequence_length) * sequence_length | ||||
|     token_array = token_array[:total_len] | ||||
|  | ||||
|     # Reshape into (num_sequences, sequence_length) | ||||
|     data = token_array.reshape(-1, sequence_length) | ||||
|  | ||||
|     if num_samples > 0: | ||||
|         data = data[:num_samples] | ||||
|  | ||||
|     #print(f"First 4 samples of the data:") | ||||
|     #for j in range(min(4, len(data))): | ||||
|     #    print(f"  Sample {j}: {tokenizer.decode(data[j].tolist())}\n\n-------------------\n\n") | ||||
|  | ||||
|     return data | ||||
|  | ||||
|  | ||||
| def eval_ppl(model, tokenizer, data, batch_size=8): | ||||
|     """ | ||||
|     Evaluate perplexity on a dataset with standard error calculation. | ||||
|  | ||||
|     Args: | ||||
|         model: The model to evaluate. | ||||
|         data: Tokenized data tensor (shape: N x L). | ||||
|         batch_size: Batch size for evaluation. | ||||
|  | ||||
|     Returns: | ||||
|         tuple: (perplexity, standard_error_of_perplexity) | ||||
|     """ | ||||
|     all_losses = [] | ||||
|  | ||||
|     num_batches = (len(data) + batch_size - 1) // batch_size | ||||
|     for i, s in enumerate(range(0, len(data), batch_size)): | ||||
|         batch = data[s : s + batch_size] | ||||
|  | ||||
|         # Set the first token of all samples to the BOS token | ||||
|         if tokenizer.bos_token_id is not None:  # BOS id may be 0, which is falsy | ||||
|             batch[:, 0] = tokenizer.bos_token_id | ||||
|  | ||||
|         # compute cross entropy only with the second half of the sequence to match llama.cpp behavior | ||||
|         # ref: https://github.com/ggml-org/llama.cpp/blob/696fccf354e9dbdfbce135bc40b44c9dcc64dda9/tools/perplexity/perplexity.cpp#L527-L541 | ||||
|         # | ||||
|         #start = 0 | ||||
|         start = batch.shape[1] // 2 | ||||
|  | ||||
|         # Forward pass: get logits for all tokens except last | ||||
|         logits = model(batch[:, :-1]).astype(mx.float32) | ||||
|  | ||||
|         # Calculate cross‑entropy loss with next tokens | ||||
|         #losses = nn.losses.cross_entropy(logits, batch[:, 1:], reduction="none") | ||||
|         losses = nn.losses.cross_entropy(logits[:, start:, :], batch[:, start+1:], reduction="none") | ||||
|  | ||||
|         mx.eval(losses) | ||||
|         # Store individual token losses | ||||
|         all_losses.append(losses.flatten()) | ||||
|  | ||||
|         # Progress indicator (printed after every batch) | ||||
|         print(f"  Processed {i + 1}/{num_batches} batches...", end="\r") | ||||
|  | ||||
|     print()  # New line after progress | ||||
|  | ||||
|     # Concatenate all losses into a single array | ||||
|     all_losses = mx.concatenate(all_losses) | ||||
|  | ||||
|     # Calculate mean loss and perplexity | ||||
|     mean_loss = all_losses.mean().item() | ||||
|     ppl = math.exp(mean_loss) | ||||
|  | ||||
|     # Calculate standard error | ||||
|     std_dev = mx.sqrt(mx.var(all_losses, ddof=1)).item() | ||||
|     num_tokens = all_losses.size | ||||
|     standard_error = std_dev / math.sqrt(num_tokens) | ||||
|  | ||||
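|     # With m the mean loss, ppl = exp(m); by the delta method, | ||||
|     # SE(ppl) ≈ exp(m) * SE(m) = ppl * SE(m). | ||||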
|     # Delta approximation for standard error of perplexity | ||||
|     standard_error_ppl = ppl * standard_error | ||||
|  | ||||
|     return ppl, standard_error_ppl | ||||
|  | ||||
|  | ||||
| def main(): | ||||
|     parser = argparse.ArgumentParser(description="Evaluate perplexity of MLX models") | ||||
|     parser.add_argument( | ||||
|         "--model", | ||||
|         type=str, | ||||
|         required=True, | ||||
|         help="Path to model or Hugging Face model ID", | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "--batch-size", type=int, default=8, help="Batch size for evaluation" | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "--sequence-length", | ||||
|         type=int, | ||||
|         default=512, | ||||
|         help="Sequence length for evaluation", | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "--num-samples", | ||||
|         type=int, | ||||
|         default=256, | ||||
|         help="Number of samples to use (-1 for all available)", | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "--data-path", | ||||
|         type=str, | ||||
|         default="allenai/tulu-3-sft-mixture", | ||||
|         help=( | ||||
|             "A Hugging Face dataset compatible with mlx‑lm. " | ||||
|             "Ignored if --raw-path is provided." | ||||
|         ), | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "--raw-path", | ||||
|         type=str, | ||||
|         default=None, | ||||
|         help=( | ||||
|             "Path to a local raw‑text file to use for evaluation. " | ||||
|             "If specified, the script skips loading a HF dataset." | ||||
|         ), | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "--seed", type=int, default=123, help="Random seed for data sampling" | ||||
|     ) | ||||
|  | ||||
|     args = parser.parse_args() | ||||
|  | ||||
|     # Set random seed (used for HF dataset shuffling) | ||||
|     mx.random.seed(args.seed) | ||||
|  | ||||
|     # Load model | ||||
|     print(f"Loading model from {args.model}...") | ||||
|     model, tokenizer = load(args.model) | ||||
|  | ||||
|     # Count parameters | ||||
|     total_params = get_total_parameters(model) | ||||
|     print(f"Model loaded: {total_params/1e6:.1f}M parameters") | ||||
|  | ||||
|     # ---------------------------------------------------------------------- | ||||
|     # Load evaluation data (raw file vs. HF dataset) | ||||
|     # ---------------------------------------------------------------------- | ||||
|     print("\nLoading dataset...") | ||||
|     print(f"  Sequence length: {args.sequence_length}") | ||||
|  | ||||
|     if args.raw_path: | ||||
|         print(f"  Using raw text file: {args.raw_path}") | ||||
|         data = load_raw_data( | ||||
|             tokenizer, | ||||
|             raw_path=args.raw_path, | ||||
|             num_samples=args.num_samples, | ||||
|             sequence_length=args.sequence_length, | ||||
|         ) | ||||
|     else: | ||||
|         print(f"  Using HF dataset: {args.data_path}") | ||||
|         data = load_data( | ||||
|             tokenizer, | ||||
|             data_path=args.data_path, | ||||
|             num_samples=args.num_samples, | ||||
|             sequence_length=args.sequence_length, | ||||
|         ) | ||||
|  | ||||
|     print(f"  Loaded {len(data)} samples") | ||||
|  | ||||
|     # ---------------------------------------------------------------------- | ||||
|     # Evaluate perplexity | ||||
|     # ---------------------------------------------------------------------- | ||||
|     print(f"\nEvaluating perplexity with batch size {args.batch_size}...") | ||||
|     start_time = time.time() | ||||
|  | ||||
|     ppl, se = eval_ppl(model, tokenizer, data, batch_size=args.batch_size) | ||||
|  | ||||
|     eval_time = time.time() - start_time | ||||
|     tokens_evaluated = data.shape[0] * (data.shape[1] - 1)  # B * (L - 1) | ||||
|  | ||||
|     # Print results | ||||
|     print("\n" + "=" * 60) | ||||
|     print("EVALUATION RESULTS") | ||||
|     print("=" * 60) | ||||
|     print(f"Model: {args.model}") | ||||
|     print(f"Perplexity: {ppl:.3f} ± {se:.3f}") | ||||
|     print(f"Evaluation time: {eval_time:.2f} seconds") | ||||
|     print(f"Peak memory: {mx.get_peak_memory() / 1e9:.2f} GB") | ||||
|     print(f"Tokens per second: {tokens_evaluated / eval_time:.0f}") | ||||
|  | ||||
|     # Additional statistics | ||||
|     print(f"\nDataset statistics:") | ||||
|     print(f"  Total samples: {len(data)}") | ||||
|     print(f"  Total tokens: {data.size}") | ||||
|  | ||||
|     # ---------------------------------------------------------------------- | ||||
|     # Done | ||||
|     # ---------------------------------------------------------------------- | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     main() | ||||
|  | ||||