	model-conversion : add qat-q4 quantization targets (#15588)
This commit adds two Makefile targets for quantizing Quantization Aware Trained (QAT) models to the Q4_0 format. The motivation is that these targets set the token embedding and output tensor data types to Q8_0 instead of the default Q6_K. This is something we wish to enforce for QAT Q4_0 models that are to be uploaded to ggml-org on Hugging Face, to guarantee the best quality.
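For reference, a minimal usage sketch of the new targets; the model path is a placeholder, and `CONVERTED_MODEL` must point to an already converted GGUF file:

```console
(venv) $ export CONVERTED_MODEL=/path/to/model-f16.gguf
(venv) $ make causal-quantize-qat-Q4_0
```

The embedding counterpart is invoked the same way via `embedding-quantize-qat-Q4_0`.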
Makefile:

````diff
@@ -1,4 +1,5 @@
-# Validation functions
+MAKEFLAGS += --no-print-directory
+
 define validate_model_path
 	@if [ -z "$(MODEL_PATH)" ]; then \
 		echo "Error: MODEL_PATH must be provided either as:"; \
@@ -17,6 +18,13 @@ define validate_embedding_model_path
 	fi
 endef
 
+define quantize_model
+	@CONVERTED_MODEL="$(1)" QUANTIZED_TYPE="$(QUANTIZED_TYPE)" \
+	TOKEN_EMBD_TYPE="$(TOKEN_EMBD_TYPE)" OUTPUT_TYPE="$(OUTPUT_TYPE)" \
+	./scripts/utils/quantize.sh "$(1)" "$(QUANTIZED_TYPE)" "$(TOKEN_EMBD_TYPE)" "$(OUTPUT_TYPE)"
+	@echo "Export the quantized model path to $(2) variable in your environment"
+endef
+
 ###
 ### Causal Model targets/recipes
 ###
@@ -67,9 +75,15 @@ causal-quantize-Q8_0: causal-quantize-model
 causal-quantize-Q4_0: QUANTIZED_TYPE = Q4_0
 causal-quantize-Q4_0: causal-quantize-model
 
+# For Quantization Aware Trained (QAT) models in Q4_0 we explicitly set the
+# token embedding and output types to Q8_0 instead of the default Q6_K.
+causal-quantize-qat-Q4_0: QUANTIZED_TYPE = Q4_0
+causal-quantize-qat-Q4_0: TOKEN_EMBD_TYPE = Q8_0
+causal-quantize-qat-Q4_0: OUTPUT_TYPE = Q8_0
+causal-quantize-qat-Q4_0: causal-quantize-model
+
 causal-quantize-model:
-	@CONVERTED_MODEL="$(CONVERTED_MODEL)" QUANTIZED_TYPE="$(QUANTIZED_TYPE)" ./scripts/utils/quantize.sh ${CONVERTED_MODEL} ${QUANTIZED_TYPE}
-	@echo "Export the quantized model path to QUANTIZED_MODEL variable in your environment"
+	$(call quantize_model,$(CONVERTED_MODEL),QUANTIZED_MODEL)
 
 causal-run-quantized-model:
 	@QUANTIZED_MODEL="$(QUANTIZED_MODEL)" ./scripts/causal/run-converted-model.sh ${QUANTIZED_MODEL}
@@ -117,9 +131,15 @@ embedding-quantize-Q8_0: embedding-quantize-model
 embedding-quantize-Q4_0: QUANTIZED_TYPE = Q4_0
 embedding-quantize-Q4_0: embedding-quantize-model
 
+# For Quantization Aware Trained (QAT) models in Q4_0 we explicitly set the
+# token embedding and output types to Q8_0 instead of the default Q6_K.
+embedding-quantize-qat-Q4_0: QUANTIZED_TYPE = Q4_0
+embedding-quantize-qat-Q4_0: TOKEN_EMBD_TYPE = Q8_0
+embedding-quantize-qat-Q4_0: OUTPUT_TYPE = Q8_0
+embedding-quantize-qat-Q4_0: embedding-quantize-model
+
 embedding-quantize-model:
-	@./scripts/utils/quantize.sh ${CONVERTED_EMBEDDING_MODEL} ${QUANTIZED_TYPE}
-	@echo "Export the quantized model path to QUANTIZED_EMBEDDING_MODEL variable in your environment"
+	$(call quantize_model,$(CONVERTED_EMBEDDING_MODEL),QUANTIZED_EMBEDDING_MODEL)
 
 embedding-run-quantized-model:
 	@./scripts/embedding/run-converted-model.sh ${QUANTIZED_EMBEDDING_MODEL}
````
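Both quantize recipes now delegate to the shared `quantize_model` canned recipe, with the QAT targets supplying `TOKEN_EMBD_TYPE` and `OUTPUT_TYPE` as target-specific variables. As a sketch, with an illustrative model filename, `make causal-quantize-qat-Q4_0` effectively expands to:

```sh
# $(call quantize_model,$(CONVERTED_MODEL),QUANTIZED_MODEL) with
# QUANTIZED_TYPE=Q4_0, TOKEN_EMBD_TYPE=Q8_0, OUTPUT_TYPE=Q8_0
# (model-f16.gguf is an assumed filename):
CONVERTED_MODEL="model-f16.gguf" QUANTIZED_TYPE="Q4_0" \
TOKEN_EMBD_TYPE="Q8_0" OUTPUT_TYPE="Q8_0" \
./scripts/utils/quantize.sh "model-f16.gguf" "Q4_0" "Q8_0" "Q8_0"
echo "Export the quantized model path to QUANTIZED_MODEL variable in your environment"
```

Passing the values both through the environment and as positional arguments keeps the script usable standalone as well as from make.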
README.md:

````diff
@@ -137,6 +137,18 @@ Then the quantized model can be run using the following command:
 (venv) $ make causal-run-quantized-model
 ```
 
+### Quantizing QAT (Quantization Aware Training) models
+When quantizing to `Q4_0`, the default data type for the token embedding weights
+will be `Q6_K`. For models that are going to be uploaded to ggml-org it is
+recommended to use `Q8_0` instead for the embeddings and output tensors.
+The reason is that although `Q6_K` is smaller in size, it requires more compute
+to unpack, which can hurt performance during output generation when the entire
+embedding matrix must be dequantized to compute vocabulary logits. `Q8_0`
+provides practically full quality with better computational efficiency.
+```console
+(venv) $ make causal-quantize-qat-Q4_0
+```
+
+
 ## Embedding Language Model Conversion
 
@@ -238,6 +250,18 @@ Then the quantized model can be run using the following command:
 (venv) $ make embedding-run-quantized-model
 ```
 
+### Quantizing QAT (Quantization Aware Training) models
+When quantizing to `Q4_0`, the default data type for the token embedding weights
+will be `Q6_K`. For models that are going to be uploaded to ggml-org it is
+recommended to use `Q8_0` instead for the embeddings and output tensors.
+The reason is that although `Q6_K` is smaller in size, it requires more compute
+to unpack, which can hurt performance during output generation when the entire
+embedding matrix must be dequantized to compute vocabulary logits. `Q8_0`
+provides practically full quality with better computational efficiency.
+```console
+(venv) $ make embedding-quantize-qat-Q4_0
+```
+
 ## Perplexity Evaluation
 
 ### Simple perplexity evaluation
````
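Under the hood, the new targets come down to a `llama-quantize` call with the two tensor-type overrides (the flags are taken from the script change below). A rough hand-run equivalent, with assumed input and output filenames:

```console
(venv) $ ../../build/bin/llama-quantize \
      --token-embedding-type Q8_0 \
      --output-tensor-type Q8_0 \
      model-f16.gguf model-qat-Q4_0.gguf Q4_0
```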
scripts/utils/quantize.sh:

````diff
@@ -4,6 +4,8 @@ set -e
 
 CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
 QUANTIZED_TYPE="${2:-"$QUANTIZED_TYPE"}"
+TOKEN_EMBD_TYPE="${3:-"${TOKEN_EMBD_TYPE}"}"
+OUTPUT_TYPE="${4:-"${OUTPUT_TYPE}"}"
 QUANTIZED_MODEL=$CONVERTED_MODEL
 
 # Final check if we have a model path
@@ -14,6 +16,11 @@ if [ -z "$CONVERTED_MODEL" ]; then
     exit 1
 fi
 
+if [ -z "$QUANTIZED_TYPE" ]; then
+    echo "Error: QUANTIZED_TYPE is required" >&2
+    exit 1
+fi
+
 echo $CONVERTED_MODEL
 
 # Process the quantized model filename
@@ -26,9 +33,16 @@ else
     exit 1
 fi
 
+
 cmake --build ../../build --target llama-quantize -j8
 
-../../build/bin/llama-quantize $CONVERTED_MODEL $QUANTIZED_MODEL $QUANTIZED_TYPE
+echo $TOKEN_EMBD_TYPE
+echo $OUTPUT_TYPE
+
+CMD_ARGS=("../../build/bin/llama-quantize")
+[[ -n "$TOKEN_EMBD_TYPE" ]] && CMD_ARGS+=("--token-embedding-type" "$TOKEN_EMBD_TYPE")
+[[ -n "$OUTPUT_TYPE" ]]     && CMD_ARGS+=("--output-tensor-type" "$OUTPUT_TYPE")
+CMD_ARGS+=("$CONVERTED_MODEL" "$QUANTIZED_MODEL" "$QUANTIZED_TYPE")
+
+"${CMD_ARGS[@]}"
 
 echo "Quantized model saved to: $QUANTIZED_MODEL"
````
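With the new optional third and fourth positional arguments, the script can be driven positionally or purely through the environment; a usage sketch with assumed filenames:

```console
# Positional: converted model, quantized type, token embedding type, output type
$ ./scripts/utils/quantize.sh model-f16.gguf Q4_0 Q8_0 Q8_0

# Environment-driven, as the Makefile targets invoke it
$ CONVERTED_MODEL=model-f16.gguf QUANTIZED_TYPE=Q4_0 \
  TOKEN_EMBD_TYPE=Q8_0 OUTPUT_TYPE=Q8_0 ./scripts/utils/quantize.sh
```

Since the type arguments are appended to `CMD_ARGS` only when non-empty, plain quantization without overrides continues to work unchanged.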
Author: Daniel Bevenius