mirror of https://github.com/ggml-org/llama.cpp.git
	Allow quantize to only copy tensors, some other improvements (#2931)
* Allow quantize tool to only copy tensors to allow repackaging models.
* Slightly better logic when requantizing.
* Change help message to go to `stdout`.
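For library users the same repackaging behaviour is reachable through the quantize API touched in this commit. A minimal sketch, assuming the llama_model_quantize() entry point declared in llama.h alongside the llama_model_quantize_default_params() helper patched below; the file paths are purely illustrative:

#include "llama.h"

#include <cstdio>

int main() {
    // Start from the library defaults, then request a pure copy; with
    // only_copy set, ftype, allow_requantize and quantize_output_tensor
    // are ignored (see the llama.h comment at the end of this commit).
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.only_copy = true;

    // Illustrative paths: no tensor data is converted, only repackaged.
    const char * fname_inp = "models/llama/ggml-model.gguf";
    const char * fname_out = "models/llama/ggml-model-copy.gguf";

    if (llama_model_quantize(fname_inp, fname_out, &params) != 0) {
        fprintf(stderr, "%s: failed to copy tensors\n", fname_inp);
        return 1;
    }
    return 0;
}

On the command line the same run is requested by passing the new COPY type, e.g. ./quantize models/llama/ggml-model.gguf models/llama/ggml-model-copy.gguf COPY (output name again illustrative). The quantize tool changes below wire COPY into the existing option table and argument parsing.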
@@ -35,6 +35,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
     { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "13.00G              @ 7B", },
     { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G              @ 7B", },
+    // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
+    { "COPY",   LLAMA_FTYPE_ALL_F32,       "only copy tensors, no quantizing", },
 };
 
 
@@ -71,12 +73,17 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
 //  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
 //
 void usage(const char * executable) {
-    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
-    fprintf(stderr, "  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
-    fprintf(stderr, "  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
-    fprintf(stderr, "\nAllowed quantization types:\n");
+    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
+    printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
+    printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
+    printf("\nAllowed quantization types:\n");
     for (auto & it : QUANT_OPTIONS) {
-        printf("  %2d  or  %-6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str());
+        if (it.name != "COPY") {
+            printf("  %2d  or  ", it.ftype);
+        } else {
+            printf("          ");
+        }
+        printf("%-6s : %s\n", it.name.c_str(), it.desc.c_str());
     }
     exit(1);
 }
@@ -121,6 +128,9 @@ int main(int argc, char ** argv) {
         // export as [inp path]/ggml-model-[ftype].gguf
         fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
         arg_idx++;
+        if (ftype_str == "COPY") {
+            params.only_copy = true;
+        }
     }
     else {
         fname_out = argv[arg_idx];
@@ -133,6 +143,10 @@ int main(int argc, char ** argv) {
         if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
             fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
             return 1;
+        } else {
+            if (ftype_str == "COPY") {
+               params.only_copy = true;
+            }
         }
         arg_idx++;
     }
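The note about COPY coming after F32 is worth unpacking: COPY reuses LLAMA_FTYPE_ALL_F32, which has the numeric value 0, and try_parse_ftype (its body is not part of this diff) accepts either a type name or a number. A hypothetical reduction of that name-or-number lookup, written only to show why the table order matters:

#include <string>
#include <vector>

struct quant_option {
    std::string name;
    int         ftype; // stand-in for llama_ftype
    std::string desc;
};

// Hypothetical lookup mirroring the name-or-number matching that the note
// alludes to: the first matching entry wins, and both F32 and COPY carry
// ftype 0, so a numeric "0" must resolve to F32, hence COPY is listed last.
static bool lookup_ftype(const std::vector<quant_option> & opts, const std::string & arg,
                         int & ftype_out, std::string & name_out) {
    for (const auto & opt : opts) {
        if (opt.name == arg || std::to_string(opt.ftype) == arg) {
            ftype_out = opt.ftype;
            name_out  = opt.name;
            return true;
        }
    }
    return false;
}

Ordered as in the hunk above, both "F32" and "0" resolve to F32, while COPY is only reachable by name, which is what later flips params.only_copy in main().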
							
								
								
									
llama.cpp (25 changed lines)
@@ -4683,6 +4683,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     llm_load_arch(*ml, model);
     llm_load_hparams(*ml, model, 0, 0, 0);
 
+    if (params->only_copy) {
+        ftype = model.ftype;
+    }
+
     const size_t align = GGUF_DEFAULT_ALIGNMENT;
     struct gguf_context * ctx_out = gguf_init_empty();
 
@@ -4769,18 +4773,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // quantize only 2D tensors
         quantize &= (tensor->n_dims == 2);
         quantize &= params->quantize_output_tensor || name != "output.weight";
-        quantize &= quantized_type != tensor->type;
+        quantize &= !params->only_copy;
 
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
 
-        if (!quantize) {
-            new_type = tensor->type;
-            new_data = tensor->data;
-            new_size = ggml_nbytes(tensor);
-            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
-        } else {
+        if (quantize) {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
             // TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -4879,7 +4878,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 }
             }
 #endif
-
+            // If we've decided to quantize to the same type the tensor is already
+            // in then there's nothing to do.
+            quantize = tensor->type != new_type;
+        }
+        if (!quantize) {
+            new_type = tensor->type;
+            new_data = tensor->data;
+            new_size = ggml_nbytes(tensor);
+            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
+        } else {
             const size_t nelements = ggml_nelements(tensor);
 
             float * f32_data;
@@ -5310,6 +5318,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.ftype                       =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
         /*.allow_requantize            =*/ false,
         /*.quantize_output_tensor      =*/ true,
+        /*.only_copy                   =*/ false,
     };
 
     return result;
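The reshuffled branches above are the "slightly better logic when requantizing": the copy-or-quantize decision is now made after new_type has been chosen for the individual tensor, including the GGML_USE_K_QUANTS overrides elided from the hunk, rather than by comparing against the global target type up front. A stripped-down sketch of the resulting decision, with placeholder types and the remaining conditions omitted:

// Placeholder types standing in for ggml_type; the real loop also gates on
// tensor dimensionality, output.weight handling and allow_requantize.
enum class tensor_type { F32, F16, Q4_K, Q6_K };

static bool should_requantize(tensor_type current, tensor_type chosen, bool only_copy) {
    if (only_copy) {
        return false; // COPY mode: always pass the tensor data through untouched
    }
    // New behaviour: a tensor already stored in its per-tensor target type is
    // copied verbatim instead of being requantized to the same type.
    return current != chosen;
}

In only_copy mode the first hunk also resets ftype to model.ftype, so the output file keeps the input's file-type metadata rather than whatever type was passed on the command line.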
							
								
								
									
llama.h (1 changed line)
@@ -164,6 +164,7 @@ extern "C" {
         enum llama_ftype ftype;      // quantize to this llama_ftype
         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
+        bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
     } llama_model_quantize_params;
 
     // grammar types
Kerfuffle