mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-31 08:51:55 +00:00 
			
		
		
		
	quantize : improve tensor-type pattern matching (#13033)
This commit is contained in:
		| @@ -14,6 +14,12 @@ | ||||
| #include <thread> | ||||
| #include <unordered_map> | ||||
|  | ||||
| // Quantization types. Changes to this struct must be replicated in quantize.cpp | ||||
| struct tensor_quantization { | ||||
|     std::string name; | ||||
|     ggml_type quant = GGML_TYPE_COUNT; | ||||
| }; | ||||
|  | ||||
| static void zeros(std::ofstream & file, size_t n) { | ||||
|     char zero = 0; | ||||
|     for (size_t i = 0; i < n; ++i) { | ||||
| @@ -48,12 +54,6 @@ struct quantize_state_impl { | ||||
|         {} | ||||
| }; | ||||
|  | ||||
| // changes to this struct must be replicated in quantize.cpp | ||||
| struct tensor_quantization { | ||||
|     std::string name; | ||||
|     ggml_type quant = GGML_TYPE_COUNT; | ||||
| }; | ||||
|  | ||||
| static void llama_tensor_dequantize_impl( | ||||
|     ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers, | ||||
|     const size_t nelements, const int nthread | ||||
| @@ -796,17 +796,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: | ||||
|                 // unless the user specifies a type | ||||
|                 if (params->tensor_types) { | ||||
|                     const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types); | ||||
|                     const std::string tensor_name(tensor->name); | ||||
|                     for (const auto & [tname, qtype] : tensor_types) { | ||||
|                         if (std::regex pattern(tname); std::regex_search(tensor->name, pattern)) { | ||||
|                             if (qtype != new_type) { | ||||
|                                 LLAMA_LOG_DEBUG("(overriding %s -> %s), ", ggml_type_name(new_type), ggml_type_name(qtype)); | ||||
|                         if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) { | ||||
|                             if  (qtype != new_type) { | ||||
|                                 LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type)); | ||||
|                                 new_type = qtype; | ||||
|                                 break; // if two or more types are specified for the tensor, first match wins | ||||
|                             } | ||||
|                             new_type = qtype; | ||||
|                             break; | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { | ||||
|                 new_type = params->token_embedding_type; | ||||
|             } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Ed Addario
					Ed Addario