	convert : avoid dequantizing mxfp4 for GPT-OSS
Author: Francis Couture-Harpin
@@ -8943,6 +8943,13 @@ class SmolLM3Model(LlamaModel):
 class GptOssModel(TextModel):
     model_arch = gguf.MODEL_ARCH.GPT_OSS
 
+    # TODO: remove once MXFP4 is supported more generally
+    def dequant_model(self):
+        quant_config = self.hparams.get("quantization_config")
+        if quant_config is not None and quant_config.get("quant_method") == "mxfp4":
+            return
+        return super().dequant_model()
+
     def transform_nibble_layout(self, tensor):
         assert tensor.dtype == torch.uint8
         assert tensor.shape[-1] == 16
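
For context, a minimal, self-contained sketch of the control flow this override introduces. The TextModel class below is an illustrative stand-in, not the real converter base class; only dequant_model, hparams, quantization_config, and the "mxfp4" quant_method check come from the diff above.

# Illustrative stand-ins for the converter classes; shows how the
# override skips dequantization for MXFP4-quantized GPT-OSS checkpoints.

class TextModel:
    """Stand-in for the converter's base class (assumed shape)."""

    def __init__(self, hparams: dict):
        # hparams mirrors the model's config.json contents
        self.hparams = hparams

    def dequant_model(self):
        # the real base class would expand quantized tensors to float here
        print("dequantizing model tensors")


class GptOssModel(TextModel):
    # TODO: remove once MXFP4 is supported more generally
    def dequant_model(self):
        quant_config = self.hparams.get("quantization_config")
        if quant_config is not None and quant_config.get("quant_method") == "mxfp4":
            # config.json declares MXFP4: keep the weights quantized as-is
            return
        return super().dequant_model()


# A checkpoint whose config.json declares MXFP4 quantization: no-op
GptOssModel({"quantization_config": {"quant_method": "mxfp4"}}).dequant_model()
# Any other (or missing) quantization_config takes the base-class path
GptOssModel({}).dequant_model()  # prints "dequantizing model tensors"

The early return means the MXFP4 tensors reach the rest of the conversion pipeline untouched instead of being expanded to float and requantized, which is the point of the commit.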