Mirror of https://github.com/ggml-org/llama.cpp.git
	convert : avoid dequantizing mxfp4 for GPT-OSS (#16756)
@@ -8943,6 +8943,13 @@ class SmolLM3Model(LlamaModel):
 class GptOssModel(TextModel):
     model_arch = gguf.MODEL_ARCH.GPT_OSS
 
+    # TODO: remove once MXFP4 is supported more generally
+    def dequant_model(self):
+        quant_config = self.hparams.get("quantization_config")
+        if quant_config is not None and quant_config.get("quant_method") == "mxfp4":
+            return
+        return super().dequant_model()
+
     def transform_nibble_layout(self, tensor):
         assert tensor.dtype == torch.uint8
         assert tensor.shape[-1] == 16
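For context, here is a minimal, self-contained sketch of the guard pattern the patch adds, runnable on its own. The TextModel/GptOssModel names and the dequant_model override mirror the diff; the hparams dict is a stand-in for the parsed Hugging Face config.json, and the base-class body is a placeholder rather than the converter's real dequantization logic:

class TextModel:
    """Stand-in for the converter's base model class (simplified)."""

    def __init__(self, hparams: dict):
        self.hparams = hparams

    def dequant_model(self):
        # Generic path: dequantize a quantized checkpoint back to float
        # before conversion (placeholder for the real logic).
        print("dequantizing tensors to float")


class GptOssModel(TextModel):
    # TODO (per the patch): remove once MXFP4 is supported more generally
    def dequant_model(self):
        quant_config = self.hparams.get("quantization_config")
        if quant_config is not None and quant_config.get("quant_method") == "mxfp4":
            # Keep the MXFP4 blocks as-is; only their nibble layout is
            # transformed later, instead of round-tripping through float.
            return
        return super().dequant_model()


# An MXFP4 GPT-OSS checkpoint skips dequantization entirely:
GptOssModel({"quantization_config": {"quant_method": "mxfp4"}}).dequant_model()
# Any other (or absent) quant method falls through to the generic path:
GptOssModel({}).dequant_model()

The assertion in transform_nibble_layout that tensor.shape[-1] == 16 is consistent with the MXFP4 block layout: 32 FP4 (E2M1) values packed two per byte into 16 bytes, with a shared E8M0 scale stored per block.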
compilade