	awq-py : remove (#5768)
awq-py/README.md | 116 ----------
@@ -1,116 +0,0 @@
# AWQ: Activation-aware Weight Quantization for LLM - version adapted for llama.cpp
[[Paper](https://arxiv.org/abs/2306.00978)][[Original Repo](https://github.com/mit-han-lab/llm-awq)][[Easy-to-use Repo](https://github.com/casper-hansen/AutoAWQ)]

**Supported models:**

- [x] LLaMA
- [x] LLaMA 2
- [x] MPT
- [x] Mistral AI v0.1
- [ ] Bloom
- [ ] Mixtral MoE

**TODO:**
- [x] Update the version to work with both MPT and MPT-AWQ models
- [ ] Add OPT model
- [ ] Add Bloom model
- [ ] Add Mixtral MoE
- [ ] Support w3, w2


## Contents

- [Install](#install)
- [Convert](#convert)
- [Quantize](#quantize)
- [Test](#test)
- [Benchmark](#benchmark)
- [Results](#results)

## Install
Install the requirements:
```bash
pip install -r requirements.txt
```
Get the pre-computed AWQ search results for multiple model families, including LLaMA, LLaMA 2, MPT, and OPT:
```bash
git clone https://huggingface.co/datasets/mit-han-lab/awq-model-zoo awq_cache
```

## Convert
Example for LLaMA models:
```bash
# For llama-7b and llama2 models
python convert.py models/llama-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/llama_7b_fp16.gguf
# For mistral and mpt models
python convert-hf-to-gguf.py models/mpt-7b/ --awq-path awq_cache/mpt-7b-w4-g128.pt --outfile models/mpt_7b_fp16.gguf
```

## Quantize
```bash
# We only benchmark and confirm the results on the q4_0, q4_1, and q2_k quantization types.
./quantize models/llama_7b_fp16.gguf models/llama_7b_q4_0.gguf q4_0
```

## Test
```bash
# For all models.
./build/bin/main -m models/llama_7b_q4_0.gguf -n 128 --prompt "Once upon a time"
```

## Benchmark
The perplexity measurements in the tables below are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with a context length of 512.
```bash
# For llama, llama2, and mistral models.
./perplexity -m models/llama_7b_q4_0.gguf -f datasets/wikitext-2-raw/wiki.test.raw
```

## Results
Results are run with OpenBLAS (CPU) and cuBLAS (GPU) for a fair comparison.
We use three llama.cpp quantization types with our version: q4_0, q4_1, and q2_k.

### Llama 7B (Build with OpenBLAS)

| Model        | Measure     | F16    | Q4_0   | Q4_1   | Q2_K   |
|-------------:|-------------|-------:|-------:|-------:|-------:|
|Llama 7B      | perplexity  | 5.9066 | 6.1214 | 6.0643 | 6.5808 |
|Llama 7B      | file size   |  12.9G |   3.5G |   3.9G |   2.7G |
|Llama 7B      | bits/weight |   16.0 |    4.5 |    5.0 |    2.6 |
|AWQ-Llama 7B  | perplexity  | 5.9175 | 6.0252 | 5.9987 | 6.3692 |
|AWQ-Llama 7B  | file size   |  12.9G |   3.5G |   3.9G |   2.7G |
|AWQ-Llama 7B  | bits/weight |   16.0 |    4.5 |    5.0 |    2.6 |


### Llama2 7B (Build with cuBLAS)

| Model         | Measure     | F16    | Q4_0   | Q4_1   | Q2_K   |
|--------------:|-------------|-------:|-------:|-------:|-------:|
|Llama2 7B      | perplexity  | 5.8664 | 6.0260 | 6.0656 | 6.4496 |
|Llama2 7B      | file size   |  12.9G |   3.5G |   3.9G |   2.7G |
|Llama2 7B      | bits/weight |   16.0 |    4.5 |    5.0 |    2.6 |
|AWQ-Llama2 7B  | perplexity  | 5.8801 | 6.0054 | 5.9849 | 6.3650 |
|AWQ-Llama2 7B  | file size   |  12.9G |   3.5G |   3.9G |   2.7G |
|AWQ-Llama2 7B  | bits/weight |   16.0 |    4.5 |    5.0 |    2.6 |


### Mistral 7B v0.1 (Build with cuBLAS)

| Model          | Measure     | F16    | Q4_0   | Q4_1   | Q2_K   |
|---------------:|-------------|-------:|-------:|-------:|-------:|
|Mistral 7B      | perplexity  | 5.6931 | 5.8202 | 5.8268 | 6.1645 |
|Mistral 7B      | file size   |  14.5G |   4.1G |   4.5G |   3.1G |
|Mistral 7B      | bits/weight |   16.0 |    4.5 |    5.0 |    2.6 |
|AWQ-Mistral 7B  | perplexity  | 5.6934 | 5.8020 | 5.7691 | 6.0426 |
|AWQ-Mistral 7B  | file size   |  14.5G |   4.1G |   4.5G |   3.1G |
|AWQ-Mistral 7B  | bits/weight |   16.0 |    4.5 |    5.0 |    2.6 |

### MPT 7B (Build with OpenBLAS)

| Model      | Measure     | F16    | Q4_0   | Q4_1   | Q2_K    |
|-----------:|-------------|-------:|-------:|-------:|--------:|
|MPT 7B      | perplexity  | 8.4369 | 8.7956 | 8.6265 | 11.4913 |
|MPT 7B      | file size   |  13.7G |   3.9G |   4.3G |   2.8G  |
|MPT 7B      | bits/weight |   16.0 |    4.5 |    5.0 |    2.6  |
|AWQ-MPT 7B  | perplexity  | 8.4944 | 8.7053 | 8.6750 | 10.2873 |
|AWQ-MPT 7B  | file size   |  13.7G |   3.9G |   4.3G |   2.8G  |
|AWQ-MPT 7B  | bits/weight |   16.0 |    4.5 |    5.0 |    2.6  |
@@ -1,254 +0,0 @@
"""
Implements AWQ (Activation-aware Weight Quantization) for llama.cpp use cases.
Original paper: https://arxiv.org/abs/2306.00978

This code is based on versions of the AWQ implementation found in the following repositories:
* https://github.com/mit-han-lab/llm-awq
* https://github.com/casper-hansen/AutoAWQ
"""

import os
import torch
import torch.nn as nn

from transformers import AutoModelForCausalLM, AutoConfig
from transformers.models.bloom.modeling_bloom import BloomGelu
from transformers.models.llama.modeling_llama import LlamaRMSNorm
from transformers.activations import GELUActivation


class ScaledActivation(nn.Module):
    """
    ScaledActivation module wraps an existing activation function and applies a
    scale factor to its output.

    Args:
        module (nn.Module): The activation function to be scaled.
        scales (torch.Tensor): A tensor of size (num_features,) containing the initial
            scale factors for each feature.

    Returns:
        torch.Tensor: The scaled output of the activation function.
    """

    def __init__(self, module, scales):
        super().__init__()
        self.act = module
        self.scales = nn.Parameter(scales.data)

    def forward(self, x):
        return self.act(x) / self.scales.view(1, 1, -1).to(x.device)
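

# Usage sketch (hypothetical feature size): wrapping a GELU divides its output
# per-feature, compensating for the AWQ scale that scale_gelu_fc below
# multiplies into the weights of the linear layer that follows it:
#   act = ScaledActivation(nn.GELU(), torch.ones(4096))
#   y = act(torch.randn(1, 8, 4096))  # same shape as the input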


def set_op_by_name(layer, name, new_module):
    """
    Sets a new submodule at the given dotted name within a module.

    Args:
        layer (nn.Module): The layer in which to replace the submodule.
        name (str): The path to the submodule to be replaced, using dot notation
            to access nested modules.
        new_module (nn.Module): The new module to replace the existing one.
    """
    levels = name.split(".")
    if len(levels) > 1:
        mod_ = layer
        for l_idx in range(len(levels) - 1):
            if levels[l_idx].isdigit():
                # numeric path components index into module lists (e.g. "layers.0")
                mod_ = mod_[int(levels[l_idx])]
            else:
                mod_ = getattr(mod_, levels[l_idx])
        setattr(mod_, levels[-1], new_module)
    else:
        setattr(layer, name, new_module)
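
# Example (hypothetical module layout): numeric path components index into an
# nn.ModuleList, so this replaces the activation of the first transformer block:
#   set_op_by_name(model, "transformer.h.0.mlp.act", ScaledActivation(nn.GELU(), scales))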


def get_op_by_name(module, op_name):
    """
    Retrieves a submodule within a given layer based on its name.

    Args:
        module (nn.Module): The layer containing the submodule to find.
        op_name (str): The name of the submodule.

    Returns:
        nn.Module: The requested submodule found within the given layer.

    Raises:
        ValueError: If the specified submodule cannot be found within the layer.
    """
    for name, m in module.named_modules():
        if name == op_name:
            return m
    raise ValueError(f"Cannot find op {op_name} in module {module}")
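
# Example (hypothetical layout): fetch a nested projection by its dotted name:
#   q_proj = get_op_by_name(model, "model.layers.0.self_attn.q_proj")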


@torch.no_grad()
def scale_ln_fcs(ln, fcs, scales):
    """
    Scales the weights of a LayerNorm and a list of fully-connected layers proportionally.

    Args:
        ln (nn.LayerNorm): The LayerNorm module to be scaled.
        fcs (List[nn.Linear]): A list of fully-connected layers to be scaled.
        scales (torch.Tensor): A 1D tensor of size (num_features,).
    """

    if not isinstance(fcs, list):
        fcs = [fcs]

    scales = scales.to(ln.weight.device)

    # Dividing the norm's output by `scales` while multiplying the following
    # linear weights by the same factors leaves the composed output unchanged.
    ln.weight.div_(scales)
    if hasattr(ln, "bias") and ln.bias is not None:
        ln.bias.div_(scales)

    for fc in fcs:
        fc.weight.mul_(scales.view(1, -1))

    for p in ln.parameters():
        assert torch.isnan(p).sum() == 0
    for fc in fcs:
        for p in fc.parameters():
            assert torch.isnan(p).sum() == 0
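
# Invariance sketch (toy dimensions): scaling the norm down and the linear up
# leaves the composed output unchanged, which is the core AWQ identity:
#   ln, fc = nn.LayerNorm(8), nn.Linear(8, 4)
#   x = torch.randn(2, 8)
#   y0 = fc(ln(x))
#   scale_ln_fcs(ln, [fc], torch.rand(8) + 0.5)
#   assert torch.allclose(y0, fc(ln(x)), atol=1e-5)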


@torch.no_grad()
def scale_fc_fc(fc1, fc2, scales):
    """
    Scales the weights of two fully-connected layers in a specific pattern.

    Args:
        fc1 (nn.Linear): The first fully-connected layer to be scaled.
        fc2 (nn.Linear): The second fully-connected layer to be scaled.
        scales (torch.Tensor): A 1D tensor of size (num_features,).
    """
    assert isinstance(fc1, nn.Linear)
    assert isinstance(fc2, nn.Linear)

    scales = scales.to(fc1.weight.device)

    # Only the last `scales.size(0)` output rows of fc1 feed into fc2, which
    # handles fused projections where only the tail slice is scaled.
    fc1.weight[-scales.size(0):].div_(scales.view(-1, 1))
    if fc1.bias is not None:
        fc1.bias.div_(scales.view(-1))

    fc2.weight.mul_(scales.view(1, -1))

    for p in fc1.parameters():
        assert torch.isnan(p).sum() == 0
    for p in fc2.parameters():
        assert torch.isnan(p).sum() == 0


@torch.no_grad()
def scale_gelu_fc(gelu, fc, scales):
    """
    Scales the weight of the fully-connected layer that follows a GELU activation.

    Args:
        gelu (Union[nn.GELU, BloomGelu, GELUActivation]): The GELU activation module preceding the layer.
        fc (nn.Linear): The fully-connected layer to be scaled.
        scales (torch.Tensor): A 1D tensor of size (num_features,).

    Raises:
        AssertionError: If `gelu` is not of type `nn.GELU`, `BloomGelu`, or
            `GELUActivation`, or if `fc` is not of type `nn.Linear`.
    """
    assert isinstance(gelu, (nn.GELU, BloomGelu, GELUActivation))
    assert isinstance(fc, nn.Linear)

    fc.weight.mul_(scales.view(1, -1).to(fc.weight.device))

    for p in fc.parameters():
        assert torch.isnan(p).sum() == 0


def apply_scale(module, scales_list, input_feat_dict=None):
    """
    Applies different scaling strategies to layers based on their type and hierarchy within a given module.

    Args:
        module (nn.Module): The module containing the layers to be scaled.
        scales_list (List[Tuple[str, List[str], torch.Tensor]]): A list of tuples containing:
            * prev_op_name (str): The name of the preceding operation or module,
                relative to which the layers to be scaled are located.
            * layer_names (List[str]): A list of names of the layers to be scaled, relative to the preceding operation.
            * scales (torch.Tensor): A 1D tensor of size (num_features,) containing the scaling factors for each feature.
        input_feat_dict (Optional[Dict[str, torch.Tensor]]): A dictionary mapping layer names to their corresponding
            input features (optional).
    """
    for prev_op_name, layer_names, scales in scales_list:
        prev_op = get_op_by_name(module, prev_op_name)
        layers = [get_op_by_name(module, name) for name in layer_names]

        prev_op.cuda()
        for layer in layers:
            layer.cuda()
        # Tensor.cuda() is not in-place (unlike Module.cuda() above),
        # so the result must be reassigned.
        scales = scales.cuda()

        if isinstance(prev_op, nn.Linear):
            assert len(layers) == 1
            scale_fc_fc(prev_op, layers[0], scales)
        elif isinstance(prev_op, (nn.LayerNorm, LlamaRMSNorm)) or "rmsnorm" in str(prev_op.__class__).lower():
            scale_ln_fcs(prev_op, layers, scales)
        elif isinstance(prev_op, (nn.GELU, BloomGelu, GELUActivation)):
            new_module = ScaledActivation(prev_op, scales)
            set_op_by_name(module, prev_op_name, new_module)
            scale_gelu_fc(prev_op, layers[0], scales)
        else:
            raise NotImplementedError(f"prev_op {type(prev_op)} not supported yet!")

        # apply the scaling to the input features if given; prepare them for clipping
        if input_feat_dict is not None:
            for layer_name in layer_names:
                inp = input_feat_dict[layer_name]
                inp.div_(scales.view(1, -1).to(inp.device))

        prev_op.cpu()
        for layer in layers:
            layer.cpu()
        scales = scales.cpu()
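
# Shape of `scales_list`, as loaded from the pre-computed AWQ results
# (hypothetical LLaMA layer names; one factor per input feature of the scaled layers):
#   scales_list = [
#       ("model.layers.0.post_attention_layernorm",
#        ["model.layers.0.mlp.gate_proj", "model.layers.0.mlp.up_proj"],
#        torch.rand(4096)),
#   ]
#   apply_scale(model, scales_list)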


@torch.no_grad()
def apply_clip(module, clip_list):
    """
    Applies element-wise clipping to the weight of a specific layer within a given module.

    Args:
        module (nn.Module): The module containing the layer to be clipped.
        clip_list (List[Tuple[str, torch.Tensor]]): A list of tuples containing:
            * name (str): The name of the layer to be clipped, relative to the root of the module.
            * max_val (torch.Tensor): A 1D or 2D tensor defining the upper bound for each element of the layer's weight.
    """
    for name, max_val in clip_list:
        layer = get_op_by_name(module, name)
        layer.cuda()
        max_val = max_val.to(layer.weight.device)
        org_shape = layer.weight.shape
        # reshape so each clipping bound broadcasts over its group of weights,
        # clamp symmetrically, then restore the original shape
        layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1)
        layer.weight.data = torch.clamp(layer.weight.data, -max_val, max_val)
        layer.weight.data = layer.weight.data.reshape(org_shape)
        layer.cpu()


def add_scale_weights(model_path, scale_path, tmp_path):
    """
    Adds pre-computed Activation-aware Weight Quantization (AWQ) results to a model,
    including scaling factors and clipping bounds.

    Args:
        model_path (str): Path to the pre-trained model to be equipped with AWQ.
        scale_path (str): Path to the AWQ scale factors (.pt file).
        tmp_path (str): Path to the temporary directory where the equipped model will be saved.
    """
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_path, config=config, trust_remote_code=True
    )
    model.eval()
    awq_results = torch.load(str(scale_path), map_location="cpu")
    apply_scale(model, awq_results["scale"])
    apply_clip(model, awq_results["clip"])
    model.save_pretrained(str(tmp_path))
    # copy the tokenizer files next to the patched weights (relies on a Unix `cp`)
    os.system(f"cp {str(model_path)}/tokenizer* {str(tmp_path)}")
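
# Usage sketch (illustrative paths, mirroring the README above): patch a model
# with the pre-computed AWQ results and save it for GGUF conversion:
#   add_scale_weights("models/llama-7b", "awq_cache/llama-7b-w4-g128.pt", "models/llama-7b-awq")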
@@ -1,2 +0,0 @@
torch>=2.1.1
transformers>=4.32.0