#!/usr/bin/env python3

"""Script to execute the "quantize" script on a given set of models."""

import subprocess
import argparse
import glob
import sys
import os


def main():
    """Update the quantize binary name depending on the platform, parse
    the command line arguments, and execute the quantize script.
    """

    if "linux" in sys.platform or "darwin" in sys.platform:
        quantize_script_binary = "quantize"

    elif "win32" in sys.platform or "cygwin" in sys.platform:
        quantize_script_binary = "quantize.exe"

    else:
        print("WARNING: Unknown platform. Assuming a UNIX-like OS.\n")
        quantize_script_binary = "quantize"

    parser = argparse.ArgumentParser(
        prog='python3 quantize.py',
        description='This script quantizes the given models by applying the '
        f'"{quantize_script_binary}" script on them.'
    )
    parser.add_argument(
        'models', nargs='+', choices=('7B', '13B', '30B', '65B'),
        help='The models to quantize.'
    )
    parser.add_argument(
        '-r', '--remove-f16', action='store_true', dest='remove_f16',
        help='Remove the f16 model after quantizing it.'
    )
    parser.add_argument(
        '-m', '--models-path', dest='models_path',
        default=os.path.join(os.getcwd(), "models"),
        help='Specify the directory where the models are located.'
    )
    parser.add_argument(
        '-q', '--quantize-script-path', dest='quantize_script_path',
        default=os.path.join(os.getcwd(), quantize_script_binary),
        help='Specify the path to the "quantize" script.'
    )

    # TODO: Revise this code
    # parser.add_argument(
    #     '-t', '--threads', dest='threads', type=int,
    #     default=os.cpu_count(),
    #     help='Specify the number of threads to use to quantize many models at '
    #     'once. Defaults to os.cpu_count().'
    # )

    args = parser.parse_args()
    args.models_path = os.path.abspath(args.models_path)

    if not os.path.isfile(args.quantize_script_path):
        print(
            f'The "{quantize_script_binary}" script was not found in the '
            "current location.\nIf you want to use it from another location, "
            "set the --quantize-script-path argument from the command line."
        )
        sys.exit(1)

    for model in args.models:
        # The model may be split into multiple parts
        # (ggml-model-f16.bin, ggml-model-f16.bin.0, ggml-model-f16.bin.1...)
        f16_model_path_base = os.path.join(
            args.models_path, model, "ggml-model-f16.bin"
        )

        if not os.path.isfile(f16_model_path_base):
            print(f"The file {f16_model_path_base} was not found")
            sys.exit(1)

        # glob already returns full paths here because the pattern is built
        # from the absolute base path, so no extra join is needed
        f16_model_parts_paths = glob.glob(f"{f16_model_path_base}*")

        for f16_model_part_path in f16_model_parts_paths:
            if not os.path.isfile(f16_model_part_path):
                print(
                    f"The f16 model {os.path.basename(f16_model_part_path)} "
                    f"was not found in {args.models_path}{os.path.sep}{model}"
                    ". If you want to use it from another location, set the "
                    "--models-path argument from the command line."
                )
                sys.exit(1)

            __run_quantize_script(
                args.quantize_script_path, f16_model_part_path
            )

            if args.remove_f16:
                os.remove(f16_model_part_path)


# This was extracted to a top-level function for parallelization, if
# implemented. See https://github.com/ggerganov/llama.cpp/pull/222/commits/f8db3d6cd91bf1a1342db9d29e3092bc12dd783c#r1140496406

def __run_quantize_script(script_path, f16_model_part_path):
    """Run the quantize script specifying the path to it and the path to the
    f16 model to quantize.
    """

    new_quantized_model_path = f16_model_part_path.replace("f16", "q4_0")
    # "2" is the type code the quantize binary uses for the q4_0 format
    subprocess.run(
        [script_path, f16_model_part_path, new_quantized_model_path, "2"],
        check=True
    )
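

# A possible parallel driver for the per-part loop in main(): a sketch only,
# not wired in, assuming the commented-out "--threads" option above were
# enabled. __run_quantize_script was extracted to the top level precisely so
# that worker processes can pick it up (see the note above).
def __run_quantize_script_parallel(script_path, f16_model_part_paths, threads):
    """Quantize all model parts concurrently using a process pool (sketch)."""
    from multiprocessing import Pool

    with Pool(processes=threads) as pool:
        # starmap blocks until every part is done; a failing subprocess.run
        # with check=True in any worker re-raises CalledProcessError here
        pool.starmap(
            __run_quantize_script,
            [(script_path, path) for path in f16_model_part_paths]
        )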


if __name__ == "__main__":
    try:
        main()

    except subprocess.CalledProcessError:
        print("\nAn error occurred while trying to quantize the models.")
        sys.exit(1)

    except KeyboardInterrupt:
        sys.exit(0)

    else:
        print("\nSuccessfully quantized all models.")