Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-11-03 09:22:01 +00:00)
	Option to split during conversion (#6942)
* support splits in convert.py
* Support split by size and dry run to write estimated shards/filesizes
* Move split functionality to new GGUFManager class
* fix improper function signature
* tentative push of convert-hf-to-gguf support
* resolve merge + SplitArguments for easier parsing
* Fix eager tensor memory leak and remove convert.py changes
  Removed a memory leak caused by unexpected reference retention to eager
  tensors. Also removed GGUFManager functionality in convert.py in favor of
  specializing for convert-hf-to-gguf.py.
* refactor SplitStrategy to be a deque
  Instead of having SplitStrategy have a `data` field that is a deque, just
  have SplitStrategy be a subclass of deque itself.
* fix Q8 quantization
* remove unnecessary imports in gguf_manager
* fix final? merge issue
* fix gguf_writer placement and remove comments
* oops, actually fix gguf_writer placement
* reduce duplicated code from gguf_writer
* further simplify GGUFManager
* simplify even further and standardize with GGUFWriter
* reduce diffs with master
* form shards while adding tensors, SHA256 sums agree with master
* re-add type hint
  Co-authored-by: compilade <git@compilade.net>
* GGUFWriter compatibility fix
  Co-authored-by: compilade <git@compilade.net>
* Shard dataclass and un-negative dont_add_architecture
* type consistency in format_n_bytes_to_str
* move kv keys to constants.py
* make pathlib explicit
* base-1024 bytes to base-1000
* rename GGUFManager to GGUFWriterSplit
* Update gguf-py/gguf/constants.py
  Co-authored-by: compilade <git@compilade.net>
* fix convert-hf-to-gguf.py permissions
* fix line endings
* Update gguf-py/gguf/gguf_writer_split.py
  Co-authored-by: compilade <git@compilade.net>
* convert-hf : restore executable file permission
* examples/convert-legacy-llama.py: restore executable file permission
* reinstate original gguf package import and fix type annotation
* attempt to appease the linter
* attempt 2 to appease the linter
* attempt 3 to appease the linter
* comma consistency
* Update convert-hf-to-gguf.py
  Co-authored-by: compilade <git@compilade.net>
* edit cmd line args
* use simplification from #7827
* kv/ti data are still wrong
* try to refactor kv data (still fails)
* fix ti data messiness
* tidy up
* fix linting
* actually make the linter happy
* cleanup round 1
* remove SplitStrategy, SplitArguments
* appease linter
* fix typing and clean up
* fix linting
* Update gguf-py/gguf/gguf_writer.py
  Co-authored-by: compilade <git@compilade.net>
* progress bar, fix split logic
* Update gguf-py/gguf/gguf_writer.py
  Co-authored-by: compilade <git@compilade.net>
* catch oversights
* Update gguf-py/gguf/gguf_writer.py
  Co-authored-by: compilade <git@compilade.net>
* Update gguf-py/gguf/gguf_writer.py
  Co-authored-by: compilade <git@compilade.net>
* Update gguf-py/gguf/gguf_writer.py
  Co-authored-by: compilade <git@compilade.net>
* Update gguf-py/gguf/gguf_writer.py
  Co-authored-by: compilade <git@compilade.net>
* Update gguf-py/gguf/gguf_writer.py
  Co-authored-by: compilade <git@compilade.net>
* swap bar orders
* Update gguf-py/gguf/gguf_writer.py
  Co-authored-by: compilade <git@compilade.net>
* Update gguf-py/gguf/gguf_writer.py
  Co-authored-by: compilade <git@compilade.net>
* compatibility fix
* Update gguf-py/gguf/gguf_writer.py
  Co-authored-by: compilade <git@compilade.net>
* Update convert-hf-to-gguf.py
  Co-authored-by: compilade <git@compilade.net>

---------

Co-authored-by: Brian <mofosyne@gmail.com>
Co-authored-by: compilade <git@compilade.net>
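In practice the new options are driven from the command line. A hypothetical invocation, written here as a Python sketch (the model path and output name are placeholders; only the flag names are taken from parse_args() in the diff below):

    # Hypothetical invocation (not part of this commit): drive the converter
    # with the new split options; flag names match parse_args() below.
    import subprocess

    subprocess.run([
        "python", "convert-hf-to-gguf.py", "models/my-hf-model",  # placeholder path
        "--outfile", "my-model-{ftype}.gguf",
        "--split-max-size", "2G",  # cap each shard at 2 * 10^9 bytes (base-1000)
        "--dry-run",               # print the split plan and exit, write nothing
    ], check=True)

With --dry-run the script only reports the planned shard count and estimated file sizes, which is useful for picking a size limit before committing to a long conversion.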
Committed by GitHub

parent 8cb508d0d5
commit 52fc8705a0
@@ -65,7 +65,8 @@ class Model:
     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH
 
-    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, model_name: str | None):
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool,
+                 model_name: str | None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
         if type(self) is Model:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
         self.dir_model = dir_model
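All four split parameters default to "splitting disabled", so existing Model subclasses and call sites keep working unchanged. A hedged sketch of opting in, assuming the classes from convert-hf-to-gguf.py are in scope (LlamaModel and the paths are illustrative, not taken from this diff):

    # Hedged sketch: construct a concrete Model subclass with a split limit.
    from pathlib import Path

    import gguf

    model = LlamaModel(                     # any concrete Model subclass
        Path("models/my-hf-model"),         # dir_model
        gguf.LlamaFileType.MOSTLY_F16,      # ftype
        Path("my-model-f16.gguf"),          # fname_out
        False,                              # is_big_endian
        False,                              # use_temp_file (incompatible with splitting, see main() below)
        False,                              # eager
        model_name=None,
        split_max_tensors=256,              # start a new shard every 256 tensors
    )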
@@ -96,7 +97,8 @@ class Model:
         ftype_lw: str = ftype_up.lower()
         # allow templating the file name with the output ftype, useful with the "auto" ftype
         self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
-        self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
+        self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
+                                           split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)
 
     @classmethod
     def __init_subclass__(cls):
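The split parameters are forwarded verbatim to GGUFWriter, which owns the sharding logic after this commit (the earlier GGUFManager/GGUFWriterSplit was folded back into GGUFWriter). A minimal construction sketch, assuming gguf-py from this commit onward:

    # Minimal sketch: the writer owns the sharding, callers only pass limits.
    # path=None defers file naming to write time, as in Model.__init__ above.
    import gguf

    writer = gguf.GGUFWriter(
        path=None,
        arch=gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
        split_max_tensors=128,                  # at most 128 tensors per shard
        split_max_size=2 * 1000 * 1000 * 1000,  # and at most 2 * 10^9 bytes per shard
    )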
@@ -332,6 +334,8 @@ class Model:
         self.gguf_writer.close()
 
     def write_vocab(self):
+        if len(self.gguf_writer.tensors) != 1:
+            raise ValueError('Splitting the vocabulary is not supported')
         self.gguf_writer.write_header_to_file(self.fname_out)
         self.gguf_writer.write_kv_data_to_file()
         self.gguf_writer.close()
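The guard exists because a vocab-only export must land in a single file, while an active split plan produces several. For orientation, split files in llama.cpp follow the gguf-split naming convention; a hedged sketch of that convention (the exact format string inside gguf-py may differ):

    # Hedged sketch of the shard naming convention used by llama.cpp split
    # files (as produced by gguf-split); illustrative helper, not from gguf-py.
    def shard_name(stem: str, shard_no: int, shard_count: int) -> str:
        return "{:s}-{:05d}-of-{:05d}.gguf".format(stem, shard_no, shard_count)

    assert shard_name("my-model-f16", 1, 3) == "my-model-f16-00001-of-00003.gguf"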
@@ -2974,10 +2978,44 @@ def parse_args() -> argparse.Namespace:
         "--verbose", action="store_true",
         help="increase output verbosity",
     )
+    parser.add_argument(
+        "--split-max-tensors", type=int, default=0,
+        help="max tensors in each split",
+    )
+    parser.add_argument(
+        "--split-max-size", type=str, default="0",
+        help="max size per split N(M|G)",
+    )
+    parser.add_argument(
+        "--dry-run", action="store_true",
+        help="only print out a split plan and exit, without writing any new files",
+    )
+    parser.add_argument(
+        "--no-tensor-first-split", action="store_true",
+        help="do not add tensors to the first split (disabled by default)"
+    )
 
     return parser.parse_args()
 
 
+def split_str_to_n_bytes(split_str: str) -> int:
+    if split_str.endswith("K"):
+        n = int(split_str[:-1]) * 1000
+    elif split_str.endswith("M"):
+        n = int(split_str[:-1]) * 1000 * 1000
+    elif split_str.endswith("G"):
+        n = int(split_str[:-1]) * 1000 * 1000 * 1000
+    elif split_str.isnumeric():
+        n = int(split_str)
+    else:
+        raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G")
+
+    if n < 0:
+        raise ValueError(f"Invalid split size: {split_str}, must be positive")
+
+    return n
+
+
 def main() -> None:
     args = parse_args()
 
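A few worked examples for split_str_to_n_bytes() above; note the commit switched these units from base-1024 to base-1000:

    # Worked examples; behavior follows split_str_to_n_bytes() as added above.
    assert split_str_to_n_bytes("0") == 0                      # "0" disables size-based splitting
    assert split_str_to_n_bytes("4096") == 4096                # bare numbers are bytes
    assert split_str_to_n_bytes("250M") == 250 * 1000 * 1000   # base-1000, not 250 * 1024 ** 2
    assert split_str_to_n_bytes("2G") == 2 * 1000 * 1000 * 1000
    # split_str_to_n_bytes("2.5G") raises ValueError: fractional sizes are not parsed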
@@ -3010,6 +3048,10 @@ def main() -> None:
         "auto": gguf.LlamaFileType.GUESSED,
     }
 
+    if args.use_temp_file and (args.split_max_tensors > 0 or args.split_max_size != "0"):
+        logger.error("Error: Cannot use temp file when splitting")
+        sys.exit(1)
+
     if args.outfile is not None:
         fname_out = args.outfile
     else:
@@ -3027,7 +3069,10 @@ def main() -> None:
             logger.error(f"Model {hparams['architectures'][0]} is not supported")
             sys.exit(1)
 
-        model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy, args.model_name)
+        model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file,
+                                     args.no_lazy, args.model_name, split_max_tensors=args.split_max_tensors,
+                                     split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
+                                     small_first_shard=args.no_tensor_first_split)
 
         logger.info("Set model parameters")
         model_instance.set_gguf_parameters()
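The CLI values are normalized at this call site before Model ever sees them: the size string goes through split_str_to_n_bytes(), and the negated flag --no-tensor-first-split maps onto the positively named small_first_shard parameter. A hedged restatement of that mapping (runs inside main(), where args is the argparse.Namespace):

    # Hedged restatement of the call-site mapping above.
    split_kwargs = dict(
        split_max_tensors=args.split_max_tensors,                  # int, 0 = no tensor-count limit
        split_max_size=split_str_to_n_bytes(args.split_max_size),  # "2G" -> 2_000_000_000
        dry_run=args.dry_run,
        small_first_shard=args.no_tensor_first_split,              # first shard holds metadata only
    )
    model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian,
                                 args.use_temp_file, args.no_lazy, args.model_name, **split_kwargs)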
@@ -3038,13 +3083,13 @@ def main() -> None:
         model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
 
         if args.vocab_only:
-            logger.info(f"Exporting model vocab to '{model_instance.fname_out}'")
+            logger.info("Exporting model vocab...")
             model_instance.write_vocab()
+            logger.info("Model vocab successfully exported.")
         else:
-            logger.info(f"Exporting model to '{model_instance.fname_out}'")
+            logger.info("Exporting model...")
             model_instance.write()
-
-        logger.info(f"Model successfully exported to '{model_instance.fname_out}'")
+            logger.info("Model successfully exported.")
 
 
 if __name__ == '__main__':