Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-30 08:42:00 +00:00)
	convert-hf : save memory with lazy evaluation (#7075)
* convert-hf : begin refactoring write_tensor
* convert : upgrade to sentencepiece v0.2.0
* convert-hf : remove unused n_dims in extra_*_tensors
* convert-hf : simplify MoE weights stacking
* convert-hf : flake8 linter doesn't like semicolons
* convert-hf : allow unusual model part names
  For example, loading `model-00001-of-00001.safetensors` now works.
* convert-hf : fix stacking MoE expert tensors
  `torch.stack` and `torch.cat` don't do the same thing (see the first sketch after this list).
* convert-hf : fix Mamba conversion
  Tested to work even with a SentencePiece-based tokenizer.
* convert : use a string for the SentencePiece tokenizer path
* convert-hf : display tensor shape
* convert-hf : convert norms to f32 by default
* convert-hf : sort model part names
  `os.listdir` is said to list files in arbitrary order. Sorting the file names should let "model-00009-of-00042.safetensors" be loaded before "model-00010-of-00042.safetensors" (see the second sketch after this list).
* convert-hf : use an ABC for Model again
  It seems Protocol can't be used as a statically type-checked ABC, because its subclasses also can't be instantiated. (why did it seem to work?)
  At least there's still a way to throw an error when forgetting to define the `model_arch` property of any registered Model subclasses.
* convert-hf : use a plain class for Model, and forbid direct instantiation
  There are no abstract methods used anyway, so using ABC isn't really necessary.
* convert-hf : more consistent formatting of cmdline args
* convert-hf : align the message logged for converted tensors
* convert-hf : fix Refact conversion
* convert-hf : save memory with lazy evaluation
* convert-hf : flake8 doesn't like lowercase L as a variable name
* convert-hf : remove einops requirement for InternLM2
* convert-hf : faster model parts loading
  Instead of pre-loading them all into a dict, iterate on the tensors in the model parts progressively as needed in Model.write_tensors.
  Conversion for some architectures relies on checking for the presence of specific tensor names, so for multi-part models, the weight map is read from the relevant json file to quickly get these names up-front.
* gguf-py : add tqdm as a dependency
  It's small, and used for a progress bar in GGUFWriter.write_tensors_to_file.
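The MoE stacking fix turns on a `torch` distinction that is easy to miss. Here is a minimal sketch of the difference (the shapes are illustrative, not taken from any real model):

```python
import torch

# torch.stack adds a new leading dimension; torch.cat merges along an
# existing one. For n_expert weight matrices of shape (n_ff, n_embd),
# only stack produces the 3D (n_expert, n_ff, n_embd) layout wanted
# for stacked expert tensors.
n_expert, n_ff, n_embd = 4, 256, 64  # illustrative sizes
experts = [torch.randn(n_ff, n_embd) for _ in range(n_expert)]

stacked = torch.stack(experts, dim=0)
assert stacked.shape == (n_expert, n_ff, n_embd)

concatenated = torch.cat(experts, dim=0)  # not equivalent: folds the experts together
assert concatenated.shape == (n_expert * n_ff, n_embd)
```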
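The part-name sorting relies on zero-padded names sorting correctly as plain strings. A hypothetical helper (the name `model_part_names` is not from convert.py) showing the idea:

```python
import os

def model_part_names(dirname: str, suffix: str = ".safetensors") -> list[str]:
    # os.listdir gives no ordering guarantee; sorted() restores numeric order,
    # since zero-padded names like "model-00009-of-00042.safetensors" sort
    # lexicographically in the same order as their part numbers.
    return sorted(name for name in os.listdir(dirname) if name.endswith(suffix))
```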
convert.py | 20 ++++++++++++--------
1 file changed, 12 insertions(+), 8 deletions(-)
```diff
@@ -284,6 +284,7 @@ class Params:
         n_experts      = None
         n_experts_used = None
         f_rope_freq_base = None
+        n_ff = None
 
         # hack to determine LLaMA v1 vs v2 vs CodeLlama
         if config.get("moe"):
@@ -308,6 +309,8 @@ class Params:
             n_experts_used = config["moe"]["num_experts_per_tok"]
             f_rope_freq_base = 1e6
 
+        assert n_ff is not None
+
         return Params(
             n_vocab          = model["tok_embeddings.weight"].shape[0],
             n_embd           = config["dim"],
```
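These two hunks make a previously implicit assumption explicit: `n_ff` starts as `None` and is asserted before `Params` is built, so a config the code can't interpret fails fast instead of producing a malformed result. A standalone sketch of the same fail-fast pattern (the config key below is hypothetical, not convert.py's actual lookup):

```python
# Illustrative only: a value that is merely sometimes derivable from the
# config starts as None and is checked before use.
config = {"hidden_dim": 11008}  # hypothetical key and value

n_ff = None
if "hidden_dim" in config:
    n_ff = config["hidden_dim"]

assert n_ff is not None, "n_ff could not be determined from the model config"
```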
```diff
@@ -462,7 +465,8 @@ class SentencePieceVocab(Vocab):
             # not found in alternate location either
             raise FileNotFoundError('Cannot find tokenizer.model')
 
-        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+        self.sentencepiece_tokenizer = SentencePieceProcessor()
+        self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer))
         vocab_size = self.sentencepiece_tokenizer.vocab_size()
 
         new_tokens       = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
@@ -482,23 +486,23 @@ class SentencePieceVocab(Vocab):
     def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         tokenizer = self.sentencepiece_tokenizer
         for i in range(tokenizer.vocab_size()):
-            piece = tokenizer.id_to_piece(i)
+            piece = tokenizer.IdToPiece(i)
             text         = piece.encode("utf-8")
-            score: float = tokenizer.get_score(i)
+            score: float = tokenizer.GetScore(i)
 
             toktype = gguf.TokenType.NORMAL
-            if tokenizer.is_unknown(i):
+            if tokenizer.IsUnknown(i):
                 toktype = gguf.TokenType.UNKNOWN
-            if tokenizer.is_control(i):
+            if tokenizer.IsControl(i):
                 toktype = gguf.TokenType.CONTROL
 
             # NOTE: I think added_tokens are user defined.
             # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
             # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
 
-            if tokenizer.is_unused(i):
+            if tokenizer.IsUnused(i):
                 toktype = gguf.TokenType.UNUSED
-            if tokenizer.is_byte(i):
+            if tokenizer.IsByte(i):
                 toktype = gguf.TokenType.BYTE
 
             yield text, score, toktype
```
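The two hunks above track the sentencepiece v0.2.0 API, loading the model through `LoadFromFile` and switching to the CamelCase accessors. A minimal standalone sketch of the same calls (the model path is a placeholder):

```python
from sentencepiece import SentencePieceProcessor

sp = SentencePieceProcessor()
sp.LoadFromFile("tokenizer.model")  # placeholder path; replaces SentencePieceProcessor(path)

for i in range(sp.vocab_size()):
    piece = sp.IdToPiece(i)  # was id_to_piece
    score = sp.GetScore(i)   # was get_score
    # token classification, as in the hunk above:
    is_special = sp.IsUnknown(i) or sp.IsControl(i) or sp.IsUnused(i) or sp.IsByte(i)
```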
```diff
@@ -906,7 +910,7 @@ class LazyUnpickler(pickle.Unpickler):
     def rebuild_from_type_v2(func, new_type, args, state):
         return func(*args)
 
-    CLASSES = {
+    CLASSES: dict[tuple[str, str], type[LazyTensor] | LazyStorageKind] = {
         # getattr used here as a workaround for mypy not being smart enough to determine
         # the staticmethods have a __func__ attribute.
         ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
```
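The `CLASSES` annotation above belongs to the lazy-evaluation machinery that gives this commit its title. What follows is not convert.py's actual `LazyTensor`, just a minimal sketch of the idea: record how to produce a tensor instead of materializing it, so only the tensor currently being written has to be resident in memory.

```python
from typing import Callable
import torch

class LazyTensor:
    def __init__(self, load: Callable[[], torch.Tensor]):
        self._load = load  # deferred producer; nothing is allocated yet

    def map(self, fn: Callable[[torch.Tensor], torch.Tensor]) -> "LazyTensor":
        # Compose a transformation without running it.
        return LazyTensor(lambda: fn(self._load()))

    def materialize(self) -> torch.Tensor:
        return self._load()

# Usage: allocation happens only at materialize() time.
lazy = LazyTensor(lambda: torch.ones(8, 8)).map(lambda t: t.to(torch.float16))
tensor = lazy.materialize()  # shape (8, 8), dtype torch.float16
```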
Author: compilade