	gguf-py, convert-hf : model conversion support for T5 and FLAN-T5 model variants (#5763)
* gguf-py : add T5 model architecture
* gguf-py : add separate tensors for encoder and decoder
* gguf-py : add new model header parameters: decoder_start_token_id, attention.relative_buckets_count, tokenizer.ggml.remove_extra_whitespaces, tokenizer.ggml.precompiled_charsmap
* convert-hf : add model conversion support for T5ForConditionalGeneration and T5WithLMHeadModel

---------

Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
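The four new header parameters land in the converted file as ordinary GGUF key/value metadata, so a converted model can be sanity-checked with gguf-py's GGUFReader. The snippet below is a minimal sketch, not part of this commit; the "t5." arch prefix on the first two keys and the output file name are assumptions.

    from gguf import GGUFReader

    reader = GGUFReader("t5-base.gguf")  # hypothetical converter output

    for key in (
        "t5.decoder_start_token_id",
        "t5.attention.relative_buckets_count",
        "tokenizer.ggml.remove_extra_whitespaces",
        "tokenizer.ggml.precompiled_charsmap",
    ):
        field = reader.fields.get(key)
        if field is None:
            print(f"{key}: <missing>")
            continue
        part = field.parts[field.data[0]]
        # scalar keys decode to one-element arrays; the charsmap is a raw
        # blob, so report only its size
        print(f"{key}: {part[0] if len(part) == 1 else f'{len(part)} bytes'}")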
@@ -80,7 +80,7 @@ class Model:
         if not self.is_safetensors:
             self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
         self.hparams = Model.load_hparams(self.dir_model)
-        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
+        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
         self.tensor_names = None
         if self.ftype == gguf.LlamaFileType.GUESSED:
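T5 configs store the block count under "num_layers", which none of the existing aliases covered, so the lookup above gains that name. A standalone sketch of the first-match-wins pattern (the real find_hparam is a Model method; this free function is only illustrative):

    def find_hparam(hparams: dict, keys: list[str]):
        # return the value of the first hyperparameter name that is present
        for key in keys:
            if key in hparams:
                return hparams[key]
        raise KeyError(f"could not find any of: {keys}")

    # a T5 config.json names its block count "num_layers"
    assert find_hparam({"num_layers": 6, "d_model": 512},
                       ["n_layers", "num_hidden_layers", "n_layer", "num_layers"]) == 6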
@@ -2771,6 +2771,124 @@ class DeepseekV2Model(Model):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@Model.register("T5ForConditionalGeneration")
+@Model.register("T5WithLMHeadModel")
+class T5Model(Model):
+    model_arch = gguf.MODEL_ARCH.T5
+
+    def set_vocab(self):
+        # to avoid TypeError: Descriptors cannot be created directly
+        # exception when importing sentencepiece_model_pb2
+        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+        from sentencepiece import SentencePieceProcessor
+        from sentencepiece import sentencepiece_model_pb2 as model
+
+        tokenizer_path = self.dir_model / 'spiece.model'
+
+        if not tokenizer_path.is_file():
+            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+        sentencepiece_model = model.ModelProto()
+        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+        assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        added_tokens_file = self.dir_model / 'added_tokens.json'
+        if added_tokens_file.is_file():
+            with open(added_tokens_file, "r", encoding="utf-8") as f:
+                added_tokens_json = json.load(f)
+                for key in added_tokens_json:
+                    token_id = added_tokens_json[key]
+                    if (token_id >= vocab_size):
+                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
+
+                    tokens[token_id] = key.encode("utf-8")
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("t5")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+        if precompiled_charsmap:
+            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+        self.gguf_writer.add_add_bos_token(False)
+        self.gguf_writer.add_add_eos_token(True)
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_name("T5")
+        self.gguf_writer.add_context_length(self.hparams["n_positions"])
+        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(self.hparams["num_heads"])
+        self.gguf_writer.add_key_length(self.hparams["d_kv"])
+        self.gguf_writer.add_value_length(self.hparams["d_kv"])
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # Sometimes T5 and Flan-T5 based models contain "encoder.embed_tokens.weight" tensor or
+        # "decoder.embed_tokens.weight" tensors that are duplicates of "shared.weight" tensor
+        # To prevent errors caused by an unnecessary unmapped tensor, skip both of them and use only "shared.weight".
+        if name == "decoder.embed_tokens.weight" or name == "encoder.embed_tokens.weight":
+            logger.debug(f"Skipping tensor {name!r} in safetensors so that convert can end normally.")
+            return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 ###### CONVERSION LOGIC ######
 
 
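The embed-tokens skip in modify_tensors is safe because stock T5 and FLAN-T5 checkpoints tie both stacks' embeddings to "shared.weight". A hedged way to confirm the tie for a given checkpoint (assumes torch and transformers are installed; "t5-small" is only an example model id):

    import torch
    from transformers import T5ForConditionalGeneration

    model = T5ForConditionalGeneration.from_pretrained("t5-small")
    sd = model.state_dict()

    # both per-stack embedding tensors should equal the shared matrix,
    # which is why the converter keeps only "shared.weight"
    for name in ("encoder.embed_tokens.weight", "decoder.embed_tokens.weight"):
        print(name, torch.equal(sd[name], sd["shared.weight"]))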
Author: fairydreaming