Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)
	Fix gemma2 tokenizer convert (#8244)
* fix gemma2 tokenizer convert
* remove scores
* improve code, fix new line issue
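In short: the converter stops importing Gemma-2's vocab through _set_vocab_llama_hf and instead builds it straight from tokenizer.model via a new _create_vocab_sentencepiece helper, then forces the first 108 token IDs to CONTROL so the start/end-of-turn markers work with the chat template. A hypothetical reconversion command (the model path and output name are placeholders; the script and its --outfile/--outtype flags are this repo's converter):

    python convert_hf_to_gguf.py /path/to/gemma-2-9b-it --outfile gemma-2-9b-it.gguf --outtype f16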
@@ -576,7 +576,19 @@ class Model:
         special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
         special_vocab.add_to_gguf(self.gguf_writer)
 
-    def _set_vocab_sentencepiece(self):
+    def _set_vocab_sentencepiece(self, add_to_gguf=True):
+        tokens, scores, toktypes = self._create_vocab_sentencepiece()
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def _create_vocab_sentencepiece(self):
         from sentencepiece import SentencePieceProcessor
 
         tokenizer_path = self.dir_model / 'tokenizer.model'
@@ -638,14 +650,7 @@ class Model:
                 scores.append(-1000.0)
                 toktypes.append(SentencePieceTokenTypes.UNUSED)
 
-        self.gguf_writer.add_tokenizer_model("llama")
-        self.gguf_writer.add_tokenizer_pre("default")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
+        return tokens, scores, toktypes
 
     def _set_vocab_llama_hf(self):
         vocab = gguf.LlamaHfVocab(self.dir_model)
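These first two hunks are a pure refactor: _create_vocab_sentencepiece builds tokens/scores/toktypes and returns them, while _set_vocab_sentencepiece keeps only the GGUF-writing half, so subclasses can post-process token types before anything is written. The type codes involved mirror gguf.TokenType; a minimal check (assumes the gguf Python package, which this repo ships):

    from gguf import TokenType

    # Integer codes stored under tokenizer.ggml.token_type in the GGUF file.
    print(int(TokenType.UNUSED))   # 5 - the padding slots filled above
    print(int(TokenType.CONTROL))  # 3 - what the Gemma2 hunk below forces for IDs 0..107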
@@ -2345,7 +2350,19 @@ class Gemma2Model(Model):
     model_arch = gguf.MODEL_ARCH.GEMMA2
 
     def set_vocab(self):
-        self._set_vocab_llama_hf()
+        tokens, scores, toktypes = self._create_vocab_sentencepiece()
+        # hack: This is required so that we can properly use start/end-of-turn for chat template
+        for i in range(108):
+            # including <unusedX>, <start_of_turn>, <end_of_turn>
+            toktypes[i] = SentencePieceTokenTypes.CONTROL
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+        self.gguf_writer.add_add_space_prefix(False)
 
     def set_gguf_parameters(self):
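Why the range(108) hack is needed: the raw Gemma-2 tokenizer.model evidently does not flag the turn markers as control pieces (otherwise the override would be unnecessary), so _create_vocab_sentencepiece alone would leave them non-special and llama.cpp would split them during chat-template tokenization. A minimal sketch to inspect this, assuming a local Gemma-2 tokenizer.model and assuming IDs 106/107 are <start_of_turn>/<end_of_turn> (consistent with range(108) covering them):

    from sentencepiece import SentencePieceProcessor

    sp = SentencePieceProcessor()
    sp.Load("tokenizer.model")  # path to the Gemma-2 SentencePiece model
    for i in (106, 107):
        # Expected: the pieces print as <start_of_turn>/<end_of_turn> while
        # is_control() reports False - the gap that set_vocab() papers over.
        print(i, sp.id_to_piece(i), sp.is_control(i))

After reconverting, the resulting GGUF carries CONTROL for these IDs, so the turn markers tokenize as single special tokens.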
Xuan Son Nguyen