mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-31 08:51:55 +00:00 
			
		
		
		
	llama : lookup word in vocab before doing BPE merges (#7193)
* fix: llama-3 ignore_merges
* test: add test for llama-3 bpe ignore_merges
* fix: set ignore_merges only for llama-3
* fix: test-tokenizer-1-bpe --ignore-merges detection
* fix: copy to fix fallthrough
* fix: change ignore_merges to bool
* fix: add ignore merges tests to cmake
* llama : alternative merge ignore logic
---------
Co-authored-by: Haoxiang Fei <feihaoxiang@idea.edu.cn>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
		
							
								
								
									
										14
									
								
								llama.cpp
									
									
									
									
									
								
							
							
						
						
									
										14
									
								
								llama.cpp
									
									
									
									
									
								
							| @@ -12253,13 +12253,14 @@ struct llm_tokenizer_bpe { | ||||
|  | ||||
|     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) { | ||||
|         int final_prev_index = -1; | ||||
|         bool ignore_merges = false; | ||||
|  | ||||
|         std::vector<std::string> word_collection; | ||||
|         switch (vocab.type) { | ||||
|             case LLAMA_VOCAB_TYPE_BPE: | ||||
|                 switch (vocab.type_pre) { | ||||
|                     case LLAMA_VOCAB_PRE_TYPE_LLAMA3: | ||||
|                     case LLAMA_VOCAB_PRE_TYPE_DBRX: | ||||
|                         ignore_merges = true; | ||||
|                         word_collection = unicode_regex_split(text, { | ||||
|                             // original regex from tokenizer.json | ||||
|                             //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", | ||||
| @@ -12268,6 +12269,12 @@ struct llm_tokenizer_bpe { | ||||
|                             "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", | ||||
|                         }); | ||||
|                         break; | ||||
|                     case LLAMA_VOCAB_PRE_TYPE_DBRX: | ||||
|                         word_collection = unicode_regex_split(text, { | ||||
|                             // same as llama3 | ||||
|                             "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", | ||||
|                         }); | ||||
|                         break; | ||||
|                     case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: | ||||
|                         word_collection = unicode_regex_split(text, { | ||||
|                             "[\r\n]", | ||||
| @@ -12351,6 +12358,11 @@ struct llm_tokenizer_bpe { | ||||
|             int index = 0; | ||||
|             size_t offset = 0; | ||||
|  | ||||
|             if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) { | ||||
|                 symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()}); | ||||
|                 offset = word.size(); | ||||
|             } | ||||
|  | ||||
|             while (offset < word.size()) { | ||||
|                 llm_symbol sym; | ||||
|                 size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset])); | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Haoxiang Fei
					Haoxiang Fei