mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	10X faster BPE tokenizer (#2876)
* 10X faster BPE tokenizer
* Remove comment that no longer applies

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
		
							
								
								
									
										19
									
								
								llama.cpp
									
									
									
									
									
								
							
							
						
						
									
										19
									
								
								llama.cpp
									
									
									
									
									
								
							| @@ -3211,7 +3211,7 @@ private: | |||||||
|  |  | ||||||
| struct llm_bigram_bpe { | struct llm_bigram_bpe { | ||||||
|     struct comparator { |     struct comparator { | ||||||
|         bool operator()(llm_bigram_bpe & l, llm_bigram_bpe & r) { |         bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const { | ||||||
|             return l.rank > r.rank || (l.rank == r.rank && l.left > r.left); |             return l.rank > r.rank || (l.rank == r.rank && l.left > r.left); | ||||||
|         } |         } | ||||||
|     }; |     }; | ||||||
| @@ -3359,23 +3359,22 @@ private: | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     // probably not 100% correct |     // probably not 100% correct | ||||||
|     // TODO: this is quite slow - how to make it more efficient? |     static std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) { | ||||||
|     static std::vector<std::string> bpe_gpt2_preprocess(std::string text) { |  | ||||||
|         std::vector<std::string> words; |         std::vector<std::string> words; | ||||||
|  |  | ||||||
|         // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53 |         // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53 | ||||||
|         const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"; |         const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"; | ||||||
|         const std::regex re(pattern); |         const std::regex re(pattern); | ||||||
|         std::smatch m; |  | ||||||
|  |  | ||||||
|         while (std::regex_search(text, m, re)) { |         auto words_begin = std::sregex_iterator(text.begin(), text.end(), re); | ||||||
|             for (auto x : m) { |         auto words_end = std::sregex_iterator(); | ||||||
|                 words.push_back(x); |         auto n_words = std::distance(words_begin, words_end); | ||||||
|  |         words.reserve(n_words); | ||||||
|  |         for (auto it = words_begin; it != words_end; ++it) { | ||||||
|  |             words.push_back(it->str()); | ||||||
|         } |         } | ||||||
|             text = m.suffix(); |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         return words; |         return words; | ||||||
|  |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     const llama_vocab & vocab; |     const llama_vocab & vocab; | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Kawrakow
					Kawrakow