mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-31 08:51:55 +00:00 
			
		
		
		
	unicode : switch to multimap based nfd_map (#5799)
* switch to multimap-based nfd_map due to compile-time issues * simplify multimap keys * don't construct a new locale every time
This commit is contained in:
		
							
								
								
									
										11
									
								
								llama.cpp
									
									
									
									
									
								
							
							
						
						
									
										11
									
								
								llama.cpp
									
									
									
									
									
								
							| @@ -8947,10 +8947,10 @@ struct llm_tokenizer_wpm { | ||||
|         std::vector<uint32_t> codepoints = codepoints_from_utf8(text); | ||||
|         std::vector<uint32_t> nfd_codepoints; | ||||
|         for (uint32_t code : codepoints) { | ||||
|-            auto it = nfd_map.find(code); | ||||
|-            if (it != nfd_map.end()) { | ||||
|-                for (uint32_t c : it->second) { | ||||
|-                    nfd_codepoints.push_back(c); | ||||
|+            auto it = nfd_map.equal_range(code); | ||||
|+            if (it.first != it.second) { | ||||
|+                for (auto jt = it.first; jt != it.second; jt++) { | ||||
|+                    nfd_codepoints.push_back(jt->second); | ||||
|                 } | ||||
|             } else { | ||||
|                 nfd_codepoints.push_back(code); | ||||
| @@ -9001,12 +9001,13 @@ struct llm_tokenizer_wpm { | ||||
|     } | ||||
|  | ||||
|     uint32_t to_lower(uint32_t code) { | ||||
|+        static const std::locale locale("en_US.UTF-8"); | ||||
| #if defined(_WIN32) | ||||
|         if (code > 0xFFFF) { | ||||
|             return code; | ||||
|         } | ||||
| #endif | ||||
|-        return std::tolower(wchar_t(code), std::locale("en_US.UTF-8")); | ||||
|+        return std::tolower(wchar_t(code), locale); | ||||
|     } | ||||
|  | ||||
|     bool is_ascii_punct(uint32_t code) { | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Douglas Hanley
					Douglas Hanley