mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	vocab : ignore invalid UTF-8 input in the BPE tokenizer (#11729)
Silently insert U+FFFD(s) (Unicode replacement character) in place of invalid bytes until the next valid codepoint can be found. This fixes `llama_tokenize` throwing an exception across the C API boundary or libllama's module boundary (the caller's runtime might be incompatible!). Returning a proper error code might be desirable; however, the signature of `llama_tokenize` doesn't allow it, as all return values already have existing meaning.
This commit is contained in:
		 Christian Fillion
					Christian Fillion
				
			
				
					committed by
					
						 GitHub
						GitHub
					
				
			
			
				
	
			
			
			 GitHub
						GitHub
					
				
			
						parent
						
							333820d749
						
					
				
				
					commit
					2d219b389e
				
			| @@ -618,7 +618,14 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) { | ||||
|     result.reserve(utf8.size()); | ||||
|     size_t offset = 0; | ||||
|     while (offset < utf8.size()) { | ||||
|         result.push_back(unicode_cpt_from_utf8(utf8, offset)); | ||||
|         try { | ||||
|             result.push_back(unicode_cpt_from_utf8(utf8, offset)); | ||||
|         } | ||||
|         catch (const std::invalid_argument & /*ex*/) { | ||||
|             // Silently ignore invalid UTF-8 input to avoid leaking the exception beyond llama_tokenize | ||||
|             ++offset; | ||||
|             result.emplace_back(0xFFFD); // replacement character | ||||
|         } | ||||
|     } | ||||
|     return result; | ||||
| } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user