Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-30 08:42:00 +00:00)
	vocab : prevent tokenizer overflow (#14301)
* vocab : prevent stack overflow in tokenize
* vocab : return error instead of aborting on oversized token count
* vocab : return INT32_MIN from llama_tokenize on overflow
common/common.cpp
@@ -1290,6 +1290,9 @@ std::vector<llama_token> common_tokenize(
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
     n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    if (n_tokens == std::numeric_limits<int32_t>::min()) {
+        throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
+    }
     if (n_tokens < 0) {
         result.resize(-n_tokens);
         int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
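With this change, an oversized input surfaces as a std::runtime_error from common_tokenize instead of aborting the process. Note that the check must come before the `n_tokens < 0` branch: negating INT32_MIN in `result.resize(-n_tokens)` would itself be undefined behavior. A minimal caller-side sketch (tokenize_or_empty is a hypothetical helper, not part of the library; it assumes a vocab obtained from a loaded model, e.g. via llama_model_get_vocab):

    #include <cstdio>
    #include <stdexcept>
    #include <string>
    #include <vector>

    #include "common.h" // common_tokenize, from llama.cpp's common library

    // Hypothetical wrapper: tokenize user input, mapping the new overflow
    // exception to an empty result instead of crashing.
    static std::vector<llama_token> tokenize_or_empty(const llama_vocab * vocab,
                                                      const std::string & text) {
        try {
            return common_tokenize(vocab, text, /*add_special=*/true, /*parse_special=*/false);
        } catch (const std::runtime_error & err) {
            // Reached when llama_tokenize reports INT32_MIN (see diff above).
            fprintf(stderr, "tokenization failed: %s\n", err.what());
            return {};
        }
    }

Throwing keeps the existing common_tokenize signature intact while still letting callers distinguish overflow from the ordinary resize-and-retry path.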
include/llama.h
@@ -1088,6 +1088,7 @@ extern "C" {
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
     /// @return Returns the number of tokens on success, no more than n_tokens_max
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
+    /// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit)
     /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
     /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
     ///                      as plaintext. Does not insert a leading space.
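The header now documents three distinct outcomes for llama_tokenize: a non-negative token count on success, a negative required-size hint when the buffer is too small, and INT32_MIN on overflow. A sketch of the resulting call pattern (tokenize_two_pass is a hypothetical helper, not part of the API; it assumes a valid vocab pointer):

    #include <climits>
    #include <string>
    #include <vector>

    #include "llama.h"

    // Hypothetical helper showing the documented return contract.
    static bool tokenize_two_pass(const llama_vocab * vocab, const std::string & text,
                                  std::vector<llama_token> & out) {
        out.resize(16); // deliberately small first guess
        int32_t n = llama_tokenize(vocab, text.c_str(), (int32_t) text.size(),
                                   out.data(), (int32_t) out.size(),
                                   /*add_special=*/true, /*parse_special=*/false);
        if (n == INT32_MIN) {
            return false; // overflow: the result cannot be represented in int32_t
        }
        if (n < 0) {
            out.resize(-n); // negative return carries the required capacity
            n = llama_tokenize(vocab, text.c_str(), (int32_t) text.size(),
                               out.data(), (int32_t) out.size(),
                               /*add_special=*/true, /*parse_special=*/false);
        }
        if (n < 0) {
            return false; // unexpected failure on the retry
        }
        out.resize(n);
        return true;
    }

Before this patch, a negative return was the only failure signal, so INT32_MIN could have been misread as a hugely negative required size; reserving it as a sentinel removes that ambiguity.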
src/llama-vocab.cpp
@@ -3074,6 +3074,11 @@ int32_t llama_vocab::tokenize(
                         bool   add_special,
                         bool   parse_special) const {
     auto res = tokenize(std::string(text, text_len), add_special, parse_special);
+    if (res.size() >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
+        LLAMA_LOG_ERROR("%s: tokenization result size %zu exceeds int32_t limit\n", __func__, res.size());
+        return std::numeric_limits<int32_t>::min();
+    }
+
     if (n_tokens_max < (int) res.size()) {
         // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
         return -((int) res.size());
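At bottom, the guard is a checked narrowing from size_t to int32_t, with INT32_MIN reserved as the error sentinel. A standalone sketch of the same pattern (narrow_token_count is a hypothetical name, not in the patch):

    #include <cstddef>
    #include <cstdint>
    #include <limits>

    // Illustration of the guard above: narrow a size_t count to int32_t,
    // returning INT32_MIN when the count cannot be represented.
    static int32_t narrow_token_count(size_t n_tokens) {
        if (n_tokens >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
            return std::numeric_limits<int32_t>::min(); // overflow sentinel
        }
        return static_cast<int32_t>(n_tokens);
    }

Note the comparison is >=, not >: rejecting INT32_MAX itself keeps every accepted count strictly below the limit, so both the positive count and the negated form returned by the too-many-tokens branch remain valid int32_t values.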
Author: Ruikai Peng