Mirror of https://github.com/ggml-org/llama.cpp.git, synced 2025-10-31 08:51:55 +00:00.
			
		
		
		
llama : allow raw byte in SPM vocabs; don't crash on nl 404 (#5478)

* common : don't crash if the newline token is not found
* common : llama_byte_to_token: allow falling back to finding just the token byte in SPM vocabs
This commit is contained in:
		
							
								
								
									
										15
									
								
								llama.cpp
									
									
									
									
									
								
							
							
						
						
									
										15
									
								
								llama.cpp
									
									
									
									
									
								
							| @@ -3314,7 +3314,12 @@ static void llm_load_vocab( | ||||
|  | ||||
|     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n' | ||||
|     if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { | ||||
|         vocab.linefeed_id = llama_byte_to_token(vocab, '\n'); | ||||
|         try { | ||||
|             vocab.linefeed_id = llama_byte_to_token(vocab, '\n'); | ||||
|         } catch (const std::exception & e) { | ||||
|             LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what()); | ||||
|             vocab.linefeed_id = vocab.special_pad_id; | ||||
|         } | ||||
|     } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) { | ||||
|         vocab.linefeed_id = vocab.special_pad_id; | ||||
|     } else { | ||||
| @@ -7746,7 +7751,13 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) { | ||||
|     switch (llama_vocab_get_type(vocab)) { | ||||
|         case LLAMA_VOCAB_TYPE_SPM: { | ||||
|             const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 }; | ||||
|             return vocab.token_to_id.at(buf); | ||||
|             auto token = vocab.token_to_id.find(buf); | ||||
|             if (token != vocab.token_to_id.end()) { | ||||
|                 return (*token).second; | ||||
|             } | ||||
|             // Try to fall back to just the byte as a string | ||||
|             const char buf2[2] = { (char)ch, 0 }; | ||||
|             return vocab.token_to_id.at(buf2); | ||||
|         } | ||||
|         case LLAMA_VOCAB_TYPE_WPM: | ||||
|         case LLAMA_VOCAB_TYPE_BPE: { | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Aarni Koskela
					Aarni Koskela