If the first token generated by the server is the stop word, the server will crash (#7038)
The following request reproduces the issue with llama-13b:
{
    'prompt': 'Q: hello world \nA: ',
    'stop': ['\n'],
    'temperature': 0.0,
    'n_predict': 10,
    'cache_prompt': True,
    'n_probs': 10
}
			
			
@@ -1383,9 +1383,10 @@ struct server_context {
             if (!slot.params.stream && slot.stopped_word) {
                 const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
 
+                size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size());
                 probs = std::vector<completion_token_output>(
                         slot.generated_token_probs.begin(),
-                        slot.generated_token_probs.end() - stop_word_toks.size());
+                        slot.generated_token_probs.end() - safe_offset);
             } else {
                 probs = std::vector<completion_token_output>(
                         slot.generated_token_probs.begin(),
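The crash comes from the iterator arithmetic on the removed line: when the very first generated token already matches the stop word, slot.generated_token_probs holds fewer entries than the tokenized stop word, so end() - stop_word_toks.size() would move the iterator before begin(), which is undefined behavior. The sketch below is not the server code itself; it is a minimal standalone illustration of the clamp, with llama_token and completion_token_output stood in by plain int and made-up token values:

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    int main() {
        // Suppose only one token was generated before the stop word matched ...
        std::vector<int> generated_token_probs = {42};
        // ... while the stop word tokenizes to two tokens (illustrative values only).
        std::vector<int> stop_word_toks = {13, 13};

        // Old behaviour: end() - stop_word_toks.size() on a vector of size 1 moves
        // the iterator before begin() -- undefined behavior, hence the crash.

        // Fixed behaviour: never step back further than the number of tokens
        // that were actually generated.
        const std::size_t safe_offset = std::min(generated_token_probs.size(), stop_word_toks.size());

        std::vector<int> probs(generated_token_probs.begin(),
                               generated_token_probs.end() - safe_offset);

        // Here probs ends up empty, which is the worst case after the fix.
        std::printf("kept %zu token(s) after trimming the stop word\n", probs.size());
        return 0;
    }

With the clamp in place, the worst case is an empty probs vector rather than out-of-range iterator arithmetic, so the request from the bug report no longer brings the server down.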
			