	chat: handle gpt-oss return/end token inconsistency (#15421)
This commit addresses an inconsistency during inference by adding a new member to the `templates_params` struct to indicate whether the chat is in inference mode. This allows the gpt-oss specific function `common_chat_params_init_gpt_oss` to check this flag together with the `add_generation_prompt` flag to determine whether it should replace the `<|return|>` token with the `<|end|>` token in the prompt.

The motivation for this change is to ensure that the formatted prompt of past messages in `common_chat_format_single` matches the output of the formatted new message. The issue is that the gpt-oss template returns different end tags: `<|return|>` when `add_generation_prompt` is false, and `<|end|>` when `add_generation_prompt` is true. This causes the substring function to start at an incorrect position, so tokenization starts with 'tart|>' instead of '<|start|>'.

Resolves: https://github.com/ggml-org/llama.cpp/issues/15417
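For context, here is a small self-contained sketch of why the differing end tags break the substring step. This is not the actual `common_chat_format_single` code, and the prompt strings are made up and far simpler than what the gpt-oss template really emits; it only illustrates the offset mismatch.

    // Sketch: the past messages were formatted on their own (ending in <|return|>),
    // but when re-rendered as part of a longer conversation they end in <|end|>.
    // Taking the suffix at the old length then lands mid-token.
    #include <iostream>
    #include <string>

    int main() {
        // Past messages formatted alone (add_generation_prompt == false -> <|return|>).
        std::string formatted_past =
            "<|start|>user<|message|>hi<|return|>";

        // Past + new message; the past part is now rendered with <|end|>,
        // which is 3 characters shorter than <|return|>.
        std::string formatted_all =
            "<|start|>user<|message|>hi<|end|>"
            "<|start|>assistant<|message|>hello<|end|>";

        // The suffix no longer starts on a token boundary:
        // it begins with "tart|>" instead of "<|start|>".
        std::string new_part = formatted_all.substr(formatted_past.size());
        std::cout << new_part << "\n";
    }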
@@ -147,6 +147,7 @@ struct templates_params {
     json extra_context;
     bool add_bos;
     bool add_eos;
+    bool is_inference = true;
 };
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -1336,6 +1337,17 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
     common_chat_params data;
     auto prompt = apply(tmpl, inputs);
 
+    // Check if we need to replace the return token with end token during
+    // inference and without generation prompt. For more details see:
+    // https://github.com/ggml-org/llama.cpp/issues/15417
+    if (inputs.is_inference && !inputs.add_generation_prompt) {
+        static constexpr std::string_view return_token = "<|return|>";
+        static constexpr std::string_view end_token    = "<|end|>";
+        if (size_t pos = prompt.rfind(return_token); pos != std::string::npos) {
+            prompt.replace(pos, return_token.length(), end_token);
+        }
+    }
+
     data.prompt = prompt;
     data.format = COMMON_CHAT_FORMAT_GPT_OSS;
 
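As a quick sanity check, the same replacement can be exercised in isolation. The prompt literal below is a made-up stand-in for the template output; the real gpt-oss prompt also contains system and channel segments. Using `rfind` means only the final `<|return|>` (the tag closing the last message) is touched, and earlier `<|end|>` tags are left alone.

    #include <iostream>
    #include <string>
    #include <string_view>

    int main() {
        std::string prompt =
            "<|start|>user<|message|>hi<|end|>"
            "<|start|>assistant<|message|>hello<|return|>";

        static constexpr std::string_view return_token = "<|return|>";
        static constexpr std::string_view end_token    = "<|end|>";

        // Replace only the trailing return token, mirroring the change above.
        if (size_t pos = prompt.rfind(return_token); pos != std::string::npos) {
            prompt.replace(pos, return_token.length(), end_token);
        }

        // Now every message, including the last one, ends with <|end|>.
        std::cout << prompt << "\n";
    }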
Daniel Bevenius