Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-27 08:21:30 +00:00)
	Thinking model disabled assistant prefill (#15404)
* feat: Set enable_thinking IFF not disabled and supported

* fix: Fix inverted logic condition for prefill error

* fix: Always parse the enable_thinking kwarg to overwrite the default value

  From what I can tell, this started as a Qwen3-specific keyword, but since `chat.cpp` translates inputs.enable_thinking to the right thinking kwarg for the given model, it is now more of a standardized kwarg, so it should always override the default value when sent as part of the chat_template_kwargs field in the API.

* fix: Don't limit the template expansion check to jinja

  With the use_jinja check, non-jinja models would enable thinking and always fail assistant prefill.

* feat: Add the error text to json type errors in json_value

* feat: Explicitly reject string values for "enable_thinking"

  There are too many possible "truthy" / "falsy" strings and too many ambiguous strings that don't have a clear truthy/falsy value, so the simplest thing to do here is to reject the request. Ideally this would be a 422 (Unprocessable Entity), but right now it comes back as a 500.

* refactor: Move logic for detecting template enable_thinking support to common

* fix: Use raw pointer for common chat template function

Branch: gabe-l-hart/thinking-model-disabled-agent-prefill
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
@@ -163,6 +163,19 @@ common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::strin
     throw std::runtime_error("Invalid tool_choice: " + tool_choice);
 }
 
+bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates) {
+    common_chat_templates_inputs dummy_inputs;
+    common_chat_msg msg;
+    msg.role = "user";
+    msg.content = "test";
+    dummy_inputs.messages = {msg};
+    dummy_inputs.enable_thinking = false;
+    const auto rendered_no_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
+    dummy_inputs.enable_thinking = true;
+    const auto rendered_with_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
+    return rendered_no_thinking.prompt != rendered_with_thinking.prompt;
+}
+
 template <>
 std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messages) {
     std::vector<common_chat_msg> msgs;
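The detection above is a behavioral probe rather than a scan of the template text: render the same one-message dummy conversation twice, flipping only enable_thinking, and treat any difference in the rendered prompt as proof that the template consumes the flag. Below is a minimal self-contained sketch of the same idea; the render() helper and its template strings are invented for illustration and stand in for common_chat_templates_apply.

#include <iostream>
#include <string>

// Toy stand-in for chat template expansion. A thinking-aware template
// changes its output depending on the flag, e.g. by emitting an empty
// think block when thinking is turned off (as some Qwen3 templates do).
static std::string render(bool template_reads_flag, bool enable_thinking) {
    std::string prompt = "<|user|>test<|assistant|>";
    if (template_reads_flag && !enable_thinking) {
        prompt += "<think></think>";
    }
    return prompt;
}

static bool supports_enable_thinking(bool template_reads_flag) {
    // Probe twice, flipping only the flag; any difference in the rendered
    // prompt means the template actually reacts to enable_thinking.
    return render(template_reads_flag, false) != render(template_reads_flag, true);
}

int main() {
    std::cout << supports_enable_thinking(true)  << "\n"; // 1: flag matters
    std::cout << supports_enable_thinking(false) << "\n"; // 0: flag is a no-op
}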
@@ -199,6 +199,8 @@ common_chat_msg           common_chat_parse(const std::string & input, bool is_p
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
 
+bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates);
+
 // Parses a JSON array of messages in OpenAI's chat completion API format.
 // T can be std::string containing JSON or nlohmann::ordered_json
 template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);
@@ -2267,6 +2267,12 @@ struct server_context {
 
         metrics.init();
 
+        // thinking is enabled if:
+        // 1. It's not explicitly disabled (reasoning_budget == 0)
+        // 2. The chat template supports it
+        const bool enable_thinking = params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
+        SRV_INF("Enable thinking? %d\n", enable_thinking);
+
         oai_parser_opt = {
             /* use_jinja             */ params_base.use_jinja,
             /* prefill_assistant     */ params_base.prefill_assistant,
@@ -2275,7 +2281,7 @@ struct server_context {
             /* common_chat_templates */ chat_templates.get(),
             /* allow_image           */ mctx ? mtmd_support_vision(mctx) : false,
             /* allow_audio           */ mctx ? mtmd_support_audio (mctx) : false,
-            /* enable_thinking       */ params_base.reasoning_budget != 0,
+            /* enable_thinking       */ enable_thinking,
         };
     }
 
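With this change the server computes the flag once at startup instead of assuming every template is thinking-capable; per the commit message, that assumption made non-jinja models "enable thinking and always fail assistant prefill". A one-line restatement of the gate (the function name is illustrative, mirroring the diff):

// Thinking stays enabled only when the user did not turn it off
// (--reasoning-budget 0 disables it) AND the template reacts to the flag.
static bool resolve_enable_thinking(int reasoning_budget, bool template_supports_flag) {
    return reasoning_budget != 0 && template_supports_flag;
}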
@@ -54,8 +54,8 @@ static T json_value(const json & body, const std::string & key, const T & defaul
     if (body.contains(key) && !body.at(key).is_null()) {
         try {
             return body.at(key);
-        } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) {
-            LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value\n", key.c_str(), json(default_value).type_name());
+        } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const & err) {
+            LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value: %s\n", key.c_str(), json(default_value).type_name(), err.what());
             return default_value;
         }
     } else {
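For context, nlohmann::json's conversion in `return body.at(key);` throws type_error when the stored JSON type cannot convert to T, and err.what() carries the precise reason that the old log line dropped. A small self-contained sketch, simplified from the helper above (requires nlohmann/json):

#include <iostream>
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::ordered_json;

// Simplified json_value: fall back to the default on a type mismatch,
// but keep the exception text so the log explains *why* it fell back.
template <typename T>
static T json_value(const json & body, const std::string & key, const T & default_value) {
    if (body.contains(key) && !body.at(key).is_null()) {
        try {
            return body.at(key).get<T>();
        } catch (const nlohmann::detail::type_error & err) {
            std::cerr << "Wrong type for '" << key << "': " << err.what() << "\n";
            return default_value;
        }
    }
    return default_value;
}

int main() {
    json body = {{"n_predict", "lots"}}; // a string where a number is expected
    // Logs something like: [json.exception.type_error.302] type must be
    // number, but is string -- then returns the default.
    std::cout << json_value(body, "n_predict", 128) << "\n"; // 128
}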
@@ -708,6 +708,16 @@ static json oaicompat_chat_params_parse(
         inputs.chat_template_kwargs[item.key()] = item.value().dump();
     }
 
+    // parse the "enable_thinking" kwarg to override the default value
+    auto enable_thinking_kwarg = json_value(inputs.chat_template_kwargs, "enable_thinking", std::string(""));
+    if (enable_thinking_kwarg == "true") {
+        inputs.enable_thinking = true;
+    } else if (enable_thinking_kwarg == "false") {
+        inputs.enable_thinking = false;
+    } else if (!enable_thinking_kwarg.empty() && enable_thinking_kwarg[0] == '"') {
+        throw std::runtime_error("invalid type for \"enable_thinking\" (expected boolean, got string)");
+    }
+
     // if the assistant message appears at the end of list, we do not add end-of-turn token
     // for ex. this can be useful to modify the reasoning process in reasoning models
     bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && opt.prefill_assistant;
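The string comparisons above work because chat_template_kwargs stores each value as item.value().dump(): a JSON boolean true dumps to the four characters true, while the JSON string "true" dumps with its quotes, so a leading '"' is how the last branch detects (and rejects) string values. A quick demonstration:

#include <cassert>
#include <nlohmann/json.hpp>

int main() {
    // Booleans dump without quotes; strings keep them. The parser above
    // relies on exactly this to tell `true` apart from `"true"`.
    assert(nlohmann::json(true).dump()   == "true");
    assert(nlohmann::json("true").dump() == "\"true\"");
    assert(nlohmann::json("true").dump()[0] == '"'); // -> request rejected
}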
@@ -724,7 +734,7 @@ static json oaicompat_chat_params_parse(
         /* TODO: test this properly */
         inputs.reasoning_format = COMMON_REASONING_FORMAT_NONE;
 
-        if ( (!inputs.enable_thinking) || inputs.chat_template_kwargs.find("enable_thinking") != inputs.chat_template_kwargs.end()) {
+        if ( inputs.enable_thinking ) {
             throw std::runtime_error("Assistant response prefill is incompatible with enable_thinking.");
         }
 
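The net effect at the API level: a request that ends with an assistant message (assistant prefill) is now rejected only when thinking is actually enabled, and the client-side fix is to send a boolean enable_thinking: false via chat_template_kwargs. A hypothetical request body, built with nlohmann::json purely for illustration:

#include <nlohmann/json.hpp>

using json = nlohmann::ordered_json;

int main() {
    // Ends with an assistant turn -> assistant prefill. With thinking
    // enabled this now fails with "Assistant response prefill is
    // incompatible with enable_thinking."
    json request = {
        {"messages", json::array({
            json{{"role", "user"},      {"content", "What is 2+2?"}},
            json{{"role", "assistant"}, {"content", "The answer is"}},
        })},
    };

    // Client-side fix: disable thinking for this request. The value must
    // be a real boolean; the string "false" is explicitly rejected.
    request["chat_template_kwargs"] = {{"enable_thinking", false}};
    return 0;
}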