	server : support jinja extra template kwargs (Qwen3 enable_thinking feature), from command line and from client (#13196)
* initial commit for handling extra template kwargs
* enable_thinking and assistant prefill cannot be enabled at the same time
* can set chat_template_kwargs in command line
* added doc
* fixed formatting
* add support for extra context in generic template init
* coding standard: common/chat.cpp
  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* coding standard: common/chat.cpp
  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Apply suggestions from code review (coding standard: cosmetic changes)
  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* fix merge conflict
* chat.cpp: simplify calls to apply to ensure systematic propagation of extra_context (+ the odd existing additional_context)
* normalize environment variable name
* simplify code
* prefill cannot be used with thinking models
* compatibility with the new reasoning-budget parameter
* fix prefill for non thinking models

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Olivier Chafik <olivier.chafik@gmail.com>
```diff
@@ -2794,6 +2794,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.ssl_file_cert = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
+    add_opt(common_arg(
+        {"--chat-template-kwargs"}, "STRING",
+        string_format("sets additional params for the json template parser"),
+        [](common_params & params, const std::string &  value) {
+            auto parsed = json::parse(value);
+            for (const auto & item : parsed.items()) {
+                params.default_template_kwargs[item.key()] = item.value().dump();
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
     add_opt(common_arg(
         {"-to", "--timeout"}, "N",
         string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
```
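The option handler stores each top-level key of the parsed JSON object as a serialized JSON string rather than as plain text, so typed values such as booleans or numbers survive the later re-parse into the template context. A minimal standalone sketch of that round trip, assuming only nlohmann/json (the `main`, the `custom_tag` key and the variable names are illustrative, not part of the patch):

```cpp
#include <nlohmann/json.hpp>
#include <iostream>
#include <map>
#include <string>

using json = nlohmann::ordered_json;

int main() {
    // what --chat-template-kwargs (or LLAMA_CHAT_TEMPLATE_KWARGS) might receive
    std::string value = R"({"enable_thinking": false, "custom_tag": "qa"})";

    // mirror of the new option handler: keep each value as a dumped JSON string
    std::map<std::string, std::string> default_template_kwargs;
    for (const auto & item : json::parse(value).items()) {
        default_template_kwargs[item.key()] = item.value().dump();
    }

    // later, the chat code re-parses the strings back into a json object
    json extra_context = json::object();
    for (const auto & el : default_template_kwargs) {
        extra_context[el.first] = json::parse(el.second);
    }

    std::cout << extra_context.dump() << "\n";  // {"custom_tag":"qa","enable_thinking":false}
}
```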
```diff
@@ -17,6 +17,8 @@
 #include <string>
 #include <vector>
 
+using json = nlohmann::ordered_json;
+
 static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) {
     auto time = std::chrono::system_clock::to_time_t(now);
     auto local_time = *std::localtime(&time);
@@ -140,6 +142,7 @@ struct templates_params {
     bool add_generation_prompt = true;
     bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
+    json extra_context;
 };
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -720,16 +723,23 @@ static void foreach_function(const json & tools, const std::function<void(const
 
 static std::string apply(
     const common_chat_template & tmpl,
-    const nlohmann::ordered_json & messages,
-    const nlohmann::ordered_json & tools,
-    bool add_generation_prompt,
-    const nlohmann::ordered_json & extra_context = nlohmann::ordered_json())
+    const struct templates_params & inputs,
+    const std::optional<json> & messages_override = std::nullopt,
+    const std::optional<json> & tools_override = std::nullopt,
+    const std::optional<json> & additional_context = std::nullopt)
 {
     minja::chat_template_inputs tmpl_inputs;
-    tmpl_inputs.messages = messages;
-    tmpl_inputs.tools = tools;
-    tmpl_inputs.add_generation_prompt = add_generation_prompt;
-    tmpl_inputs.extra_context = extra_context;
+    tmpl_inputs.messages = messages_override ? *messages_override : inputs.messages;
+    if (tools_override) {
+        tmpl_inputs.tools = *tools_override;
+    } else {
+        tmpl_inputs.tools = inputs.tools.empty() ? json() : inputs.tools;
+    }
+    tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt;
+    tmpl_inputs.extra_context = inputs.extra_context;
+    if (additional_context) {
+        tmpl_inputs.extra_context.merge_patch(*additional_context);
+    }
     // TODO: add flag to control date/time, if only for testing purposes.
     // tmpl_inputs.now = std::chrono::system_clock::now();
 
```
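A note on the precedence this gives: `inputs.extra_context` carries whatever came from `chat_template_kwargs`, and a caller-supplied `additional_context` is then applied on top via `merge_patch`, so format-specific values override user kwargs for the same key. A small self-contained illustration of nlohmann's `merge_patch` (which implements JSON Merge Patch, RFC 7386); the keys and values here are only examples:

```cpp
#include <nlohmann/json.hpp>
#include <iostream>

using json = nlohmann::ordered_json;

int main() {
    // user-supplied chat_template_kwargs, already converted into extra_context
    json extra_context = { {"enable_thinking", false}, {"date_string", "01 Jan 2024"} };

    // format-specific context passed by a caller of apply()
    json additional_context = { {"date_string", "26 Jul 2025"}, {"tools_in_user_message", false} };

    // merge_patch: keys in the patch replace keys already present in the target
    extra_context.merge_patch(additional_context);

    std::cout << extra_context.dump(2) << "\n";
    // date_string now comes from additional_context; enable_thinking is kept
}
```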
```diff
@@ -828,7 +838,7 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp
         inputs.messages,
         "Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request");
 
-    data.prompt = apply(tmpl, tweaked_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = apply(tmpl, inputs, /* messages_override= */ tweaked_messages);
     data.format = COMMON_CHAT_FORMAT_GENERIC;
     return data;
 }
@@ -904,7 +914,7 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
     data.preserved_tokens = {
         "[TOOL_CALLS]",
     };
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = apply(tmpl, inputs);
     data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
     return data;
 }
@@ -934,7 +944,7 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
             adjusted_messages.push_back(msg);
         }
     }
-    data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {});
+    data.prompt = apply(tmpl, inputs, /* messages_override= */ adjusted_messages);
     data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
     if (string_ends_with(data.prompt, "<|START_THINKING|>")) {
         if (!inputs.enable_thinking) {
@@ -1122,7 +1132,7 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
     } else {
         data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
     }
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {
+    data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, json {
         {"date_string", format_time(inputs.now, "%d %b %Y")},
         {"tools_in_user_message", false},
         {"builtin_tools", builtin_tools.empty() ? json() : builtin_tools},
@@ -1187,7 +1197,7 @@ static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool w
 
 static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
-    auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    auto prompt = apply(tmpl, inputs);
 
     // Hacks to fix the official (broken) prompt.
     // It is advisable to use --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja instead,
@@ -1282,7 +1292,7 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
 static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
     LOG_DBG("%s\n", __func__);
     common_chat_params data;
-    data.prompt = apply(tmpl, inputs.messages, /* tools= */ nullptr, inputs.add_generation_prompt, {
+    data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ json(), json {
         {"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
         {"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
     });
@@ -1338,7 +1348,7 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
     // Using ">>>f1\n", ">>>f2\n"... as trigger words for the grammar
     // If the function is python, we also allow raw python code (if the line after `python\n` doesn't start w/ opening `{`), which the model seems to prefer for multiline code.
     common_chat_params data;
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = apply(tmpl, inputs);
     data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2;
     if (inputs.tools.is_array() && !inputs.tools.empty()) {
         data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
@@ -1465,7 +1475,7 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con
         data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
     }
 
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = apply(tmpl, inputs);
     // TODO: if (has_raw_python)
     return data;
 }
@@ -1498,14 +1508,15 @@ static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser
 static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
 
-    json additional_context = {
+    json extra_context = json {
         {"enable_thinking", inputs.enable_thinking},
     };
+    extra_context.update(inputs.extra_context);
 
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, additional_context);
+    data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, extra_context);
     data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
     if (string_ends_with(data.prompt, "<think>\n")) {
-        if (!inputs.enable_thinking) {
+        if (!extra_context["enable_thinking"]) {
             data.prompt += "</think>";
         } else {
             data.thinking_forced_open = true;
```
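In the Hermes 2 Pro path the direction is the opposite of `merge_patch` in `apply`: the built-in `enable_thinking` default is written first, and `update(inputs.extra_context)` then lets a user-supplied kwarg of the same name replace it, which is what the changed `extra_context["enable_thinking"]` check relies on. A short sketch of that behaviour with nlohmann's `update` (values are examples only):

```cpp
#include <nlohmann/json.hpp>
#include <iostream>

using json = nlohmann::ordered_json;

int main() {
    // default derived from the request's enable_thinking flag
    json extra_context = json {
        {"enable_thinking", true},
    };

    // user kwargs (from --chat-template-kwargs or the request body) win on conflict
    json user_kwargs = { {"enable_thinking", false} };
    extra_context.update(user_kwargs);

    if (!extra_context["enable_thinking"]) {
        std::cout << "thinking disabled by chat_template_kwargs\n";
    }
}
```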
```diff
@@ -1691,7 +1702,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
 
 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = apply(tmpl, inputs);
     data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
     data.grammar_lazy = false;
     if (!inputs.json_schema.is_null()) {
@@ -1722,6 +1733,12 @@ static common_chat_params common_chat_templates_apply_jinja(
     params.enable_thinking = inputs.enable_thinking;
     params.grammar = inputs.grammar;
     params.now = inputs.now;
+
+    params.extra_context = json::object();
+    for (auto el : inputs.chat_template_kwargs) {
+        params.extra_context[el.first] = json::parse(el.second);
+    }
+
     if (!inputs.json_schema.empty()) {
         params.json_schema = json::parse(inputs.json_schema);
     }
```
```diff
@@ -7,6 +7,7 @@
 #include <chrono>
 #include <string>
 #include <vector>
+#include <map>
 
 struct common_chat_templates;
 
@@ -125,6 +126,7 @@ struct common_chat_templates_inputs {
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
     bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
+    std::map<std::string, std::string> chat_template_kwargs;
 };
 
 struct common_chat_params {
```
```diff
@@ -8,6 +8,7 @@
 #include <string>
 #include <string_view>
 #include <vector>
+#include <map>
 #include <sstream>
 
 #ifdef _WIN32
@@ -381,6 +382,8 @@ struct common_params {
     std::string ssl_file_key  = "";                                                                         // NOLINT
     std::string ssl_file_cert = "";                                                                         // NOLINT
 
+    std::map<std::string, std::string> default_template_kwargs;
+
     // "advanced" endpoints are disabled by default for better security
     bool webui            = true;
     bool endpoint_slots   = false;
```
```diff
@@ -164,6 +164,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--api-key-file FNAME` | path to file containing API keys (default: none) |
 | `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
 | `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
+| `--chat-template-kwargs STRING` | JSON object containing additional params for the json template parser. Example: `--chat_template_kwargs "{\"enable_thinking\":false}"`<br/>(env: LLAMA_CHAT_TEMPLATE_KWARGS) |
 | `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
 | `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
 | `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>[(card)](https://ggml.ai/f0.png)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
@@ -1118,6 +1119,8 @@ See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs
 
 The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}` or `{"type": "json_schema", "schema": {"properties": { "name": { "title": "Name",  "type": "string" }, "date": { "title": "Date",  "type": "string" }, "participants": { "items": {"type: "string" }, "title": "Participants",  "type": "string" } } } }`), similar to other OpenAI-inspired API providers.
 
+`chat_template_kwargs`: Allows sending additional parameters to the json templating system. For example: `{"enable_thinking": false}`
+
 *Examples:*
 
 You can use either Python `openai` library with appropriate checkpoints:
```
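In practice the two entry points compose: defaults set at startup with `--chat-template-kwargs '{"enable_thinking": false}'` (or via `LLAMA_CHAT_TEMPLATE_KWARGS`) apply to every request, and a client can override any of them per request by sending a `chat_template_kwargs` object in the chat-completion body. Per-request values take precedence over the command-line defaults, and a request that both prefills an assistant message and touches `enable_thinking` is rejected (see the server-side check further below).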
```diff
@@ -2110,6 +2110,7 @@ struct server_context {
             /* use_jinja             */ params_base.use_jinja,
             /* prefill_assistant     */ params_base.prefill_assistant,
             /* reasoning_format      */ params_base.reasoning_format,
+            /* chat_template_kwargs  */ params_base.default_template_kwargs,
             /* common_chat_templates */ chat_templates.get(),
             /* allow_image           */ mctx ? mtmd_support_vision(mctx) : false,
             /* allow_audio           */ mctx ? mtmd_support_audio (mctx) : false,
```
```diff
@@ -579,6 +579,7 @@ struct oaicompat_parser_options {
     bool use_jinja;
     bool prefill_assistant;
     common_reasoning_format reasoning_format;
+    std::map<std::string,std::string> chat_template_kwargs;
     common_chat_templates * tmpls;
     bool allow_image;
     bool allow_audio;
@@ -756,6 +757,13 @@ static json oaicompat_chat_params_parse(
         llama_params["parse_tool_calls"] = true;
     }
 
+    // merge the template args provided from command line with the args provided in the user request
+    auto chat_template_kwargs_object = json_value(body, "chat_template_kwargs", json::object());
+    inputs.chat_template_kwargs = opt.chat_template_kwargs;
+    for (const auto & item : chat_template_kwargs_object.items()) {
+        inputs.chat_template_kwargs[item.key()] = item.value().dump();
+    }
+
     // if the assistant message appears at the end of list, we do not add end-of-turn token
     // for ex. this can be useful to modify the reasoning process in reasoning models
     bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && opt.prefill_assistant;
@@ -771,6 +779,11 @@ static json oaicompat_chat_params_parse(
 
         /* TODO: test this properly */
         inputs.reasoning_format = COMMON_REASONING_FORMAT_NONE;
+
+        if ( (!inputs.enable_thinking) || inputs.chat_template_kwargs.find("enable_thinking") != inputs.chat_template_kwargs.end()) {
+            throw std::runtime_error("Assistant response prefill is incompatible with enable_thinking.");
+        }
+
         inputs.add_generation_prompt = true;
     }
 
```
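Putting the pieces together: request-level kwargs overwrite the startup defaults key by key, and the merged string map is then re-parsed into the template's extra context. A self-contained sketch of that precedence, assuming nothing beyond nlohmann/json (`json_value` and the surrounding server types are not reproduced here, and the variable names are illustrative):

```cpp
#include <nlohmann/json.hpp>
#include <iostream>
#include <map>
#include <string>

using json = nlohmann::ordered_json;

int main() {
    // startup defaults, as filled by --chat-template-kwargs / LLAMA_CHAT_TEMPLATE_KWARGS
    std::map<std::string, std::string> defaults = {
        {"enable_thinking", "false"},
    };

    // per-request "chat_template_kwargs" object from the chat-completion body
    json request_kwargs = { {"enable_thinking", true} };

    // mirror of the merge in oaicompat_chat_params_parse: request values win
    std::map<std::string, std::string> merged = defaults;
    for (const auto & item : request_kwargs.items()) {
        merged[item.key()] = item.value().dump();
    }

    // mirror of common_chat_templates_apply_jinja: strings become the Jinja extra context
    json extra_context = json::object();
    for (const auto & el : merged) {
        extra_context[el.first] = json::parse(el.second);
    }

    std::cout << extra_context.dump() << "\n";  // {"enable_thinking":true}
}
```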