From 12bbc3fa50b6df03318a4451c9a2210200a0b28d Mon Sep 17 00:00:00 2001
From: Pascal
Date: Wed, 8 Oct 2025 22:18:41 +0200
Subject: [PATCH] refactor: centralize CoT parsing in backend for streaming
 mode (#16394)

* refactor: unify reasoning handling via backend reasoning_content, drop frontend tag parsing

- Updated the chat message component to surface backend-supplied reasoning
  via message.thinking while showing the raw assistant content without
  inline tag scrubbing
- Simplified chat streaming to append content chunks directly, stream
  reasoning into the message model, and persist any partial reasoning when
  generation stops
- Refactored the chat service SSE handler to rely on server-provided
  reasoning_content, removing legacy parsing logic
- Refreshed Storybook data and streaming flows to populate the thinking
  field explicitly for static and streaming assistant messages

* refactor: implement streaming-aware universal reasoning parser

Remove the streaming-mode limitation from --reasoning-format by refactoring
try_parse_reasoning() to handle incremental parsing of <think> tags across
all formats.

- Rework try_parse_reasoning() to track whitespace, partial tags, and
  multiple reasoning segments, allowing proper separation of
  reasoning_content and content in streaming mode
- Parse reasoning tags before tool-call handling in content-only and
  Llama 3.x formats to ensure inline <think> blocks are captured correctly
- Change default reasoning_format from 'auto' to 'deepseek' for consistent
  behavior
- Add 'deepseek-legacy' option to preserve the old inline behavior when
  needed
- Update CLI help and documentation to reflect streaming support
- Add parser tests for inline <think>...</think> segments

The parser now continues processing content after </think> closes instead of
stopping, enabling proper message.reasoning_content and message.content
separation in both streaming and non-streaming modes.

Fixes the issue where streaming responses would dump everything (including
post-thinking content) into reasoning_content while leaving content empty.
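As an illustrative sketch (not part of the patch itself; the field names
follow the server's OpenAI-compatible responses), the split now behaves like
this for a hypothetical completion:

    // Raw model output: "<think>Weigh both options.</think>The answer is 42."
    // Old streaming behavior: content stayed empty and everything, including
    // the final answer, accumulated in reasoning_content.
    // New behavior, streaming and non-streaming alike:
    const expected = {
        reasoning_content: 'Weigh both options.',
        content: 'The answer is 42.'
    };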
* refactor: address review feedback from allozaur

- Passed the assistant message content directly to ChatMessageAssistant to
  drop the redundant derived state in the chat message component
- Simplified chat streaming updates by removing unused partial-thinking
  handling and persisting partial responses straight from currentResponse
- Refreshed the ChatMessage stories to cover standard and reasoning
  scenarios without the old THINK-tag parsing examples

Co-authored-by: Aleksander Grygier

* refactor: restore forced reasoning prefix to pass test-chat ([chat] All tests passed)

- store the exact sequence seen on input when 'thinking_forced_open'
  enforces a reasoning block
- inject this prefix before the first accumulated segment in
  'reasoning_content', then clear it to avoid duplication
- repeat the capture on every new 'start_think' detection to properly
  handle partial/streaming flows

* refactor: address review feedback from ngxson

* debug: say goodbye to curl -N, hello one-click raw stream

- adds a new checkbox in the WebUI to display raw LLM output without
  backend parsing or frontend Markdown rendering

* Update tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte

Co-authored-by: Aleksander Grygier

* webui: add Storybook example for raw LLM output and scope reasoning format toggle per story

- Added a Storybook example that showcases the chat message component in
  raw LLM output mode with the provided trace sample
- Updated every ChatMessage story to toggle the disableReasoningFormat
  setting so the raw-output rendering remains scoped to its own example

* npm run format

* chat-parser: address review feedback from ngxson

Co-authored-by: Xuan Son Nguyen

---------

Co-authored-by: Aleksander Grygier
Co-authored-by: Xuan Son Nguyen
---
 common/arg.cpp                                |   3 +-
 common/chat-parser.cpp                        | 138 +++++++++-
 common/chat.cpp                               |   3 +
 common/common.h                               |   2 +-
 tests/test-chat-parser.cpp                    |  28 +++
 tools/server/README.md                        |   2 +-
 .../app/chat/ChatMessages/ChatMessage.svelte  |  20 +-
 .../ChatMessages/ChatMessageAssistant.svelte  |  23 +-
 .../ChatSettings/ChatSettingsDialog.svelte    |   6 +
 .../src/lib/constants/settings-config.ts      |   3 +
 tools/server/webui/src/lib/services/chat.ts   |  83 +------
 .../webui/src/lib/stores/chat.svelte.ts       |  18 +-
 tools/server/webui/src/lib/utils/thinking.ts  | 143 ----------
 .../src/stories/ChatMessage.stories.svelte    | 235 +++++-------------
 14 files changed, 276 insertions(+), 431 deletions(-)
 delete mode 100644 tools/server/webui/src/lib/utils/thinking.ts

diff --git a/common/arg.cpp b/common/arg.cpp
index ecc296485c..4204f6c690 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -3432,7 +3432,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--reasoning-format"}, "FORMAT",
         "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
         "- none: leaves thoughts unparsed in `message.content`\n"
-        "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
+        "- deepseek: puts thoughts in `message.reasoning_content`\n"
+        "- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`\n"
         "(default: auto)",
         [](common_params & params, const std::string & value) {
             params.reasoning_format = common_reasoning_format_from_name(value);
diff --git a/common/chat-parser.cpp b/common/chat-parser.cpp
index b3362519a6..7365782e7d 100644
--- a/common/chat-parser.cpp
+++ b/common/chat-parser.cpp
@@ -3,9 +3,12 @@
#include "log.h" #include "regex-partial.h" +#include +#include #include #include #include +#include #include using json = nlohmann::ordered_json; @@ -166,6 +169,27 @@ void common_chat_msg_parser::consume_literal(const std::string & literal) { } bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) { + std::string pending_reasoning_prefix; + + if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE) { + return false; + } + + auto set_reasoning_prefix = [&](size_t prefix_pos) { + if (!syntax_.thinking_forced_open || syntax_.reasoning_in_content) { + return; + } + if (prefix_pos + start_think.size() > input_.size()) { + pending_reasoning_prefix.clear(); + return; + } + // Capture the exact literal that opened the reasoning section so we can + // surface it back to callers. This ensures formats that force the + // reasoning tag open (e.g. DeepSeek R1) retain their original prefix + // instead of dropping it during parsing. + pending_reasoning_prefix = input_.substr(prefix_pos, start_think.size()); + }; + auto handle_reasoning = [&](const std::string & reasoning, bool closed) { auto stripped_reasoning = string_strip(reasoning); if (stripped_reasoning.empty()) { @@ -178,28 +202,116 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "" : end_think); } } else { + if (!pending_reasoning_prefix.empty()) { + add_reasoning_content(pending_reasoning_prefix); + pending_reasoning_prefix.clear(); + } add_reasoning_content(stripped_reasoning); } }; - if (syntax_.reasoning_format != COMMON_REASONING_FORMAT_NONE) { - if (syntax_.thinking_forced_open || try_consume_literal(start_think)) { - if (auto res = try_find_literal(end_think)) { - handle_reasoning(res->prelude, /* closed */ true); - consume_spaces(); - return true; - } - auto rest = consume_rest(); + + const size_t saved_pos = pos_; + const size_t saved_content_size = result_.content.size(); + const size_t saved_reasoning_size = result_.reasoning_content.size(); + + auto restore_state = [&]() { + move_to(saved_pos); + result_.content.resize(saved_content_size); + result_.reasoning_content.resize(saved_reasoning_size); + }; + + // Allow leading whitespace to be preserved as content when reasoning is present at the start + size_t cursor = pos_; + size_t whitespace_end = cursor; + while (whitespace_end < input_.size() && std::isspace(static_cast(input_[whitespace_end]))) { + ++whitespace_end; + } + + if (whitespace_end >= input_.size()) { + restore_state(); + if (syntax_.thinking_forced_open) { + auto rest = input_.substr(saved_pos); if (!rest.empty()) { handle_reasoning(rest, /* closed */ !is_partial()); } - // Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877) - // if (!syntax_.thinking_forced_open) { - // throw common_chat_msg_partial_exception(end_think); - // } + move_to(input_.size()); return true; } + return false; + } + + cursor = whitespace_end; + const size_t remaining = input_.size() - cursor; + const size_t start_prefix = std::min(start_think.size(), remaining); + const bool has_start_tag = input_.compare(cursor, start_prefix, start_think, 0, start_prefix) == 0; + + if (has_start_tag && start_prefix < start_think.size()) { + move_to(input_.size()); + return true; + } + + if (has_start_tag) { + if (whitespace_end > pos_) { + add_content(input_.substr(pos_, whitespace_end - pos_)); + } 
+        set_reasoning_prefix(cursor);
+        cursor += start_think.size();
+    } else if (syntax_.thinking_forced_open) {
+        cursor = whitespace_end;
+    } else {
+        restore_state();
+        return false;
+    }
+
+    while (true) {
+        if (cursor >= input_.size()) {
+            move_to(input_.size());
+            return true;
+        }
+
+        size_t end_pos = input_.find(end_think, cursor);
+        if (end_pos == std::string::npos) {
+            std::string_view remaining_view(input_.data() + cursor, input_.size() - cursor);
+            size_t partial_off = string_find_partial_stop(remaining_view, end_think);
+            size_t reasoning_end = partial_off == std::string::npos ? input_.size() : cursor + partial_off;
+            if (reasoning_end > cursor) {
+                handle_reasoning(input_.substr(cursor, reasoning_end - cursor), /* closed */ partial_off == std::string::npos && !is_partial());
+            }
+            move_to(input_.size());
+            return true;
+        }
+
+        if (end_pos > cursor) {
+            handle_reasoning(input_.substr(cursor, end_pos - cursor), /* closed */ true);
+        } else {
+            handle_reasoning("", /* closed */ true);
+        }
+
+        cursor = end_pos + end_think.size();
+
+        while (cursor < input_.size() && std::isspace(static_cast<unsigned char>(input_[cursor]))) {
+            ++cursor;
+        }
+
+        const size_t next_remaining = input_.size() - cursor;
+        if (next_remaining == 0) {
+            move_to(cursor);
+            return true;
+        }
+
+        const size_t next_prefix = std::min(start_think.size(), next_remaining);
+        if (input_.compare(cursor, next_prefix, start_think, 0, next_prefix) == 0) {
+            if (next_prefix < start_think.size()) {
+                move_to(input_.size());
+                return true;
+            }
+            set_reasoning_prefix(cursor);
+            cursor += start_think.size();
+            continue;
+        }
+
+        move_to(cursor);
+        return true;
     }
-    }
-    return false;
 }
 
 std::string common_chat_msg_parser::consume_rest() {
diff --git a/common/chat.cpp b/common/chat.cpp
index afbb2a2bdd..8587140e1f 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -1408,6 +1408,8 @@ static common_chat_params common_chat_params_init_apertus(const common_chat_temp
     return data;
 }
 
 static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
+    builder.try_parse_reasoning("<think>", "</think>");
+
     if (!builder.syntax().parse_tool_calls) {
         builder.add_content(builder.consume_rest());
         return;
     }
@@ -2862,6 +2864,7 @@ common_chat_params common_chat_templates_apply(
 }
 
 static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
+    builder.try_parse_reasoning("<think>", "</think>");
     builder.add_content(builder.consume_rest());
 }
 
diff --git a/common/common.h b/common/common.h
index 8a8ecd667f..0d3638c9c6 100644
--- a/common/common.h
+++ b/common/common.h
@@ -433,7 +433,7 @@ struct common_params {
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
-    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     int reasoning_budget = -1;
     bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
diff --git a/tests/test-chat-parser.cpp b/tests/test-chat-parser.cpp
index 547ebb4871..0b275befb8 100644
--- a/tests/test-chat-parser.cpp
+++ b/tests/test-chat-parser.cpp
@@ -106,6 +106,34 @@ static void test_reasoning() {
         assert_equals("Cogito", builder.result().content);
         assert_equals("Ergo sum", builder.consume_rest());
     }
+    {
+        const std::string variant("content_only_inline_think");
+        common_chat_syntax syntax = {
+            /* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
+            /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
+            /* .reasoning_in_content = */ false,
+            /* .thinking_forced_open = */ false,
+            /* .parse_tool_calls = */ false,
+        };
+        const std::string input = "<think>Pense</think>Bonjour";
+        auto msg = common_chat_parse(input, false, syntax);
+        assert_equals(variant, std::string("Pense"), msg.reasoning_content);
+        assert_equals(variant, std::string("Bonjour"), msg.content);
+    }
+    {
+        const std::string variant("llama_3_inline_think");
+        common_chat_syntax syntax = {
+            /* .format = */ COMMON_CHAT_FORMAT_LLAMA_3_X,
+            /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
+            /* .reasoning_in_content = */ false,
+            /* .thinking_forced_open = */ false,
+            /* .parse_tool_calls = */ false,
+        };
+        const std::string input = "<think>Plan</think>Réponse";
+        auto msg = common_chat_parse(input, false, syntax);
+        assert_equals(variant, std::string("Plan"), msg.reasoning_content);
+        assert_equals(variant, std::string("Réponse"), msg.content);
+    }
     // Test DeepSeek V3.1 parsing - reasoning content followed by "</think>" and then regular content
     {
         common_chat_syntax syntax = {
diff --git a/tools/server/README.md b/tools/server/README.md
index e23b122ab1..f5ab9236d5 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -190,7 +190,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
 | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
 | `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
-| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
+| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: deepseek)<br/>(env: LLAMA_ARG_THINK) |
 | `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
diff --git a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte
index c923bf9e04..fed0cf7126 100644
--- a/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatMessages/ChatMessage.svelte
@@ -1,7 +1,6 @@
diff --git a/tools/server/webui/src/stories/ChatMessage.stories.svelte b/tools/server/webui/src/stories/ChatMessage.stories.svelte
+	play={async () => {
+		const { updateConfig } = await import('$lib/stores/settings.svelte');
+		updateConfig('disableReasoningFormat', false);
+	}}
 />
+	play={async () => {
+		const { updateConfig } = await import('$lib/stores/settings.svelte');
+		updateConfig('disableReasoningFormat', false);
+	}}
 />
+	play={async () => {
+		const { updateConfig } = await import('$lib/stores/settings.svelte');
+		updateConfig('disableReasoningFormat', false);
+	}}
+/>
+
+	play={async () => {
+		const { updateConfig } = await import('$lib/stores/settings.svelte');
+		updateConfig('disableReasoningFormat', true);
+	}}
+/>
+
 	play={async () => {
+		const { updateConfig } = await import('$lib/stores/settings.svelte');
+		updateConfig('disableReasoningFormat', false);
 		// Phase 1: Stream reasoning content in chunks
 		let reasoningText =
 			'I need to think about this carefully. Let me break down the problem:\n\n1. The user is asking for help with something complex\n2. I should provide a thorough and helpful response\n3. I need to consider multiple approaches\n4. The best solution would be to explain step by step\n\nThis approach will ensure clarity and understanding.';
@@ -187,126 +192,16 @@
 		message: processingMessage
 	}}
 	play={async () => {
+		const { updateConfig } = await import('$lib/stores/settings.svelte');
+		updateConfig('disableReasoningFormat', false);
 		// Import the chat store to simulate loading state
 		const { chatStore } = await import('$lib/stores/chat.svelte');
-
+
 		// Set loading state to true to trigger the processing UI
 		chatStore.isLoading = true;
-
+
 		// Simulate the processing state hook behavior
 		// This will show the "Generating..." text and parameter details
-		await new Promise(resolve => setTimeout(resolve, 100));
+		await new Promise((resolve) => setTimeout(resolve, 100));
 	}}
 />
-	play={async () => {
-		// Phase 1: Stream reasoning content
-		const thinkingContent =
-			'Let me work through this problem systematically:\n\n1. First, I need to understand what the user is asking\n2. Then I should consider different approaches\n3. I need to evaluate the pros and cons\n4. Finally, I should provide a clear recommendation\n\nThis step-by-step approach will ensure accuracy.';
-
-		let currentContent = '<think>\n';
-		streamingThinkMessage.content = currentContent;
-
-		for (let i = 0; i < thinkingContent.length; i++) {
-			currentContent += thinkingContent[i];
-			streamingThinkMessage.content = currentContent;
-			await new Promise((resolve) => setTimeout(resolve, 5));
-		}
-
-		// Close the thinking block
-		currentContent += '\n</think>\n\n';
-		streamingThinkMessage.content = currentContent;
-		await new Promise((resolve) => setTimeout(resolve, 200));
-
-		// Phase 2: Stream main response content
-		const responseContent =
-			"Based on my analysis above, here's the solution:\n\n**Key Points:**\n- The approach should be systematic\n- We need to consider all factors\n- Implementation should be step-by-step\n\nThis ensures the best possible outcome.";
-
-		for (let i = 0; i < responseContent.length; i++) {
-			currentContent += responseContent[i];
-			streamingThinkMessage.content = currentContent;
-			await new Promise((resolve) => setTimeout(resolve, 10));
-		}
-
-		streamingThinkMessage.timestamp = Date.now();
-	}}
->
-	play={async () => {
-		// Phase 1: Stream [THINK] reasoning content
-		const thinkingContent =
-			'Using the DeepSeek format now:\n\n- This demonstrates the [THINK] bracket format\n- Should parse identically to <think> tags\n- The UI should display this in the thinking section\n- Main content should be separate\n\nBoth formats provide the same functionality.';
-
-		let currentContent = '[THINK]\n';
-		streamingBracketMessage.content = currentContent;
-
-		for (let i = 0; i < thinkingContent.length; i++) {
-			currentContent += thinkingContent[i];
-			streamingBracketMessage.content = currentContent;
-			await new Promise((resolve) => setTimeout(resolve, 5));
-		}
-
-		// Close the thinking block
-		currentContent += '\n[/THINK]\n\n';
-		streamingBracketMessage.content = currentContent;
-		await new Promise((resolve) => setTimeout(resolve, 200));
-
-		// Phase 2: Stream main response content
-		const responseContent =
-			"Here's my response after using the [THINK] format:\n\n**Observations:**\n- Both <think> and [THINK] formats work seamlessly\n- The parsing logic handles both cases\n- UI display is consistent across formats\n\nThis demonstrates the enhanced thinking content support.";
-
-		for (let i = 0; i < responseContent.length; i++) {
-			currentContent += responseContent[i];
-			streamingBracketMessage.content = currentContent;
-			await new Promise((resolve) => setTimeout(resolve, 10));
-		}
-
-		streamingBracketMessage.timestamp = Date.now();
-	}}
->
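
Usage sketch (an illustration, not part of the patch): a minimal TypeScript
client consuming the separated fields from a local llama-server started with
`--jinja --reasoning-format deepseek`. The endpoint and the
`delta.reasoning_content` / `delta.content` fields assume the server's
OpenAI-compatible streaming API.

    async function streamWithReasoning(prompt: string): Promise<void> {
        const res = await fetch('http://localhost:8080/v1/chat/completions', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
                messages: [{ role: 'user', content: prompt }],
                stream: true
            })
        });

        const reader = res.body!.getReader();
        const decoder = new TextDecoder();
        let buffer = '';
        let reasoning = '';
        let content = '';

        for (;;) {
            const { done, value } = await reader.read();
            if (done) break;
            buffer += decoder.decode(value, { stream: true });
            const lines = buffer.split('\n');
            buffer = lines.pop() ?? ''; // keep a partial SSE line for the next read
            for (const line of lines) {
                if (!line.startsWith('data: ') || line === 'data: [DONE]') continue;
                const delta = JSON.parse(line.slice(6)).choices?.[0]?.delta ?? {};
                // Thoughts and the answer now arrive in separate fields;
                // no client-side <think> scrubbing is needed.
                if (delta.reasoning_content) reasoning += delta.reasoning_content;
                if (delta.content) content += delta.content;
            }
        }
        console.log({ reasoning, content });
    }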