server : include usage statistics only when the user requests them (#16052)

* server : include usage statistics only when the user requests them. When serving the OpenAI-compatible API, we should check whether {"stream_options": {"include_usage": true}} is set in the request when deciding whether to send usage statistics. Closes: #16048 * add unit test
2025-10-27 08:21:30 +00:00 · 2025-09-18 13:36:57 +03:00
parent e58174cecb
commit 2b6b55a59f
2 changed files with 37 additions and 26 deletions
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -111,6 +111,7 @@ static bool server_task_type_need_logits(server_task_type task_type) {

 struct slot_params {
    bool stream          = true;
+    bool include_usage   = false;
    bool cache_prompt    = true; // remember the prompt to avoid reprocessing all prompt
    bool return_tokens   = false;
    bool return_progress = false;
@@ -311,6 +312,8 @@ struct server_task {
        params.timings_per_token = json_value(data, "timings_per_token", false);

        params.stream           = json_value(data,       "stream",             false);
+        auto stream_opt         = json_value(data,       "stream_options",     json::object());
+        params.include_usage    = json_value(stream_opt, "include_usage",      false);
        params.cache_prompt     = json_value(data,       "cache_prompt",       true);
        params.return_tokens    = json_value(data,       "return_tokens",      false);
        params.return_progress  = json_value(data,       "return_progress",    false);
@@ -775,6 +778,7 @@ struct server_task_result_cmpl_final : server_task_result {
    llama_tokens tokens;

    bool stream;
+    bool include_usage;
    result_timings timings;
    std::string prompt;

@@ -982,6 +986,7 @@ struct server_task_result_cmpl_final : server_task_result {
            {"object",             "chat.completion.chunk"},
        });

+        if (include_usage) {
            // OpenAI API spec for chat.completion.chunks specifies an empty `choices` array for the last chunk when including usage
            // https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices
            deltas.push_back({
@@ -997,6 +1002,7 @@ struct server_task_result_cmpl_final : server_task_result {
                    {"total_tokens",      n_decoded + n_prompt_tokens},
                }},
            });
+        }

        if (timings.prompt_n >= 0) {
            deltas.back().push_back({"timings", timings.to_json()});
@@ -2815,6 +2821,7 @@ struct server_context {

        res->verbose               = slot.params.verbose;
        res->stream                = slot.params.stream;
+        res->include_usage         = slot.params.include_usage;
        res->oaicompat             = slot.params.oaicompat;
        res->oaicompat_model       = slot.params.oaicompat_model;
        res->oaicompat_cmpl_id     = slot.params.oaicompat_cmpl_id;
--- a/tools/server/tests/unit/test_chat_completion.py
+++ b/tools/server/tests/unit/test_chat_completion.py
@@ -271,8 +271,10 @@ def test_chat_completion_with_timings_per_token():
        "max_tokens": 10,
        "messages": [{"role": "user", "content": "test"}],
        "stream": True,
+        "stream_options": {"include_usage": True},
        "timings_per_token": True,
    })
+    stats_received = False
    for i, data in enumerate(res):
        if i == 0:
            # Check first role message for stream=True
@@ -288,6 +290,8 @@ def test_chat_completion_with_timings_per_token():
                assert "predicted_per_second" in data["timings"]
                assert "predicted_n" in data["timings"]
                assert data["timings"]["predicted_n"] <= 10
+                stats_received = True
+    assert stats_received


 def test_logprobs():