server : include usage statistics only when the user requests them (#16052)

* server : include usage statistics only when the user requests them

When serving the OpenAI-compatible API, we should check whether
{"stream_options": {"include_usage": true}} is set in the request when
deciding whether to send usage statistics (see the client sketch below).

closes: #16048

* add unit test
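
For illustration, here is a minimal client-side sketch of opting in to the usage chunk. It assumes a local llama-server listening on http://localhost:8080 and uses the openai Python package; the base URL, API key, and model name are placeholders, not values taken from this commit.

```python
# Hedged sketch: stream a chat completion and read the trailing usage chunk.
# base_url, api_key and model are assumptions for a local llama-server.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8080/v1", api_key="sk-no-key-required")

stream = client.chat.completions.create(
    model="local-model",  # placeholder; a single-model server typically ignores this
    messages=[{"role": "user", "content": "test"}],
    stream=True,
    stream_options={"include_usage": True},  # request the trailing usage chunk
)

usage = None
for chunk in stream:
    # With include_usage, the last chunk has an empty `choices` list and a
    # populated `usage` object; earlier chunks have `usage` set to None.
    if chunk.usage is not None:
        usage = chunk.usage
    for choice in chunk.choices:
        if choice.delta.content:
            print(choice.delta.content, end="", flush=True)

print()
if usage is not None:
    print(f"prompt={usage.prompt_tokens} completion={usage.completion_tokens} "
          f"total={usage.total_tokens}")
```

Without stream_options.include_usage in the request, the server now omits the usage chunk entirely, matching OpenAI's default streaming behaviour.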
Author: Radoslav Gerganov
Date: 2025-09-18 13:36:57 +03:00
Committed by: GitHub
Parent: e58174cecb
Commit: 2b6b55a59f
2 changed files with 37 additions and 26 deletions


@@ -111,6 +111,7 @@ static bool server_task_type_need_logits(server_task_type task_type) {
struct slot_params {
    bool stream          = true;
    bool include_usage   = false;
    bool cache_prompt    = true; // remember the prompt to avoid reprocessing all prompt
    bool return_tokens   = false;
    bool return_progress = false;
@@ -311,6 +312,8 @@ struct server_task {
params.timings_per_token = json_value(data, "timings_per_token", false);
params.stream = json_value(data, "stream", false);
auto stream_opt = json_value(data, "stream_options", json::object());
params.include_usage = json_value(stream_opt, "include_usage", false);
params.cache_prompt = json_value(data, "cache_prompt", true);
params.return_tokens = json_value(data, "return_tokens", false);
params.return_progress = json_value(data, "return_progress", false);
@@ -775,6 +778,7 @@ struct server_task_result_cmpl_final : server_task_result {
llama_tokens tokens;
bool stream;
bool include_usage;
result_timings timings;
std::string prompt;
@@ -982,6 +986,7 @@ struct server_task_result_cmpl_final : server_task_result {
{"object", "chat.completion.chunk"},
});
if (include_usage) {
// OpenAI API spec for chat.completion.chunks specifies an empty `choices` array for the last chunk when including usage
// https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices
deltas.push_back({
@@ -997,6 +1002,7 @@ struct server_task_result_cmpl_final : server_task_result {
{"total_tokens", n_decoded + n_prompt_tokens},
}},
});
}
if (timings.prompt_n >= 0) {
deltas.back().push_back({"timings", timings.to_json()});
@@ -2815,6 +2821,7 @@ struct server_context {
res->verbose = slot.params.verbose;
res->stream = slot.params.stream;
res->include_usage = slot.params.include_usage;
res->oaicompat = slot.params.oaicompat;
res->oaicompat_model = slot.params.oaicompat_model;
res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
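
To make the effect of the new guard concrete, here is a rough sketch of the terminal chunk a client should receive when include_usage is requested, per the OpenAI spec referenced in the comment above. The token counts are invented example values, and the other standard chunk fields (id, created, model, system_fingerprint) are omitted for brevity.

```python
# Sketch of the trailing usage chunk (values are illustrative, not captured output).
final_chunk = {
    "object": "chat.completion.chunk",
    "choices": [],  # empty per the OpenAI streaming spec when usage is attached
    "usage": {
        "prompt_tokens": 8,       # n_prompt_tokens
        "completion_tokens": 10,  # n_decoded
        "total_tokens": 18,       # n_decoded + n_prompt_tokens
    },
}

# Sanity check mirroring what the server computes above.
assert final_chunk["usage"]["total_tokens"] == (
    final_chunk["usage"]["prompt_tokens"]
    + final_chunk["usage"]["completion_tokens"]
)
```

When the request does not set stream_options.include_usage, this chunk is simply not emitted.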


@@ -271,8 +271,10 @@ def test_chat_completion_with_timings_per_token():
"max_tokens": 10,
"messages": [{"role": "user", "content": "test"}],
"stream": True,
"stream_options": {"include_usage": True},
"timings_per_token": True,
})
stats_received = False
for i, data in enumerate(res):
if i == 0:
# Check first role message for stream=True
@@ -288,6 +290,8 @@ def test_chat_completion_with_timings_per_token():
assert "predicted_per_second" in data["timings"]
assert "predicted_n" in data["timings"]
assert data["timings"]["predicted_n"] <= 10
stats_received = True
assert stats_received
def test_logprobs():