mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-10-27 08:21:30 +00:00
server : include usage statistics only when the user requests them (#16052)
* server : include usage statistics only when the user requests them
When serving the OpenAI-compatible API, we should check whether
{"stream_options": {"include_usage": true}} is set in the request when
deciding whether to send usage statistics.
closes: #16048
* add unit test
This commit is contained in:
committed by
GitHub
parent
e58174cecb
commit
2b6b55a59f
@@ -111,6 +111,7 @@ static bool server_task_type_need_logits(server_task_type task_type) {
|
||||
|
||||
struct slot_params {
|
||||
bool stream = true;
|
||||
bool include_usage = false;
|
||||
bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
|
||||
bool return_tokens = false;
|
||||
bool return_progress = false;
|
||||
@@ -311,6 +312,8 @@ struct server_task {
|
||||
params.timings_per_token = json_value(data, "timings_per_token", false);
|
||||
|
||||
params.stream = json_value(data, "stream", false);
|
||||
auto stream_opt = json_value(data, "stream_options", json::object());
|
||||
params.include_usage = json_value(stream_opt, "include_usage", false);
|
||||
params.cache_prompt = json_value(data, "cache_prompt", true);
|
||||
params.return_tokens = json_value(data, "return_tokens", false);
|
||||
params.return_progress = json_value(data, "return_progress", false);
|
||||
@@ -775,6 +778,7 @@ struct server_task_result_cmpl_final : server_task_result {
|
||||
llama_tokens tokens;
|
||||
|
||||
bool stream;
|
||||
bool include_usage;
|
||||
result_timings timings;
|
||||
std::string prompt;
|
||||
|
||||
@@ -982,6 +986,7 @@ struct server_task_result_cmpl_final : server_task_result {
|
||||
{"object", "chat.completion.chunk"},
|
||||
});
|
||||
|
||||
if (include_usage) {
|
||||
// OpenAI API spec for chat.completion.chunks specifies an empty `choices` array for the last chunk when including usage
|
||||
// https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices
|
||||
deltas.push_back({
|
||||
@@ -997,6 +1002,7 @@ struct server_task_result_cmpl_final : server_task_result {
|
||||
{"total_tokens", n_decoded + n_prompt_tokens},
|
||||
}},
|
||||
});
|
||||
}
|
||||
|
||||
if (timings.prompt_n >= 0) {
|
||||
deltas.back().push_back({"timings", timings.to_json()});
|
||||
@@ -2815,6 +2821,7 @@ struct server_context {
|
||||
|
||||
res->verbose = slot.params.verbose;
|
||||
res->stream = slot.params.stream;
|
||||
res->include_usage = slot.params.include_usage;
|
||||
res->oaicompat = slot.params.oaicompat;
|
||||
res->oaicompat_model = slot.params.oaicompat_model;
|
||||
res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
|
||||
|
||||
@@ -271,8 +271,10 @@ def test_chat_completion_with_timings_per_token():
|
||||
"max_tokens": 10,
|
||||
"messages": [{"role": "user", "content": "test"}],
|
||||
"stream": True,
|
||||
"stream_options": {"include_usage": True},
|
||||
"timings_per_token": True,
|
||||
})
|
||||
stats_received = False
|
||||
for i, data in enumerate(res):
|
||||
if i == 0:
|
||||
# Check first role message for stream=True
|
||||
@@ -288,6 +290,8 @@ def test_chat_completion_with_timings_per_token():
|
||||
assert "predicted_per_second" in data["timings"]
|
||||
assert "predicted_n" in data["timings"]
|
||||
assert data["timings"]["predicted_n"] <= 10
|
||||
stats_received = True
|
||||
assert stats_received
|
||||
|
||||
|
||||
def test_logprobs():
|
||||
|
||||
Reference in New Issue
Block a user