Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-28 08:31:25 +00:00)
server : include usage statistics only when the user requests them (#16052)
* server : include usage statistics only when the user requests them

  When serving the OpenAI-compatible API, we should check whether
  {"stream_options": {"include_usage": true}} is set in the request before
  deciding to send usage statistics.

  closes: #16048

* add unit test
Committed by: GitHub
Parent: e58174cecb
Commit: 2b6b55a59f
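For context, a request that opts in carries a stream_options block next to stream. A minimal illustrative payload (message content and field values are placeholders, not taken from this commit):

    # Illustrative request body for a streamed chat completion; only the
    # stream_options block is what this change starts honoring.
    request_body = {
        "messages": [{"role": "user", "content": "test"}],
        "stream": True,
        "stream_options": {"include_usage": True},
    }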
@@ -111,6 +111,7 @@ static bool server_task_type_need_logits(server_task_type task_type) {
 
 struct slot_params {
     bool stream = true;
+    bool include_usage = false;
     bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
     bool return_tokens = false;
     bool return_progress = false;
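The new flag defaults to false, so usage reporting stays opt-in. A rough Python analogue of the defaults above (hypothetical, for illustration only):

    from dataclasses import dataclass

    # Rough analogue of the slot_params defaults shown above; illustrative only,
    # field names come from the struct, the class itself is hypothetical.
    @dataclass
    class SlotParams:
        stream: bool = True
        include_usage: bool = False  # new: usage reporting is opt-in
        cache_prompt: bool = True
        return_tokens: bool = False
        return_progress: bool = False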
@@ -311,6 +312,8 @@ struct server_task {
         params.timings_per_token = json_value(data, "timings_per_token", false);
 
         params.stream = json_value(data, "stream", false);
+        auto stream_opt = json_value(data, "stream_options", json::object());
+        params.include_usage = json_value(stream_opt, "include_usage", false);
         params.cache_prompt = json_value(data, "cache_prompt", true);
         params.return_tokens = json_value(data, "return_tokens", false);
         params.return_progress = json_value(data, "return_progress", false);
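The two added json_value calls mean that a missing stream_options falls back to an empty object, which in turn makes include_usage default to false. A small Python sketch of the same lookup chain (hypothetical helper, not the server's code):

    # Sketch of the nested-default lookup performed by the two json_value calls:
    # missing or null "stream_options" -> empty object -> include_usage is false.
    def parse_include_usage(data: dict) -> bool:
        stream_opt = data.get("stream_options") or {}
        return bool(stream_opt.get("include_usage", False))

    assert parse_include_usage({}) is False
    assert parse_include_usage({"stream_options": {}}) is False
    assert parse_include_usage({"stream_options": {"include_usage": True}}) is True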
@@ -775,6 +778,7 @@ struct server_task_result_cmpl_final : server_task_result {
     llama_tokens tokens;
 
     bool stream;
+    bool include_usage;
     result_timings timings;
     std::string prompt;
 
@@ -982,6 +986,7 @@ struct server_task_result_cmpl_final : server_task_result {
                 {"object", "chat.completion.chunk"},
             });
 
+            if (include_usage) {
                 // OpenAI API spec for chat.completion.chunks specifies an empty `choices` array for the last chunk when including usage
                 // https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices
                 deltas.push_back({
@@ -997,6 +1002,7 @@ struct server_task_result_cmpl_final : server_task_result {
                         {"total_tokens", n_decoded + n_prompt_tokens},
                     }},
                 });
+            }
 
             if (timings.prompt_n >= 0) {
                 deltas.back().push_back({"timings", timings.to_json()});
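With the flag set, the final streamed chunk therefore has an empty choices array and a usage object. An illustrative sketch of its shape (numbers are made up; keys mirror the code above and the OpenAI chunk format):

    # Illustrative final chunk when include_usage is set; other chunk fields
    # (id, model, created, ...) are omitted here for brevity.
    final_usage_chunk = {
        "choices": [],                    # empty per the OpenAI streaming spec
        "object": "chat.completion.chunk",
        "usage": {
            "completion_tokens": 10,      # n_decoded
            "prompt_tokens": 8,           # n_prompt_tokens
            "total_tokens": 18,           # n_decoded + n_prompt_tokens
        },
    }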
@@ -2815,6 +2821,7 @@ struct server_context {
 
         res->verbose = slot.params.verbose;
         res->stream = slot.params.stream;
+        res->include_usage = slot.params.include_usage;
         res->oaicompat = slot.params.oaicompat;
         res->oaicompat_model = slot.params.oaicompat_model;
         res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
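From the client side, the statistics can then be read off the stream. A sketch using the requests package, where the local address and endpoint path are assumptions about a typical llama-server setup rather than part of this change:

    import json
    import requests

    # Stream a completion from a local server (assumed address) and pull the
    # usage statistics out of the final chunk.
    resp = requests.post(
        "http://localhost:8080/v1/chat/completions",
        json={
            "messages": [{"role": "user", "content": "test"}],
            "stream": True,
            "stream_options": {"include_usage": True},
        },
        stream=True,
    )
    usage = None
    for raw in resp.iter_lines():
        if not raw or not raw.startswith(b"data: "):
            continue
        payload = raw[len(b"data: "):]
        if payload == b"[DONE]":
            break
        chunk = json.loads(payload)
        if chunk.get("usage"):
            usage = chunk["usage"]  # final chunk: empty "choices", populated "usage"
    print(usage)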
@@ -271,8 +271,10 @@ def test_chat_completion_with_timings_per_token():
         "max_tokens": 10,
         "messages": [{"role": "user", "content": "test"}],
         "stream": True,
+        "stream_options": {"include_usage": True},
         "timings_per_token": True,
     })
+    stats_received = False
     for i, data in enumerate(res):
         if i == 0:
             # Check first role message for stream=True
@@ -288,6 +290,8 @@ def test_chat_completion_with_timings_per_token():
             assert "predicted_per_second" in data["timings"]
             assert "predicted_n" in data["timings"]
             assert data["timings"]["predicted_n"] <= 10
+            stats_received = True
+    assert stats_received
 
 
 def test_logprobs():
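The same opt-in also works through the official openai Python client, which exposes stream_options on chat.completions.create; the base URL, API key, and model name below are placeholders for a local OpenAI-compatible server:

    from openai import OpenAI

    # Placeholders: point the client at a local OpenAI-compatible server.
    client = OpenAI(base_url="http://localhost:8080/v1", api_key="sk-no-key-required")

    stream = client.chat.completions.create(
        model="placeholder-model",
        messages=[{"role": "user", "content": "test"}],
        stream=True,
        stream_options={"include_usage": True},
    )
    for chunk in stream:
        if chunk.usage is not None:  # only the final chunk carries usage
            print(chunk.usage)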