Mirror of https://github.com/ggml-org/llama.cpp.git
server : return HTTP 400 if prompt exceeds context length (#16486)
In streaming mode, when the prompt exceeds the context length, the server returns an HTTP 200 status code with a JSON error in the body. This is confusing and inconsistent with all other inference engines, which return an HTTP 4xx error in this case. This patch fixes the problem and makes the server return HTTP 400 in such cases.
committed by GitHub
parent cdb6da468c
commit 68ee98ae18
@@ -3727,7 +3727,7 @@ struct server_context {
                     }
                 } else {
                     if (slot.n_prompt_tokens() >= slot.n_ctx) {
-                        send_error(slot, "the request exceeds the available context size. try increasing the context size or enable context shift", ERROR_TYPE_EXCEED_CONTEXT_SIZE);
+                        send_error(slot, "the request exceeds the available context size, try increasing it", ERROR_TYPE_EXCEED_CONTEXT_SIZE);
                         slot.release();
                         continue;
                     }
@@ -4955,9 +4955,17 @@ int main(int argc, char ** argv) {
                 // Everything else, including multimodal completions.
                 inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
             }
 
+            const size_t n_ctx_slot = ctx_server.n_ctx / ctx_server.params_base.n_parallel;
             tasks.reserve(inputs.size());
             for (size_t i = 0; i < inputs.size(); i++) {
+                auto n_prompt_tokens = inputs[i].size();
+                if (n_prompt_tokens >= n_ctx_slot) {
+                    json error_data = format_error_response("the request exceeds the available context size, try increasing it", ERROR_TYPE_EXCEED_CONTEXT_SIZE);
+                    error_data["n_prompt_tokens"] = n_prompt_tokens;
+                    error_data["n_ctx"] = n_ctx_slot;
+                    res_error(res, error_data);
+                    return;
+                }
                 server_task task = server_task(type);
 
                 task.id = ctx_server.queue_tasks.get_new_id();
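From a client's point of view, the effect of this patch is that an oversized prompt can now be detected from the HTTP status alone, instead of parsing the streamed body for an error object. Below is a minimal, hypothetical client sketch, not part of this commit, using cpp-httplib (the HTTP library already used by the server); the address, endpoint, and request body are assumptions for illustration.

#include <cstdio>
#include <string>

#include "httplib.h"

int main() {
    // Hypothetical client: server address and endpoint are assumptions, not part of the commit.
    httplib::Client cli("http://localhost:8080");

    // A prompt longer than the per-slot context would go here; kept short for illustration.
    std::string body = R"({"prompt": "...", "n_predict": 16, "stream": true})";

    auto res = cli.Post("/completion", body, "application/json");
    if (!res) {
        std::fprintf(stderr, "request failed\n");
        return 1;
    }

    // Before this patch, a too-long prompt in streaming mode still answered with HTTP 200
    // and the error had to be parsed out of the JSON body.
    // After the patch, the status code alone identifies the problem.
    if (res->status == 400) {
        std::fprintf(stderr, "prompt rejected: %s\n", res->body.c_str());
        return 1;
    }

    std::printf("%s\n", res->body.c_str());
    return 0;
}

Note that the limit enforced by the new check is the per-slot context, ctx_server.n_ctx / n_parallel: for example, a server started with --ctx-size 8192 and --parallel 4 rejects any prompt of 2048 tokens or more.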