From 7ebe7f77a197cd374cf7b6cf171390160a828f95 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 28 Oct 2025 11:39:07 +0200
Subject: [PATCH] server : use slot context size instead of training context
 size

---
 tools/server/server.cpp                   | 8 +++-----
 tools/server/tests/unit/test_ctx_shift.py | 2 +-
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 4124bffa40..daa6bb9b52 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -2946,17 +2946,15 @@ struct server_context {
             SLT_DBG(slot, "%s", "stopped by EOS\n");
         }
 
-        const auto n_ctx_train = llama_model_n_ctx_train(model);
-
-        if (slot.task->params.n_predict < 1 && slot.n_prompt_tokens() + slot.n_decoded >= n_ctx_train) {
+        if (slot.task->params.n_predict < 1 && slot.n_prompt_tokens() + slot.n_decoded >= slot.n_ctx) {
             slot.truncated      = true;
             slot.stop           = STOP_TYPE_LIMIT;
             slot.has_next_token = false; // stop prediction
 
             SLT_WRN(slot,
                     "n_predict (%d) is set for infinite generation. "
-                    "Limiting generated tokens to n_ctx_train (%d) to avoid EOS-less generation infinite loop\n",
-                    slot.task->params.n_predict, n_ctx_train);
+                    "Limiting generated tokens to slot.n_ctx (%d) to avoid EOS-less generation infinite loop\n",
+                    slot.task->params.n_predict, slot.n_ctx);
         }
 
         SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: %5d '%s'\n", slot.n_decoded, slot.n_remaining, result.tok, token_str.c_str());
diff --git a/tools/server/tests/unit/test_ctx_shift.py b/tools/server/tests/unit/test_ctx_shift.py
index 4adbbde64f..7b047b7b3b 100644
--- a/tools/server/tests/unit/test_ctx_shift.py
+++ b/tools/server/tests/unit/test_ctx_shift.py
@@ -45,7 +45,7 @@ def test_ctx_shift_enabled():
 
 @pytest.mark.parametrize("n_predict,n_token_output,truncated", [
     (64, 64, False),
-    (-1, 120, True),
+    (-1, 248, True), # 8 tokens prompt + 248 tokens generated = 256 tokens total
 ])
 def test_ctx_shift_disabled_short_prompt(n_predict: int, n_token_output: int, truncated: bool):
     global server
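
For context, here is a minimal standalone sketch of the stopping rule this patch changes. The struct and field names below are simplified stand-ins, not the actual `server_slot` type from `server.cpp`: with `n_predict < 1` (infinite generation), the server now caps generation at the slot's context size `slot.n_ctx` rather than the model's training context `n_ctx_train`.

```cpp
// Minimal sketch of the new stopping rule (simplified types, not the
// real server state). Generation stops once prompt + decoded tokens
// would fill the slot's context, avoiding an EOS-less infinite loop.
#include <cstdio>

struct slot_state {
    int n_ctx;            // per-slot context size
    int n_prompt_tokens;  // tokens in the prompt
    int n_decoded;        // tokens generated so far
    int n_predict;        // requested generation limit; < 1 means "infinite"
};

// True when generation must stop to avoid an EOS-less infinite loop.
static bool hit_context_limit(const slot_state & s) {
    return s.n_predict < 1 && s.n_prompt_tokens + s.n_decoded >= s.n_ctx;
}

int main() {
    // Mirrors the updated test case: an 8-token prompt with n_ctx = 256
    // allows at most 248 generated tokens before the cap triggers.
    slot_state s = { /*n_ctx=*/256, /*n_prompt_tokens=*/8, /*n_decoded=*/0, /*n_predict=*/-1 };
    while (!hit_context_limit(s)) {
        s.n_decoded++;
    }
    printf("stopped after %d generated tokens (truncated)\n", s.n_decoded); // prints 248
    return 0;
}
```

Capping at `slot.n_ctx` ties the limit to the context the slot can actually hold, which may differ from the training context when a different context size is configured; this is also why the test's expected output changes from 120 to 248 generated tokens for a 256-token context.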