Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-11-04 09:32:00 +00:00)

Commit: server : simplify context limit logic

This commit is contained in:
		@@ -2866,10 +2866,12 @@ struct server_context {
 | 
			
		||||
 | 
			
		||||
        // if context shifting is disabled, make sure that we don't run out of context
 | 
			
		||||
        if (!params_base.ctx_shift && slot.n_past + 1 >= slot.n_ctx) {
 | 
			
		||||
            slot.truncated      = true;
 | 
			
		||||
            slot.stop           = STOP_TYPE_LIMIT;
 | 
			
		||||
            slot.has_next_token = false;
 | 
			
		||||
 | 
			
		||||
            SLT_DBG(slot, "stopped due to running out of context, n_past = %d, n_ctx = %d\n", slot.n_past, slot.n_ctx);
 | 
			
		||||
            SLT_DBG(slot, "stopped due to running out of context capacity, n_past = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n",
                    slot.n_past, slot.n_prompt_tokens(), slot.n_decoded, slot.n_ctx);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // check the limits
 | 
			
		||||
@@ -2929,16 +2931,6 @@ struct server_context {
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // if context shift is disabled, we stop when it reaches the context limit
 | 
			
		||||
        if (slot.n_past >= slot.n_ctx) {
 | 
			
		||||
            slot.truncated      = true;
 | 
			
		||||
            slot.stop           = STOP_TYPE_LIMIT;
 | 
			
		||||
            slot.has_next_token = false;
 | 
			
		||||
 | 
			
		||||
            SLT_DBG(slot, "stopped due to running out of context capacity, n_past = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n",
                    slot.n_past, slot.n_prompt_tokens(), slot.n_decoded, slot.n_ctx);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        if (llama_vocab_is_eog(vocab, result.tok)) {
 | 
			
		||||
            slot.stop           = STOP_TYPE_EOS;
 | 
			
		||||
            slot.has_next_token = false;
 | 
			
		||||
@@ -2946,17 +2938,6 @@ struct server_context {
 | 
			
		||||
            SLT_DBG(slot, "%s", "stopped by EOS\n");
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        if (slot.task->params.n_predict < 1 && slot.n_prompt_tokens() + slot.n_decoded >= slot.n_ctx) {
 | 
			
		||||
            slot.truncated      = true;
 | 
			
		||||
            slot.stop           = STOP_TYPE_LIMIT;
 | 
			
		||||
            slot.has_next_token = false; // stop prediction
 | 
			
		||||
 | 
			
		||||
            SLT_WRN(slot,
 | 
			
		||||
                    "n_predict (%d) is set for infinite generation. "
 | 
			
		||||
                    "Limiting generated tokens to slot.n_ctx (%d) to avoid EOS-less generation infinite loop\n",
 | 
			
		||||
                    slot.task->params.n_predict, slot.n_ctx);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: %5d '%s'\n", slot.n_decoded, slot.n_remaining, result.tok, token_str.c_str());
 | 
			
		||||
 | 
			
		||||
        return slot.has_next_token; // continue
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user