diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index daa6bb9b52..cb794ab647 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -2866,10 +2866,12 @@ struct server_context {
         // if context shifting is disabled, make sure that we don't run out of context
         if (!params_base.ctx_shift && slot.n_past + 1 >= slot.n_ctx) {
+            slot.truncated      = true;
             slot.stop           = STOP_TYPE_LIMIT;
             slot.has_next_token = false;
 
-            SLT_DBG(slot, "stopped due to running out of context, n_past = %d, n_ctx = %d\n", slot.n_past, slot.n_ctx);
+            SLT_DBG(slot, "stopped due to running out of context capacity, n_past = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n",
+                    slot.n_past, slot.n_prompt_tokens(), slot.n_decoded, slot.n_ctx);
         }
 
         // check the limits
@@ -2929,16 +2931,6 @@ struct server_context {
             }
         }
 
-        // if context shift is disabled, we stop when it reaches the context limit
-        if (slot.n_past >= slot.n_ctx) {
-            slot.truncated      = true;
-            slot.stop           = STOP_TYPE_LIMIT;
-            slot.has_next_token = false;
-
-            SLT_DBG(slot, "stopped due to running out of context capacity, n_past = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n",
-                    slot.n_decoded, slot.n_prompt_tokens(), slot.n_past, slot.n_ctx);
-        }
-
         if (llama_vocab_is_eog(vocab, result.tok)) {
             slot.stop           = STOP_TYPE_EOS;
             slot.has_next_token = false;
@@ -2946,17 +2938,6 @@ struct server_context {
             SLT_DBG(slot, "%s", "stopped by EOS\n");
         }
 
-        if (slot.task->params.n_predict < 1 && slot.n_prompt_tokens() + slot.n_decoded >= slot.n_ctx) {
-            slot.truncated      = true;
-            slot.stop           = STOP_TYPE_LIMIT;
-            slot.has_next_token = false; // stop prediction
-
-            SLT_WRN(slot,
-                    "n_predict (%d) is set for infinite generation. "
-                    "Limiting generated tokens to slot.n_ctx (%d) to avoid EOS-less generation infinite loop\n",
-                    slot.task->params.n_predict, slot.n_ctx);
-        }
-
         SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: %5d '%s'\n", slot.n_decoded, slot.n_remaining, result.tok, token_str.c_str());
 
         return slot.has_next_token; // continue
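
Why the two deleted blocks are safe to remove: the retained early check stops generation at slot.n_past + 1 >= slot.n_ctx, i.e. one token before the context fills, and it now also sets slot.truncated. The later conditions slot.n_past >= slot.n_ctx and slot.n_prompt_tokens() + slot.n_decoded >= slot.n_ctx therefore cannot become true while has_next_token is still set on that path. Below is a minimal standalone sketch of this reasoning, not llama.cpp code: mock_slot and its counter values are hypothetical, and it assumes n_past == n_prompt_tokens + n_decoded (no context shift or prompt truncation in flight).

// sketch.cpp: standalone illustration, not llama.cpp code
#include <cstdio>

// hypothetical stand-in for the server slot; only the counters that the
// stop conditions read are modeled here
struct mock_slot {
    int  n_ctx           = 8;  // hypothetical context size
    int  n_prompt_tokens = 5;  // hypothetical prompt length
    int  n_decoded       = 0;  // tokens generated so far
    int  n_past          = 0;  // tokens in the KV cache (prompt + decoded)
    bool has_next_token  = true;
};

int main() {
    mock_slot slot;
    slot.n_past = slot.n_prompt_tokens; // prompt has been processed

    while (slot.has_next_token) {
        slot.n_decoded += 1;
        slot.n_past    += 1;

        // the retained check: stop one token before the context fills
        if (slot.n_past + 1 >= slot.n_ctx) {
            slot.has_next_token = false;
        }

        // the removed checks: assuming n_past == n_prompt_tokens + n_decoded,
        // both reduce to n_past >= n_ctx, which the retained check prevents,
        // so this branch never fires
        if (slot.has_next_token &&
            (slot.n_past >= slot.n_ctx ||
             slot.n_prompt_tokens + slot.n_decoded >= slot.n_ctx)) {
            printf("removed check fired at n_past = %d\n", slot.n_past);
        }
    }

    printf("stopped at n_past = %d, n_ctx = %d\n", slot.n_past, slot.n_ctx);
    return 0;
}

Compiled and run, the sketch prints only "stopped at n_past = 7, n_ctx = 8": the branch modeling the removed conditions is unreachable because the retained check trips one token earlier.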