server : do context shift only while generating (#17000)
```diff
@@ -3587,7 +3587,7 @@ struct server_context {
         // apply context-shift if needed
         // TODO: simplify and improve
         for (server_slot & slot : slots) {
-            if (slot.is_processing() && slot.prompt.n_tokens() + 1 >= slot.n_ctx) {
+            if (slot.state == SLOT_STATE_GENERATING && slot.prompt.n_tokens() + 1 >= slot.n_ctx) {
                 if (!params_base.ctx_shift) {
                     // this check is redundant (for good)
                     // we should never get here, because generation should already stopped in process_token()
```
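For context, a hedged sketch of why narrowing this condition matters: `slot.is_processing()` is presumably true for any non-idle slot, including one that is still decoding its prompt, so the old check could trigger a context shift before generation had started, while the new check restricts the shift to slots in `SLOT_STATE_GENERATING`. The toy `slot_state` enum below, the `SLOT_STATE_IDLE` / `SLOT_STATE_PROCESSING_PROMPT` names, the flattened `n_tokens` field, and the body of `is_processing()` are assumptions for illustration, not the actual `server_slot` definition from the repository.

```cpp
// Minimal sketch (not llama.cpp itself) of the predicate change in this commit.
// Assumption: a slot state machine with a prompt-processing phase distinct from
// the generation phase, and an is_processing() helper that is true for both.
#include <cstdio>

enum slot_state {
    SLOT_STATE_IDLE,
    SLOT_STATE_PROCESSING_PROMPT, // assumed name for the prompt-decoding phase
    SLOT_STATE_GENERATING,
};

struct server_slot {
    slot_state state    = SLOT_STATE_IDLE;
    int        n_ctx    = 8; // tiny context size for the demo
    int        n_tokens = 0; // tokens currently held in the slot's context

    // assumed to be true for any non-idle slot, i.e. also while decoding the prompt
    bool is_processing() const { return state != SLOT_STATE_IDLE; }
};

// old condition: can fire while the prompt is still being processed
static bool needs_shift_old(const server_slot & slot) {
    return slot.is_processing() && slot.n_tokens + 1 >= slot.n_ctx;
}

// new condition: fires only while the slot is generating tokens
static bool needs_shift_new(const server_slot & slot) {
    return slot.state == SLOT_STATE_GENERATING && slot.n_tokens + 1 >= slot.n_ctx;
}

int main() {
    server_slot slot;
    slot.state    = SLOT_STATE_PROCESSING_PROMPT;
    slot.n_tokens = 7; // one token away from the context limit

    // old: 1 (would shift mid-prompt), new: 0 (waits for generation)
    std::printf("prompt phase  -> old: %d, new: %d\n", needs_shift_old(slot), needs_shift_new(slot));

    slot.state = SLOT_STATE_GENERATING;
    // both conditions agree once the slot is generating
    std::printf("generation    -> old: %d, new: %d\n", needs_shift_old(slot), needs_shift_new(slot));
    return 0;
}
```

The inner `!params_base.ctx_shift` check is, per the comments in the diff itself, a deliberately redundant guard: when context shifting is disabled, `process_token()` should already have stopped generation before the slot reaches `n_ctx`.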