server : do context shift only while generating (#17000)
@@ -3587,7 +3587,7 @@ struct server_context {
         // apply context-shift if needed
         // TODO: simplify and improve
         for (server_slot & slot : slots) {
-            if (slot.is_processing() && slot.prompt.n_tokens() + 1 >= slot.n_ctx) {
+            if (slot.state == SLOT_STATE_GENERATING && slot.prompt.n_tokens() + 1 >= slot.n_ctx) {
                 if (!params_base.ctx_shift) {
                     // this check is redundant (for good)
                     // we should never get here, because generation should already stopped in process_token()
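The change narrows the condition that triggers a context shift: previously any slot for which is_processing() returned true could be shifted (presumably including slots still decoding their prompt), whereas now only slots whose state is SLOT_STATE_GENERATING qualify, matching the commit title. Below is a minimal, hypothetical sketch of that idea; slot_sketch, its fields, and the discard policy are simplified stand-ins for illustration, not the real server_slot from llama.cpp's server.

#include <cstdio>
#include <vector>

// Hypothetical, simplified stand-ins; the real server_slot has more states and fields.
enum slot_state {
    SLOT_STATE_IDLE,
    SLOT_STATE_PROCESSING_PROMPT,
    SLOT_STATE_GENERATING,
};

struct slot_sketch {
    slot_state state    = SLOT_STATE_IDLE;
    int        n_tokens = 0;   // tokens currently held in the slot's context
    int        n_ctx    = 8;   // context size available to the slot
};

// Shift only slots that are actively generating and about to exceed their context;
// slots still processing a prompt are deliberately left alone.
static void apply_context_shift(std::vector<slot_sketch> & slots, bool ctx_shift_enabled) {
    for (slot_sketch & slot : slots) {
        if (slot.state == SLOT_STATE_GENERATING && slot.n_tokens + 1 >= slot.n_ctx) {
            if (!ctx_shift_enabled) {
                // generation should have been stopped earlier; nothing to do here
                continue;
            }
            // discard the oldest half of the tokens (hypothetical policy for the sketch)
            const int n_discard = slot.n_tokens / 2;
            slot.n_tokens -= n_discard;
            std::printf("shifted slot: discarded %d tokens, %d remain\n", n_discard, slot.n_tokens);
        }
    }
}

int main() {
    std::vector<slot_sketch> slots(2);
    slots[0].state = SLOT_STATE_PROCESSING_PROMPT; slots[0].n_tokens = 7;
    slots[1].state = SLOT_STATE_GENERATING;        slots[1].n_tokens = 7;

    // only slots[1] is shifted, even though both are near the context limit
    apply_context_shift(slots, /*ctx_shift_enabled=*/true);
    return 0;
}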