diff --git a/tools/server/server.cpp b/tools/server/server.cpp index b84c119579..72ad53550f 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2385,6 +2385,10 @@ struct server_context { llama_batch_free(batch); } + int32_t n_ctx_slot() const { + return params_base.kv_unified ? n_ctx : n_ctx / params_base.n_parallel; + } + bool load_model(const common_params & params) { SRV_INF("loading model '%s'\n", params.model.path.c_str()); @@ -2413,7 +2417,7 @@ struct server_context { params_dft.devices = params_base.speculative.devices; params_dft.model = params_base.speculative.model; - params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? slots.front().n_ctx : params_base.speculative.n_ctx; + params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? n_ctx_slot() : params_base.speculative.n_ctx; params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers; params_dft.n_parallel = 1; params_dft.cache_type_k = params_base.speculative.cache_type_k; @@ -2501,8 +2505,6 @@ struct server_context { } void init() { - const int32_t n_ctx_slot = params_base.kv_unified ? n_ctx : n_ctx / params_base.n_parallel; - SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel); for (int i = 0; i < params_base.n_parallel; i++) { @@ -2510,7 +2512,7 @@ struct server_context { slot.id = i; slot.ctx = ctx; - slot.n_ctx = n_ctx_slot; + slot.n_ctx = n_ctx_slot(); slot.mctx = mctx; slot.prompt.tokens.has_mtmd = mctx != nullptr; @@ -2533,7 +2535,7 @@ struct server_context { } } - SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx); + SLT_INF(slot, "new slot, n_ctx = %d\n", slot.n_ctx); slot.callback_on_release = [this](int) { queue_tasks.pop_deferred_task(); @@ -3718,7 +3720,7 @@ struct server_context { slot.n_past = 0; slot.state = SLOT_STATE_PROCESSING_PROMPT; - SLT_INF(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", + SLT_INF(slot, "new prompt, n_ctx = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.task->params.n_keep, slot.n_prompt_tokens()); // print prompt tokens (for debugging)