From 2ca720c859b74cd1b031cfb560a4fb8739211486 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 23 Oct 2025 17:54:53 +0300
Subject: [PATCH] llama : update per-seq context computation

---
 include/llama.h                               |  1 +
 src/llama-context.cpp                         | 30 +++++++++++++------
 src/llama-context.h                           | 10 +++----
 src/llama-cparams.h                           |  1 +
 src/llama-model.cpp                           | 14 +++------
 tools/server/server.cpp                       |  8 ++---
 .../server/tests/unit/test_chat_completion.py |  8 ++---
 tools/server/tests/unit/test_infill.py        |  4 +--
 8 files changed, 40 insertions(+), 36 deletions(-)

diff --git a/include/llama.h b/include/llama.h
index 08e2ffa34c..532023557d 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -462,6 +462,7 @@ extern "C" {
     LLAMA_API bool llama_supports_rpc       (void);
 
     LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_ctx_seq  (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_ubatch   (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_seq_max  (const struct llama_context * ctx);
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 949d157c86..0190475458 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -112,9 +112,17 @@ llama_context::llama_context(
         }
     }
 
+    cparams.n_ctx_seq = cparams.kv_unified ? cparams.n_ctx : cparams.n_ctx / cparams.n_seq_max;
+
+    if (cparams.n_ctx_seq > hparams.n_ctx_train) {
+        LLAMA_LOG_WARN("%s: capping n_ctx_seq (%u) to n_ctx_train (%u)\n", __func__, cparams.n_ctx_seq, hparams.n_ctx_train);
+
+        cparams.n_ctx_seq = hparams.n_ctx_train;
+    }
+
     LLAMA_LOG_INFO("%s: n_seq_max     = %u\n",   __func__, cparams.n_seq_max);
     LLAMA_LOG_INFO("%s: n_ctx         = %u\n",   __func__, cparams.n_ctx);
-    LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n",   __func__, n_ctx_per_seq());
+    LLAMA_LOG_INFO("%s: n_ctx_seq     = %u\n",   __func__, cparams.n_ctx_seq);
     LLAMA_LOG_INFO("%s: n_batch       = %u\n",   __func__, cparams.n_batch);
     LLAMA_LOG_INFO("%s: n_ubatch      = %u\n",   __func__, cparams.n_ubatch);
     LLAMA_LOG_INFO("%s: causal_attn   = %d\n",   __func__, cparams.causal_attn);
@@ -123,14 +131,14 @@ llama_context::llama_context(
     LLAMA_LOG_INFO("%s: freq_base     = %.1f\n", __func__, cparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale    = %g\n",   __func__, cparams.rope_freq_scale);
 
-    if (n_ctx_per_seq() < hparams.n_ctx_train) {
-        LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
-                __func__, n_ctx_per_seq(), hparams.n_ctx_train);
+    if (cparams.n_ctx_seq < hparams.n_ctx_train) {
+        LLAMA_LOG_WARN("%s: n_ctx_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
+                __func__, cparams.n_ctx_seq, hparams.n_ctx_train);
     }
 
-    if (n_ctx_per_seq() > hparams.n_ctx_train) {
-        LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
-                __func__, n_ctx_per_seq(), hparams.n_ctx_train);
+    if (cparams.n_ctx_seq > hparams.n_ctx_train) {
+        LLAMA_LOG_WARN("%s: n_ctx_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
+                __func__, cparams.n_ctx_seq, hparams.n_ctx_train);
     }
 
     if (!hparams.vocab_only) {
@@ -451,8 +459,8 @@ uint32_t llama_context::n_ctx() const {
     return cparams.n_ctx;
 }
 
-uint32_t llama_context::n_ctx_per_seq() const {
-    return cparams.kv_unified ? cparams.n_ctx : cparams.n_ctx / cparams.n_seq_max;
+uint32_t llama_context::n_ctx_seq() const {
+    return cparams.n_ctx_seq;
 }
 
 uint32_t llama_context::n_batch() const {
@@ -2381,6 +2389,10 @@ uint32_t llama_n_ctx(const llama_context * ctx) {
     return ctx->n_ctx();
 }
 
+uint32_t llama_n_ctx_seq(const llama_context * ctx) {
+    return ctx->n_ctx_seq();
+}
+
 uint32_t llama_n_batch(const llama_context * ctx) {
     return ctx->n_batch();
 }
diff --git a/src/llama-context.h b/src/llama-context.h
index ed6d82cb39..20cbd78955 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -43,11 +43,11 @@ struct llama_context {
 
     ggml_backend_sched_t get_sched() const;
 
-    uint32_t n_ctx()         const;
-    uint32_t n_ctx_per_seq() const;
-    uint32_t n_batch()       const;
-    uint32_t n_ubatch()      const;
-    uint32_t n_seq_max()     const;
+    uint32_t n_ctx()     const;
+    uint32_t n_ctx_seq() const;
+    uint32_t n_batch()   const;
+    uint32_t n_ubatch()  const;
+    uint32_t n_seq_max() const;
 
     uint32_t n_threads()       const;
     uint32_t n_threads_batch() const;
diff --git a/src/llama-cparams.h b/src/llama-cparams.h
index eae7b839f4..fcef8fa976 100644
--- a/src/llama-cparams.h
+++ b/src/llama-cparams.h
@@ -8,6 +8,7 @@
 
 struct llama_cparams {
     uint32_t n_ctx;           // context size used during inference
+    uint32_t n_ctx_seq;       // context for a single sequence
     uint32_t n_batch;
     uint32_t n_ubatch;
     uint32_t n_seq_max;
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index e0308c498d..427919b2cb 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -6668,14 +6668,14 @@ float llama_model::get_rope_freq_scale(const llama_cparams & cparams, int il) co
 }
 
 ggml_tensor * llama_model::get_rope_factors(const llama_cparams & cparams, int il) const {
-    const uint32_t n_ctx_per_seq = cparams.kv_unified ? cparams.n_ctx : cparams.n_ctx / cparams.n_seq_max;
+    const uint32_t n_ctx_seq = cparams.n_ctx_seq;
 
     // choose long/short freq factors based on the context size
     if (layers[il].rope_freqs != nullptr) {
         return layers[il].rope_freqs;
     }
 
-    if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
+    if (n_ctx_seq > hparams.n_ctx_orig_yarn) {
         return layers[il].rope_long;
     }
 
@@ -20190,12 +20190,6 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                     /* filter_attn */ std::move(filter_attn),
                     /* filter_recr */ std::move(filter_recr));
         } else {
-            uint32_t n_ctx_per_stream = cparams.n_ctx;
-
-            if (!cparams.kv_unified) {
-                n_ctx_per_stream = (cparams.n_ctx + cparams.n_seq_max - 1)/cparams.n_seq_max;
-            }
-
             llama_memory_i::layer_reuse_cb reuse = nullptr;
 
             if (arch == LLM_ARCH_GEMMA3N) {
@@ -20219,7 +20213,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                     cparams.offload_kqv,
                     params.swa_full,
                     cparams.kv_unified,
-                    n_ctx_per_stream,
+                    cparams.n_ctx_seq,
                     cparams.n_seq_max,
                     cparams.n_ubatch,
                     1,
@@ -20235,7 +20229,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                     !cparams.flash_attn,
                     cparams.offload_kqv,
                     cparams.kv_unified,
-                    n_ctx_per_stream,
+                    cparams.n_ctx_seq,
                     cparams.n_seq_max,
                     1,
                     hparams.n_swa,
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 2c3fbfd926..af2f237e5d 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -2379,10 +2379,6 @@ struct server_context {
         llama_batch_free(batch);
     }
 
-    int32_t n_ctx_slot() const {
-        return params_base.kv_unified ? n_ctx : n_ctx / params_base.n_parallel;
-    }
-
     bool load_model(const common_params & params) {
         SRV_INF("loading model '%s'\n", params.model.path.c_str());
 
@@ -2411,7 +2407,7 @@ struct server_context {
 
             params_dft.devices      = params_base.speculative.devices;
             params_dft.model        = params_base.speculative.model;
-            params_dft.n_ctx        = params_base.speculative.n_ctx == 0 ? n_ctx_slot() : params_base.speculative.n_ctx;
+            params_dft.n_ctx        = params_base.speculative.n_ctx == 0 ? llama_n_ctx_seq(ctx) : params_base.speculative.n_ctx;
             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
             params_dft.n_parallel   = 1;
             params_dft.cache_type_k = params_base.speculative.cache_type_k;
@@ -2506,7 +2502,7 @@ struct server_context {
 
             slot.id = i;
             slot.ctx = ctx;
-            slot.n_ctx = n_ctx_slot();
+            slot.n_ctx = llama_n_ctx_seq(ctx);
             slot.mctx = mctx;
 
             slot.prompt.tokens.has_mtmd = mctx != nullptr;
diff --git a/tools/server/tests/unit/test_chat_completion.py b/tools/server/tests/unit/test_chat_completion.py
index d56d3d5f17..392e0efecd 100644
--- a/tools/server/tests/unit/test_chat_completion.py
+++ b/tools/server/tests/unit/test_chat_completion.py
@@ -433,21 +433,21 @@ def test_context_size_exceeded_stream():
 @pytest.mark.parametrize(
     "n_batch,batch_count,reuse_cache",
     [
-        (64, 15, False),
+        (64, 3, False),
         (64, 1, True),
     ]
 )
-def test_return_progresssss(n_batch, batch_count, reuse_cache):
+def test_return_progress(n_batch, batch_count, reuse_cache):
     global server
     server.n_batch = n_batch
-    server.n_ctx = 2048
+    server.n_ctx = 256
     server.n_slots = 1
     server.start()
     def make_cmpl_request():
         return server.make_stream_request("POST", "/chat/completions", data={
             "max_tokens": 10,
             "messages": [
-                {"role": "user", "content": "This is a test" * 100},
+                {"role": "user", "content": "This is a test" * 10},
             ],
             "stream": True,
             "return_progress": True,
diff --git a/tools/server/tests/unit/test_infill.py b/tools/server/tests/unit/test_infill.py
index 73dacdae81..cd1a391b4a 100644
--- a/tools/server/tests/unit/test_infill.py
+++ b/tools/server/tests/unit/test_infill.py
@@ -18,7 +18,7 @@ def test_infill_without_input_extra():
         "input_suffix": "}\n",
     })
     assert res.status_code == 200
-    assert match_regex("(Ann|small|shiny|Daddy)+", res.body["content"])
+    assert match_regex("(Ann|small|shiny|Daddy|Jimmy)+", res.body["content"])
 
 
 def test_infill_with_input_extra():
@@ -34,7 +34,7 @@ def test_infill_with_input_extra():
         "input_suffix": "}\n",
     })
     assert res.status_code == 200
-    assert match_regex("(Dad|excited|park)+", res.body["content"])
+    assert match_regex("(Dad|excited|park|Jimmy)+", res.body["content"])
 
 
 @pytest.mark.parametrize("input_extra", [
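
Note (not part of the patch): a minimal sketch of how a caller might query the new per-sequence context size through the public API added above. With a split (non-unified) KV cache, llama_n_ctx_seq() reports n_ctx / n_seq_max capped to the model's training context; with a unified KV cache it reports the full n_ctx (also capped). The model path and the parameter values below are placeholders, not values taken from this patch.

// sketch: inspect the per-sequence context after creating a context
#include "llama.h"

#include <cstdio>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams); // placeholder path

    if (model == nullptr) {
        llama_backend_free();
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx     = 8192; // total context, shared across sequences (placeholder)
    cparams.n_seq_max = 4;    // number of parallel sequences (placeholder)

    llama_context * ctx = llama_init_from_model(model, cparams);

    if (ctx == nullptr) {
        llama_model_free(model);
        llama_backend_free();
        return 1;
    }

    // with the default split KV cache this prints 8192 and 2048 (unless the
    // model's training context is smaller, in which case n_ctx_seq is capped)
    printf("n_ctx     = %u\n", llama_n_ctx(ctx));
    printf("n_ctx_seq = %u\n", llama_n_ctx_seq(ctx));

    llama_free(ctx);
    llama_model_free(model);
    llama_backend_free();

    return 0;
}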