From 9d342994f53fef3cbbaa85d9acd9464a4843938c Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 30 Oct 2025 20:16:20 +0200
Subject: [PATCH] llama : add note about context size queries

---
 include/llama.h       |  4 +++-
 src/llama-context.cpp | 12 +++++++++++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/include/llama.h b/include/llama.h
index 532023557d..caca361516 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -461,6 +461,8 @@ extern "C" {
     LLAMA_API bool llama_supports_gpu_offload(void);
     LLAMA_API bool llama_supports_rpc        (void);
 
+    // NOTE: After creating a llama_context, it is recommended to query the actual values using these functions
+    // In some cases the values requested via llama_context_params may differ from the actual values used by the context
     LLAMA_API uint32_t llama_n_ctx          (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_ctx_seq      (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_batch        (const struct llama_context * ctx);
@@ -586,7 +588,7 @@ extern "C" {
     LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
 
     // Manually free a LoRA adapter
-    // Note: loaded adapters will be free when the associated model is deleted
+    // NOTE: loaded adapters will be freed when the associated model is deleted
     LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
 
     // Get the invocation tokens if the current lora is an alora
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 0190475458..e949afab21 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -112,7 +112,11 @@ llama_context::llama_context(
         }
     }
 
-    cparams.n_ctx_seq = cparams.kv_unified ? cparams.n_ctx : cparams.n_ctx / cparams.n_seq_max;
+    if (cparams.kv_unified) {
+        cparams.n_ctx_seq = cparams.n_ctx;
+    } else {
+        cparams.n_ctx_seq = cparams.n_ctx / cparams.n_seq_max;
+    }
 
     if (cparams.n_ctx_seq > hparams.n_ctx_train) {
         LLAMA_LOG_WARN("%s: capping n_ctx_seq (%u) to n_ctx_train (%u)\n", __func__, cparams.n_ctx_seq, hparams.n_ctx_train);
@@ -120,6 +124,12 @@ llama_context::llama_context(
         cparams.n_ctx_seq = hparams.n_ctx_train;
     }
 
+    if (cparams.kv_unified) {
+        cparams.n_ctx = cparams.n_ctx_seq;
+    } else {
+        cparams.n_ctx = cparams.n_ctx_seq * cparams.n_seq_max;
+    }
+
     LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max);
     LLAMA_LOG_INFO("%s: n_ctx     = %u\n", __func__, cparams.n_ctx);
     LLAMA_LOG_INFO("%s: n_ctx_seq = %u\n", __func__, cparams.n_ctx_seq);
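
Worked example of the recomputation above (illustrative numbers, not from the
patch): with kv_unified == false, requesting n_ctx = 8192 and n_seq_max = 4
first gives n_ctx_seq = 8192 / 4 = 2048. If the model was trained with
n_ctx_train = 1024, the capping step reduces n_ctx_seq to 1024, and the new
block then recomputes n_ctx = 1024 * 4 = 4096. The context therefore ends up
smaller than requested, which is exactly what the new NOTE in llama.h warns
about.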
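
A minimal sketch of the query pattern the new NOTE recommends. The model path
and the requested sizes are placeholders, and the load/create calls assume the
current llama.cpp C API (llama_model_load_from_file / llama_init_from_model):

    #include <stdio.h>
    #include "llama.h"

    int main(void) {
        llama_backend_init();

        struct llama_model_params mparams = llama_model_default_params();
        // placeholder model path
        struct llama_model * model = llama_model_load_from_file("model.gguf", mparams);
        if (model == NULL) {
            return 1;
        }

        struct llama_context_params cparams = llama_context_default_params();
        cparams.n_ctx     = 8192; // requested total context size
        cparams.n_seq_max = 4;    // requested number of parallel sequences

        struct llama_context * ctx = llama_init_from_model(model, cparams);
        if (ctx == NULL) {
            llama_model_free(model);
            return 1;
        }

        // query the actual values - the context may have adjusted them
        printf("n_ctx     = %u\n", llama_n_ctx    (ctx));
        printf("n_ctx_seq = %u\n", llama_n_ctx_seq(ctx));
        printf("n_batch   = %u\n", llama_n_batch  (ctx));

        llama_free(ctx);
        llama_model_free(model);
        llama_backend_free();
        return 0;
    }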