memory : remove KV cache size padding (#16812)

* memory : remove KV cache size padding

* cont : restore padding for n_kv tensor shape

* server : use slot context size instead of training context size

* server : simplify context limit logic
This commit is contained in:
Georgi Gerganov
2025-10-28 20:19:44 +02:00
committed by GitHub
parent a8ca18b4b8
commit 85a7d8677b
6 changed files with 14 additions and 54 deletions

View File

@@ -45,7 +45,7 @@ def test_ctx_shift_enabled():
@pytest.mark.parametrize("n_predict,n_token_output,truncated", [
(64, 64, False),
- (-1, 120, True),
+ (-1, 248, True), # 8 tokens prompt + 248 tokens generated = 256 tokens total
])
def test_ctx_shift_disabled_short_prompt(n_predict: int, n_token_output: int, truncated: bool):
global server