memory : remove KV cache size padding (#16812)
* memory : remove KV cache size padding
* cont : restore padding for n_kv tensor shape
* server : use slot context size instead of training context size
* server : simplify context limit logic
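The server-side bullets amount to a simple token budget: with context shifting disabled, prompt and generated tokens share the slot's context window, and that window (rather than the model's training context size) is now the limit. A minimal Python sketch of that arithmetic, consistent with the test below; the function and parameter names are illustrative, not the actual C++ server code:

def check_context_limit(n_prompt: int, n_generated: int, n_ctx_slot: int) -> bool:
    """Return True once the slot's context window is exhausted.

    Sketch only: with context shifting disabled, prompt and generated
    tokens share the slot context, so generation is truncated when their
    sum reaches n_ctx_slot (not the model's training context size).
    """
    return n_prompt + n_generated >= n_ctx_slot

# Matches the test case below: an 8-token prompt in a 256-token slot
# leaves room for 248 generated tokens before truncation.
assert check_context_limit(8, 248, 256)
assert not check_context_limit(8, 247, 256)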
@@ -45,7 +45,7 @@ def test_ctx_shift_enabled():
 @pytest.mark.parametrize("n_predict,n_token_output,truncated", [
     (64, 64, False),
-    (-1, 120, True),
+    (-1, 248, True), # 8 tokens prompt + 248 tokens generated = 256 tokens total
 ])
 def test_ctx_shift_disabled_short_prompt(n_predict: int, n_token_output: int, truncated: bool):
     global server
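The updated case replaces (-1, 120, True) with (-1, 248, True): with a 256-token slot context and an 8-token prompt, unbounded generation (n_predict=-1) can now run to 248 tokens before the response is marked truncated, as the inline comment spells out.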