server : support unified cache across slots (#16736)

* server : support unified context across slots

* cont : fix speculative decoding initialization

* context : fix n_ctx_per_seq computation (see the sketch after this list)

* server : purge slots one by one

* tests : add unified cache server tests (a test sketch follows this list)

* llama : update per-seq context computation

* test-thread-safety : handle tiny training context of the input model

* server : fix server_tokens clear()

* server : use 4 slots + unified KV by default

* llama : add note about context size queries

* cont : update todos [no ci]

* context : do not cap the size of the context

* tests : adjust parameters to be CI friendlier

* context : add warning
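
As a rough illustration of the n_ctx_per_seq items above: with a split KV cache each sequence owns an equal share of the context, while with a unified cache all slots draw on the same pool of cells. The sketch below restates that idea under stated assumptions; the names n_ctx, n_seq_max and unified mirror llama.cpp terminology, but the helper itself is hypothetical and is not the code from this commit.

    # Hedged sketch of a per-sequence context size computation.
    # n_ctx, n_seq_max and unified mirror llama.cpp parameter names;
    # this helper is illustrative, not the commit's implementation.
    def n_ctx_per_seq(n_ctx: int, n_seq_max: int, unified: bool) -> int:
        if unified:
            # All slots share one KV buffer, so a single busy sequence
            # may grow toward the full context size.
            return n_ctx
        # Split cache: the context is divided evenly across sequences.
        return n_ctx // n_seq_max

Under the new server defaults noted above (4 slots plus a unified KV cache), a single busy slot is not capped at roughly n_ctx / 4 the way it would be with a split cache.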
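
The new unified-cache server tests themselves are not part of the excerpt below, so here is a hedged sketch in the style of the existing server test suite; ServerPreset.tinyllama2, make_request and especially the n_slots and kv_unified knobs are assumptions about the test helpers, not confirmed names from this commit.

    import pytest
    from utils import *  # server test helpers; wildcard import matches the suite's style (assumed)

    server = ServerPreset.tinyllama2()

    @pytest.fixture(autouse=True)
    def create_server():
        global server
        server = ServerPreset.tinyllama2()
        server.n_slots = 4        # assumed knob: number of parallel slots
        server.kv_unified = True  # assumed knob: one KV cache shared by all slots

    def test_completion_with_unified_cache():
        global server
        server.start()
        res = server.make_request("POST", "/completion", data={
            "prompt": "Hello",
            "n_predict": 8,
        })
        assert res.status_code == 200
        assert len(res.body["content"]) > 0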
commit cd5e3b5754 (parent 87c9efc3b2)
Author: Georgi Gerganov
Date:   2025-11-02 18:14:04 +02:00
Committed by: GitHub
12 changed files with 163 additions and 48 deletions


@@ -18,7 +18,7 @@ def test_infill_without_input_extra():
         "input_suffix": "}\n",
     })
     assert res.status_code == 200
-    assert match_regex("(Ann|small|shiny|Daddy)+", res.body["content"])
+    assert match_regex("(Ann|small|shiny|Daddy|Jimmy)+", res.body["content"])
 
 
 def test_infill_with_input_extra():

@@ -34,7 +34,7 @@ def test_infill_with_input_extra():
         "input_suffix": "}\n",
     })
     assert res.status_code == 200
-    assert match_regex("(Dad|excited|park)+", res.body["content"])
+    assert match_regex("(Dad|excited|park|Jimmy)+", res.body["content"])
 
 
 @pytest.mark.parametrize("input_extra", [