server : support unified cache across slots (#16736)

* server : support unified context across slots

* cont : fix speculative decoding initialization

* context : fix n_ctx_per_seq computation (see the sketch after this list)

* server : purge slots one by one

* tests : add unified cache server tests (a test sketch follows this list)

* llama : update per-seq context computation

* test-thread-safety : handle tiny training context of the input model

* server : fix server_tokens clear()

* server : use 4 slots + unified KV by default

* llama : add note about context size queries

* cont : update todos [no ci]

* context : do not cap the size of the context

* tests : adjust parameters to be CI friendlier

* context : add warning
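
As a rough illustration of the n_ctx_per_seq items above: with a split KV cache each sequence owns an equal share of the context, while with a unified cache all slots draw on the same pool of cells. The sketch below restates that idea under stated assumptions; the names n_ctx, n_seq_max and unified mirror llama.cpp terminology, but the helper itself is hypothetical and is not the code from this commit.

    # Hedged sketch of a per-sequence context size computation.
    # n_ctx, n_seq_max and unified mirror llama.cpp parameter names;
    # this helper is illustrative, not the commit's implementation.
    def n_ctx_per_seq(n_ctx: int, n_seq_max: int, unified: bool) -> int:
        if unified:
            # All slots share one KV buffer, so a single busy sequence
            # may grow toward the full context size.
            return n_ctx
        # Split cache: the context is divided evenly across sequences.
        return n_ctx // n_seq_max

Under the new server defaults noted above (4 slots plus a unified KV cache), a single busy slot is not capped at roughly n_ctx / 4 the way it would be with a split cache.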
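
The new unified-cache server tests themselves are not part of the excerpt below, so here is a hedged sketch in the style of the existing server test suite; ServerPreset.tinyllama2, make_request and especially the n_slots and kv_unified knobs are assumptions about the test helpers, not confirmed names from this commit.

    import pytest
    from utils import *  # server test helpers; wildcard import matches the suite's style (assumed)

    server = ServerPreset.tinyllama2()

    @pytest.fixture(autouse=True)
    def create_server():
        global server
        server = ServerPreset.tinyllama2()
        server.n_slots = 4        # assumed knob: number of parallel slots
        server.kv_unified = True  # assumed knob: one KV cache shared by all slots

    def test_completion_with_unified_cache():
        global server
        server.start()
        res = server.make_request("POST", "/completion", data={
            "prompt": "Hello",
            "n_predict": 8,
        })
        assert res.status_code == 200
        assert len(res.body["content"]) > 0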
commit cd5e3b5754 (parent 87c9efc3b2)
Author: Georgi Gerganov
Date:   2025-11-02 18:14:04 +02:00
Committed by: GitHub
12 changed files with 163 additions and 48 deletions


@@ -18,7 +18,7 @@ def test_infill_without_input_extra():
         "input_suffix": "}\n",
     })
     assert res.status_code == 200
-    assert match_regex("(Ann|small|shiny|Daddy)+", res.body["content"])
+    assert match_regex("(Ann|small|shiny|Daddy|Jimmy)+", res.body["content"])
 
 
 def test_infill_with_input_extra():

@@ -34,7 +34,7 @@ def test_infill_with_input_extra():
         "input_suffix": "}\n",
     })
     assert res.status_code == 200
-    assert match_regex("(Dad|excited|park)+", res.body["content"])
+    assert match_regex("(Dad|excited|park|Jimmy)+", res.body["content"])
 
 
 @pytest.mark.parametrize("input_extra", [