kv-cache : pad the cache size to 256 for performance (#17046)

* kv-cache : pad the size of the small SWA cache for performance
* context : pad the total context to 256
* cont : future-proof the swa pad
* server : adjust test params to new logic
2025-11-10 10:27:03 +00:00 · 2025-11-07 20:03:25 +02:00
parent 9eb9a1331d
commit 16bcc1259d
4 changed files with 14 additions and 7 deletions
--- a/tools/server/tests/unit/test_speculative.py
+++ b/tools/server/tests/unit/test_speculative.py
@@ -77,10 +77,10 @@ def test_different_draft_min_draft_max():

 def test_slot_ctx_not_exceeded():
    global server
-    server.n_ctx = 64
+    server.n_ctx = 256
    server.start()
    res = server.make_request("POST", "/completion", data={
-        "prompt": "Hello " * 56,
+        "prompt": "Hello " * 248,
        "temperature": 0.0,
        "top_k": 1,
        "speculative.p_min": 0.0,
@@ -91,19 +91,19 @@ def test_slot_ctx_not_exceeded():

 def test_with_ctx_shift():
    global server
-    server.n_ctx = 64
+    server.n_ctx = 256
    server.enable_ctx_shift = True
    server.start()
    res = server.make_request("POST", "/completion", data={
-        "prompt": "Hello " * 56,
+        "prompt": "Hello " * 248,
        "temperature": 0.0,
        "top_k": 1,
-        "n_predict": 64,
+        "n_predict": 256,
        "speculative.p_min": 0.0,
    })
    assert res.status_code == 200
    assert len(res.body["content"]) > 0
-    assert res.body["tokens_predicted"] == 64
+    assert res.body["tokens_predicted"] == 256
    assert res.body["truncated"] == True