mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-10-29 08:41:22 +00:00
tests : add unified cache server tests
This commit is contained in:
@@ -368,6 +368,37 @@ def test_completion_parallel_slots(n_slots: int, n_requests: int):
|
|||||||
# assert match_regex(re_content, res.body["content"])
|
# assert match_regex(re_content, res.body["content"])
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "n_ctx,n_slots,n_predict_vals,expected_success",
    [
        (256, 4, [80, 40, 80, 80], [True, True, True, True]),
        (256, 4, [70, 70, 70, 70], [False, False, False, False]),
        (256, 4, [90, 90, 40, 90], [False, False, True, False]),
        (256, 4, [90, 90, 40, 80], [True, True, True, True]),
    ],
)
def test_completion_unified(n_ctx, n_slots, n_predict_vals, expected_success):
    """Check per-request outcomes of parallel completions with a unified KV cache.

    The server is started with ``kv_unified`` enabled, ``n_slots`` slots and a
    context size of ``n_ctx``. One ``POST /completion`` request is issued in
    parallel for every entry of ``n_predict_vals``; ``expected_success`` holds,
    position by position, whether that request is expected to succeed.

    A request expected to succeed must answer 200 with a ``content`` field and,
    when ``timings`` is reported, exactly ``n_predict`` predicted tokens. A
    request expected to fail must answer 500 without a ``content`` field.

    NOTE(review): the pass/fail pattern in the parameter table presumably
    reflects how the slots share the single context budget -- confirm against
    the server implementation before changing the numbers.
    """
    global server
    server.n_ctx = n_ctx
    server.n_slots = n_slots
    server.kv_unified = True
    server.start()

    # One (callable, args) pair per requested generation length.
    requests = [
        (
            server.make_request,
            ("POST", "/completion", {"prompt": "A", "n_predict": token_budget}),
        )
        for token_budget in n_predict_vals
    ]
    responses = parallel_function_calls(requests)

    outcomes = zip(responses, n_predict_vals, expected_success)
    for response, token_budget, should_succeed in outcomes:
        if not should_succeed:
            assert response.status_code == 500
            assert "content" not in response.body
            continue

        assert response.status_code == 200
        assert "content" in response.body
        # Timings are optional in the response; only verify them when present.
        if "timings" in response.body:
            assert response.body["timings"]["predicted_n"] == token_budget
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"prompt,n_predict,response_fields",
|
"prompt,n_predict,response_fields",
|
||||||
[
|
[
|
||||||
|
|||||||
@@ -78,6 +78,7 @@ class ServerProcess:
|
|||||||
server_embeddings: bool | None = False
|
server_embeddings: bool | None = False
|
||||||
server_reranking: bool | None = False
|
server_reranking: bool | None = False
|
||||||
server_metrics: bool | None = False
|
server_metrics: bool | None = False
|
||||||
|
kv_unified: bool | None = False
|
||||||
server_slots: bool | None = False
|
server_slots: bool | None = False
|
||||||
pooling: str | None = None
|
pooling: str | None = None
|
||||||
draft: int | None = None
|
draft: int | None = None
|
||||||
@@ -159,6 +160,8 @@ class ServerProcess:
|
|||||||
server_args.append("--reranking")
|
server_args.append("--reranking")
|
||||||
if self.server_metrics:
|
if self.server_metrics:
|
||||||
server_args.append("--metrics")
|
server_args.append("--metrics")
|
||||||
|
if self.kv_unified:
|
||||||
|
server_args.append("--kv-unified")
|
||||||
if self.server_slots:
|
if self.server_slots:
|
||||||
server_args.append("--slots")
|
server_args.append("--slots")
|
||||||
else:
|
else:
|
||||||
|
|||||||
Reference in New Issue
Block a user