mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-11-03 09:22:01 +00:00 
			
		
		
		
	tests : add unified cache server tests
This commit is contained in:
		@@ -368,6 +368,37 @@ def test_completion_parallel_slots(n_slots: int, n_requests: int):
 | 
			
		||||
        # assert match_regex(re_content, res.body["content"])
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@pytest.mark.parametrize(
    "n_ctx,n_slots,n_predict_vals,expected_success",
    [
        (256, 4, [80, 40, 80, 80], [True, True, True, True]),
        (256, 4, [70, 70, 70, 70], [False, False, False, False]),
        (256, 4, [90, 90, 40, 90], [False, False, True, False]),
        (256, 4, [90, 90, 40, 80], [True, True, True, True]),
    ],
)
def test_completion_unified(n_ctx, n_slots, n_predict_vals, expected_success):
    """Check per-request outcomes of completions against a unified-KV server.

    The server is started with ``kv_unified`` enabled, ``n_slots`` slots and a
    context of ``n_ctx``. One ``/completion`` request is issued per entry of
    ``n_predict_vals`` through ``parallel_function_calls``; the i-th response
    must succeed (HTTP 200 with ``content``) or fail (HTTP 500 without
    ``content``) according to ``expected_success[i]``.
    """
    global server
    server.n_ctx = n_ctx
    server.n_slots = n_slots
    server.kv_unified = True
    server.start()

    # One request per requested prediction length, all sharing the same prompt.
    request_bodies = [{"prompt": "A", "n_predict": count} for count in n_predict_vals]
    tasks = [
        (server.make_request, ("POST", "/completion", body))
        for body in request_bodies
    ]
    results = parallel_function_calls(tasks)

    for res, n_predict, expect_ok in zip(results, n_predict_vals, expected_success):
        if not expect_ok:
            # Rejected requests must report a server error and carry no text.
            assert res.status_code == 500
            assert "content" not in res.body
            continue

        assert res.status_code == 200
        assert "content" in res.body
        # Timings are optional in the response; verify the count only if present.
        if "timings" in res.body:
            assert res.body["timings"]["predicted_n"] == n_predict
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@pytest.mark.parametrize(
 | 
			
		||||
    "prompt,n_predict,response_fields",
 | 
			
		||||
    [
 | 
			
		||||
 
 | 
			
		||||
@@ -78,6 +78,7 @@ class ServerProcess:
 | 
			
		||||
    server_embeddings: bool | None = False
 | 
			
		||||
    server_reranking: bool | None = False
 | 
			
		||||
    server_metrics: bool | None = False
 | 
			
		||||
    kv_unified: bool | None = False
 | 
			
		||||
    server_slots: bool | None = False
 | 
			
		||||
    pooling: str | None = None
 | 
			
		||||
    draft: int | None = None
 | 
			
		||||
@@ -159,6 +160,8 @@ class ServerProcess:
 | 
			
		||||
            server_args.append("--reranking")
 | 
			
		||||
        if self.server_metrics:
 | 
			
		||||
            server_args.append("--metrics")
 | 
			
		||||
        if self.kv_unified:
 | 
			
		||||
            server_args.append("--kv-unified")
 | 
			
		||||
        if self.server_slots:
 | 
			
		||||
            server_args.append("--slots")
 | 
			
		||||
        else:
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user