Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-11-03 09:22:01 +00:00).

Commit: server: add test for token probs (#7347)
This commit is contained in:
		@@ -23,6 +23,7 @@ from prometheus_client import parser
 | 
			
		||||
def step_server_config(context, server_fqdn, server_port):
 | 
			
		||||
    context.server_fqdn = server_fqdn
 | 
			
		||||
    context.server_port = int(server_port)
 | 
			
		||||
    context.n_threads = None
 | 
			
		||||
    context.n_gpu_layer = None
 | 
			
		||||
    if 'PORT' in os.environ:
 | 
			
		||||
        context.server_port = int(os.environ['PORT'])
 | 
			
		||||
@@ -109,6 +110,11 @@ def step_n_gpu_layer(context, ngl):
 | 
			
		||||
    context.n_gpu_layer = ngl
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@step('{n_threads:d} threads')
def step_n_threads(context, n_threads):
    """Store the requested server thread count on the behave context.

    Bug fix: the original assigned ``context.n_thread`` (singular), but the
    server-launch code reads ``context.n_threads`` (which step_server_config
    initializes to None) — so the step's value was silently ignored. Assign
    the attribute the rest of the file actually checks.
    """
    context.n_threads = n_threads
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@step('{draft:d} as draft')
def step_draft(context, draft):
    """Record the scenario's parsed draft value on the shared test context."""
    context.draft = draft
 | 
			
		||||
@@ -274,13 +280,22 @@ async def step_predictions_equal(context):
 | 
			
		||||
 | 
			
		||||
@step('all predictions are different')
@async_run_until_complete
async def step_predictions_different(context):
    """Await all queued completion tasks and assert their contents differ.

    Fix: the span contained a leftover duplicate definition line
    (``async def step_predictions_equal``) from the diff it was pasted from,
    which would be a syntax error; only the renamed function is kept.

    Gathers every concurrent completion result, requires at least two
    completions so "different" is meaningful, delegates the pairwise
    comparison to assert_all_predictions_different, then clears the
    accumulated results for the next step.
    """
    n_completions = await gather_tasks_results(context)
    assert n_completions >= 2, "need at least 2 completions"
    assert_all_predictions_different(context.tasks_result)
    context.tasks_result = []
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@step('all token probabilities are equal')
@async_run_until_complete
async def step_token_probabilities_equal(context):
    """Await all queued completion tasks and assert their per-token
    probabilities are identical across completions."""
    completed = await gather_tasks_results(context)
    # Equality across completions only means something with two or more.
    assert completed >= 2, "need at least 2 completions"
    assert_all_token_probabilities_equal(context.tasks_result)
    # Reset the accumulator so later steps start from a clean slate.
    context.tasks_result = []
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@step('the completion is  truncated')
def step_assert_completion_truncated(context):
    """Zero-argument form of the truncation assertion.

    NOTE(review): the call below avoids infinite recursion only if a later
    same-named definition taking a ``truncated`` argument shadows this one at
    module level before any scenario runs — confirm against the rest of the
    file.
    """
    step_assert_completion_truncated(context, '')
 | 
			
		||||
@@ -869,6 +884,7 @@ async def request_completion(prompt,
 | 
			
		||||
                                    "id_slot": id_slot,
 | 
			
		||||
                                    "seed": seed if seed is not None else 42,
 | 
			
		||||
                                    "temperature": temperature if temperature is not None else "0.8f",
 | 
			
		||||
                                    "n_probs": 2,
 | 
			
		||||
                                },
 | 
			
		||||
                                headers=headers,
 | 
			
		||||
                                timeout=3600) as response:
 | 
			
		||||
@@ -1123,6 +1139,23 @@ def assert_all_predictions_different(completion_responses):
 | 
			
		||||
        assert content_i != content_j, "contents not different"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def assert_all_token_probabilities_equal(completion_responses):
    """Assert every response reports identical per-position token probabilities.

    Compares the ``'probs'`` list at each predicted-token position across all
    ordered pairs of responses, raising AssertionError on the first mismatch.

    Bug fixes:
    - The final ``assert`` was indented outside the inner ``j`` loop, so for
      each ``(pos, i)`` only the *last* other response was actually compared;
      every other pair escaped the check. The assert now runs per pair.
    - With a single response the old code hit the assert with ``probs_j``
      never bound (UnboundLocalError); now no comparison runs and the call
      returns cleanly.

    :param completion_responses: list of completion dicts, each carrying a
        ``'completion_probabilities'`` list of ``{'probs': [...]}`` entries.
    """
    n_predict = len(completion_responses[0]['completion_probabilities'])
    if os.environ.get('DEBUG') == 'ON':
        # Dump every probability vector up front to help diagnose a mismatch.
        for pos in range(n_predict):
            for i, response_i in enumerate(completion_responses):
                probs_i = response_i['completion_probabilities'][pos]['probs']
                print(f"pos {pos}, probs {i}: {probs_i}")
    for pos in range(n_predict):
        for i, response_i in enumerate(completion_responses):
            probs_i = response_i['completion_probabilities'][pos]['probs']
            for j, response_j in enumerate(completion_responses):
                if i == j:
                    continue
                probs_j = response_j['completion_probabilities'][pos]['probs']
                # Compare inside the loop so EVERY pair is checked.
                assert probs_i == probs_j, "contents not equal"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
async def gather_tasks_results(context):
 | 
			
		||||
    n_tasks = len(context.concurrent_tasks)
 | 
			
		||||
    if context.debug:
 | 
			
		||||
@@ -1261,6 +1294,8 @@ def start_server_background(context):
 | 
			
		||||
        server_args.extend(['--batch-size', context.n_batch])
 | 
			
		||||
    if context.n_ubatch:
 | 
			
		||||
        server_args.extend(['--ubatch-size', context.n_ubatch])
 | 
			
		||||
    if context.n_threads:
 | 
			
		||||
        server_args.extend(['--threads', context.threads])
 | 
			
		||||
    if context.n_gpu_layer:
 | 
			
		||||
        server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
 | 
			
		||||
    if context.draft is not None:
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user