Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)

tool-call: fix Qwen 2.5 Coder support, add micro benchmarks, support trigger patterns for lazy grammars (#12034)

* sampler: turn lazy grammar trigger words to regexes
* add scripts/tool_bench.sh & .py
* constrain llama json output regardless of function name if matches at beginning
* update relaxed newline space rule in grammar tests
* support add_generation_prompt query parameter (useful for /apply_template)
* Update src/llama-grammar.cpp (Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>)

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
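
The trigger-pattern change is easiest to see in miniature. Below is a minimal Python sketch of the idea only; the actual implementation is C++ in src/llama-grammar.cpp, and the pattern strings and function names here are hypothetical. A lazy grammar leaves sampling unconstrained until a trigger fires, and compiling triggers to regexes that must match at the beginning of the output lets one trigger cover the formatting variants a model such as Qwen 2.5 Coder emits around its tool-call opener.

    import re

    # Hypothetical sketch of regex-based lazy-grammar triggers (names and
    # patterns invented for illustration; the real logic lives in
    # src/llama-grammar.cpp).
    TRIGGER_PATTERNS = [
        re.compile(r'\s*<tool_call>'),        # XML-style tool-call opener
        re.compile(r'\s*\{\s*"name"\s*:'),    # bare JSON object starting with "name"
    ]

    def grammar_is_triggered(generated_so_far: str) -> bool:
        # re.match anchors at the start, so the grammar only activates when
        # the output *begins* like a tool call, not when prose merely
        # mentions the marker later on.
        return any(p.match(generated_so_far) for p in TRIGGER_PATTERNS)

    assert grammar_is_triggered('<tool_call>{"name": "python"')
    assert not grammar_is_triggered('Sure, let me explain first.')

This also matches the "constrain ... if matches at beginning" bullet: the constraint kicks in whenever the output starts like a tool call, regardless of which function name follows.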
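
The new add_generation_prompt query parameter can be exercised against a running llama-server. The parameter name comes from the commit message; the request payload shape below mirrors the chat endpoint and is an assumption for illustration, not a documented contract.

    import requests

    # Assumes a llama-server instance listening on localhost:8080. Rendering
    # the chat template without the trailing assistant prompt
    # (add_generation_prompt=false) is the kind of inspection /apply_template
    # is useful for.
    res = requests.post(
        "http://localhost:8080/apply_template?add_generation_prompt=false",
        json={"messages": [{"role": "user", "content": "What is the weather in Istanbul?"}]},
    )
    print(res.json())  # expected to include the rendered prompt text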
Commit author: Olivier Chafik

examples/server/tests/unit/test_tool_call.py: 236 changed lines (Normal file → Executable file)

@@ -1,4 +1,12 @@
#!/usr/bin/env python
import pytest

# ensure grandparent path is in sys.path
from pathlib import Path
import sys
path = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(path))

from utils import *

server: ServerProcess
@@ -66,15 +74,8 @@ WEATHER_TOOL = {
}


def do_test_completion_with_required_tool_tiny(template_name: str, tool: dict, argument_key: str | None):
    global server
    n_predict = 512
    # server = ServerPreset.stories15m_moe()
    server.jinja = True
    server.n_predict = n_predict
    server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
    server.start(timeout_seconds=TIMEOUT_SERVER_START)
    res = server.make_request("POST", "/chat/completions", data={
def do_test_completion_with_required_tool_tiny(server: ServerProcess, tool: dict, argument_key: str | None, n_predict, **kwargs):
    res = server.make_request("POST", "/v1/chat/completions", data={
        "max_tokens": n_predict,
        "messages": [
            {"role": "system", "content": "You are a coding assistant."},
@@ -83,16 +84,14 @@ def do_test_completion_with_required_tool_tiny(template_name: str, tool: dict, a
        "tool_choice": "required",
        "tools": [tool],
        "parallel_tool_calls": False,
        "temperature": 0.0,
        "top_k": 1,
        "top_p": 1.0,
        **kwargs,
    })
    assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
    choice = res.body["choices"][0]
    tool_calls = choice["message"].get("tool_calls")
    assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
    tool_call = tool_calls[0]
    assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}'
    assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
    expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"]
    assert expected_function_name == tool_call["function"]["name"]
    actual_arguments = tool_call["function"]["arguments"]
@@ -108,7 +107,14 @@ def do_test_completion_with_required_tool_tiny(template_name: str, tool: dict, a
    ("meta-llama-Llama-3.3-70B-Instruct",             PYTHON_TOOL,          "code"),
])
def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict, argument_key: str | None):
    do_test_completion_with_required_tool_tiny(template_name, tool, argument_key)
    global server
    n_predict = 512
    # server = ServerPreset.stories15m_moe()
    server.jinja = True
    server.n_predict = n_predict
    server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
    server.start(timeout_seconds=TIMEOUT_SERVER_START)
    do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, temperature=0.0, top_k=1, top_p=1.0)


@pytest.mark.slow
@@ -130,10 +136,17 @@ def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict,
    ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B",      TEST_TOOL,            "success"),
    ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B",      PYTHON_TOOL,          "code"),
    ("fireworks-ai-llama-3-firefunction-v2",          TEST_TOOL,            "success"),
    ("fireworks-ai-llama-3-firefunction-v2",          PYTHON_TOOL,          "code"),
    # ("fireworks-ai-llama-3-firefunction-v2",          PYTHON_TOOL,          "code"),
])
def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, argument_key: str | None):
    do_test_completion_with_required_tool_tiny(template_name, tool, argument_key)
    global server
    n_predict = 512
    # server = ServerPreset.stories15m_moe()
    server.jinja = True
    server.n_predict = n_predict
    server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
    server.start(timeout_seconds=TIMEOUT_SERVER_START)
    do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict)


@pytest.mark.slow
@@ -142,25 +155,33 @@ def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict,
    (PYTHON_TOOL,  "code",     "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
    (PYTHON_TOOL,  "code",     "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),

    # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
    (TEST_TOOL,    "success",  "bartowski/gemma-2-2b-it-GGUF:Q4_K_M",              None),
    (PYTHON_TOOL,  "code",     "bartowski/gemma-2-2b-it-GGUF:Q4_K_M",              None),
    (PYTHON_TOOL,  "code",     "bartowski/gemma-2-2b-it-GGUF:Q4_K_M",              "chatml"),

    (TEST_TOOL,    "success",  "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      None),
    (PYTHON_TOOL,  "code",     "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      None),
    (PYTHON_TOOL,  "code",     "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      "chatml"),

    (TEST_TOOL,    "success",  "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M",      None),
    (PYTHON_TOOL,  "code",     "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M",      None),
    (PYTHON_TOOL,  "code",     "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M",      "chatml"),

    (TEST_TOOL,    "success",  "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M",      None),
    (PYTHON_TOOL,  "code",     "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M",      None),
    (PYTHON_TOOL,  "code",     "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M",      "chatml"),

    (TEST_TOOL,    "success",  "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        None),
    (PYTHON_TOOL,  "code",     "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        None),
    (PYTHON_TOOL,  "code",     "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        "chatml"),

    (TEST_TOOL,    "success",  "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
    (PYTHON_TOOL,  "code",     "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
    # (PYTHON_TOOL,  "code",     "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
    (PYTHON_TOOL,  "code",     "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),

    (TEST_TOOL,    "success",  "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",   ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
    (PYTHON_TOOL,  "code",     "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",   ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
    # (PYTHON_TOOL,  "code",     "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",   "chatml"),
    (PYTHON_TOOL,  "code",     "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",   "chatml"),

    (TEST_TOOL,    "success",  "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
    (PYTHON_TOOL,  "code",     "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
@@ -176,10 +197,10 @@ def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict,

    (TEST_TOOL,    "success",  "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      ("meta-llama/Llama-3.2-3B-Instruct", None)),
    (PYTHON_TOOL,  "code",     "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      ("meta-llama/Llama-3.2-3B-Instruct", None)),
    # (PYTHON_TOOL,  "code",     "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      "chatml"),
    # TODO: fix these
    # (TEST_TOOL,    "success",  "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
    # (PYTHON_TOOL,  "code",     "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
    (PYTHON_TOOL,  "code",     "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      "chatml"),

    (TEST_TOOL,    "success",  "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
    (PYTHON_TOOL,  "code",     "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
])
def test_completion_with_required_tool_real_model(tool: dict, argument_key: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
    global server
@@ -197,7 +218,7 @@ def test_completion_with_required_tool_real_model(tool: dict, argument_key: str
    elif isinstance(template_override, str):
        server.chat_template = template_override
    server.start(timeout_seconds=TIMEOUT_SERVER_START)
    res = server.make_request("POST", "/chat/completions", data={
    res = server.make_request("POST", "/v1/chat/completions", data={
        "max_tokens": n_predict,
        "messages": [
            {"role": "system", "content": "You are a coding assistant."},
@@ -215,7 +236,7 @@ def test_completion_with_required_tool_real_model(tool: dict, argument_key: str
    tool_calls = choice["message"].get("tool_calls")
    assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
    tool_call = tool_calls[0]
    assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}'
    # assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
    expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"]
    assert expected_function_name == tool_call["function"]["name"]
    actual_arguments = tool_call["function"]["arguments"]
@@ -225,13 +246,8 @@ def test_completion_with_required_tool_real_model(tool: dict, argument_key: str
        assert argument_key in actual_arguments, f"tool arguments: {json.dumps(actual_arguments)}, expected: {argument_key}"


def do_test_completion_without_tool_call(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None):
    global server
    server.jinja = True
    server.n_predict = n_predict
    server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
    server.start(timeout_seconds=TIMEOUT_SERVER_START)
    res = server.make_request("POST", "/chat/completions", data={
def do_test_completion_without_tool_call(server: ServerProcess, n_predict: int, tools: list[dict], tool_choice: str | None, **kwargs):
    res = server.make_request("POST", "/v1/chat/completions", data={
        "max_tokens": n_predict,
        "messages": [
            {"role": "system", "content": "You are a coding assistant."},
@@ -239,9 +255,7 @@ def do_test_completion_without_tool_call(template_name: str, n_predict: int, too
        ],
        "tools": tools if tools else None,
        "tool_choice": tool_choice,
        "temperature": 0.0,
        "top_k": 1,
        "top_p": 1.0,
        **kwargs,
    }, timeout=TIMEOUT_HTTP_REQUEST)
    assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
    choice = res.body["choices"][0]
@@ -254,7 +268,12 @@ def do_test_completion_without_tool_call(template_name: str, n_predict: int, too
    ("meta-llama-Llama-3.3-70B-Instruct",         128, [PYTHON_TOOL], 'none'),
])
def test_completion_without_tool_call_fast(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None):
    do_test_completion_without_tool_call(template_name, n_predict, tools, tool_choice)
    global server
    server.jinja = True
    server.n_predict = n_predict
    server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
    server.start(timeout_seconds=TIMEOUT_SERVER_START)
    do_test_completion_without_tool_call(server, n_predict, tools, tool_choice)


@pytest.mark.slow
@@ -270,7 +289,12 @@ def test_completion_without_tool_call_fast(template_name: str, n_predict: int, t
    ("meta-llama-Llama-3.2-3B-Instruct",              256, [PYTHON_TOOL], 'none'),
])
def test_completion_without_tool_call_slow(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None):
    do_test_completion_without_tool_call(template_name, n_predict, tools, tool_choice)
    global server
    server.jinja = True
    server.n_predict = n_predict
    server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
    server.start(timeout_seconds=TIMEOUT_SERVER_START)
    do_test_completion_without_tool_call(server, n_predict, tools, tool_choice)


@pytest.mark.slow
@@ -281,6 +305,12 @@ def test_completion_without_tool_call_slow(template_name: str, n_predict: int, t
    ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      None),
    ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      "chatml"),

    ("bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M",      None),
    ("bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M",      "chatml"),

    ("bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M",      None),
    ("bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M",      "chatml"),

    ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        None),
    ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        "chatml"),

@@ -324,48 +354,52 @@ def test_weather(hf_repo: str, template_override: str | Tuple[str, str | None] |
    elif isinstance(template_override, str):
        server.chat_template = template_override
    server.start(timeout_seconds=TIMEOUT_SERVER_START)
    res = server.make_request("POST", "/chat/completions", data={
        "max_tokens": n_predict,
    do_test_weather(server, max_tokens=n_predict)


def do_test_weather(server: ServerProcess, **kwargs):
    res = server.make_request("POST", "/v1/chat/completions", data={
        "messages": [
            {"role": "system", "content": "You are a chatbot that uses tools/functions. Dont overthink things."},
            {"role": "user", "content": "What is the weather in Istanbul?"},
        ],
        "tools": [WEATHER_TOOL],
        **kwargs,
    }, timeout=TIMEOUT_HTTP_REQUEST)
    assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
    choice = res.body["choices"][0]
    tool_calls = choice["message"].get("tool_calls")
    assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
    tool_call = tool_calls[0]
    assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}'
    assert tool_call["function"]["name"] == WEATHER_TOOL["function"]["name"]
    # assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
    assert tool_call["function"]["name"] == WEATHER_TOOL["function"]["name"], f'Expected weather tool call, got {tool_call["function"]["name"]}'
    actual_arguments = json.loads(tool_call["function"]["arguments"])
    assert 'location' in actual_arguments, f"location not found in {json.dumps(actual_arguments)}"
    location = actual_arguments["location"]
    assert isinstance(location, str), f"Expected location to be a string, got {type(location)}: {json.dumps(location)}"
    assert re.match('^Istanbul(, (TR|Turkey|Türkiye))?$', location), f'Expected Istanbul for location, got {location}'
    assert re.match('^Istanbul(( |, ?)(TR|Turkey|Türkiye))?$', location), f'Expected Istanbul for location, got {location}'


@pytest.mark.slow
@pytest.mark.parametrize("result_override,n_predict,hf_repo,template_override", [
    (None,                                           128,  "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",       "chatml"),
    (None,                                           128,  "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",         None),
    (None,                                           128,  "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", None),
    (None,                                           128,  "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", "chatml"),
    (None,                                           128,  "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",         "chatml"),
    (None,                                           128,  "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M",     ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
    (None,                                           128,  "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",       ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
    (None,                                           128,  "bartowski/functionary-small-v3.2-GGUF:Q8_0",        ("meetkai/functionary-medium-v3.2", None)),
    (None,                                           128,  "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M",  None),
    (None,                                           128,  "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M",  None),
    (None,                                           128,  "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M",  "chatml"),
    (None,                                           128,  "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",       None),
    ("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)",            8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),

    # TODO: fix these (wrong results, either didn't respect decimal instruction or got wrong value)
    ("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)",            8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
    # ("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)",            8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
    # (None,                                           128,  "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M",  None),
    # ("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)",            8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
])
def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
    global server
    # n_predict = 512
    server.n_slots = 1
    server.jinja = True
    server.n_ctx = 8192 * 2
@@ -379,10 +413,14 @@ def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str,
    elif isinstance(template_override, str):
        server.chat_template = template_override
    server.start(timeout_seconds=TIMEOUT_SERVER_START)
    res = server.make_request("POST", "/chat/completions", data={
    do_test_calc_result(server, result_override, n_predict)


def do_test_calc_result(server: ServerProcess, result_override: str | None, n_predict: int, **kwargs):
    res = server.make_request("POST", "/v1/chat/completions", data={
        "max_tokens": n_predict,
        "messages": [
            {"role": "system", "content": "You are a chatbot that uses tools/functions. Dont overthink things, and provide very concise answers. Do not explain your reasoning to the user. Provide any numerical values back to the user with at most two decimals."},
            {"role": "system", "content": "You are a tools-calling assistant. You express numerical values with at most two decimals."},
            {"role": "user", "content": "What's the y coordinate of a point on the unit sphere at angle 30 degrees?"},
            {
                "role": "assistant",
@@ -423,7 +461,8 @@ def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str,
                    }
                }
            }
        ]
        ],
        **kwargs,
    }, timeout=TIMEOUT_HTTP_REQUEST)
    assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
    choice = res.body["choices"][0]
@@ -434,19 +473,19 @@ def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str,
    if result_override is not None:
        assert re.match(result_override, content), f'Expected {result_override}, got {content}'
    else:
        assert re.match('^[\\s\\S]*?The (y[ -])?coordinate [\\s\\S]*?is (approximately )?0\\.56\\b|^0\\.56$', content), \
        assert re.match('^[\\s\\S]*?((That\'s|\\bis) (approximately )?)?\\b0\\.(5\\b|56\\b|556)', content), \
            f'Expected something like "The y coordinate is 0.56.", got {content}'


@pytest.mark.slow
@pytest.mark.parametrize("n_predict,reasoning_format,expect_content,expect_reasoning_content,hf_repo,template_override", [
    (128, 'deepseek',  "^The sum of 102 and 7 is 109.*",                        None,                                          "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",       None),
    (128,  None,        "^The sum of 102 and 7 is 109.*",                       None,                                          "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",       None),
    (128, 'deepseek',  "^The sum of 102 and 7 is 109[\\s\\S]*",                        None,                                          "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",       None),
    (128,  None,        "^The sum of 102 and 7 is 109[\\s\\S]*",                       None,                                          "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",       None),

    (1024, 'deepseek',  "To find the sum of.*",                                 "I need to calculate the sum of 102 and 7.*",  "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
    (1024, 'none',      "^I need[\\s\\S]*?</think>\n?To find.*",                None,                                          "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
    (1024, 'deepseek',  "To find the sum of[\\s\\S]*",                                 "I need to calculate the sum of 102 and 7[\\s\\S]*",  "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
    (1024, 'none',      "^(<think>\\s*)?I need[\\s\\S]*?</think>\\s*To find[\\s\\S]*",                None,                                          "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),

    (1024, 'deepseek',  "To find the sum of.*",                                 "First, I [\\s\\S]*",                          "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
    (1024, 'deepseek',  "To find the sum of[\\s\\S]*",                                 "First, I [\\s\\S]*",                          "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
])
def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none'] | None, expect_content: str | None, expect_reasoning_content: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
    global server
@@ -464,7 +503,7 @@ def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none']
    elif isinstance(template_override, str):
        server.chat_template = template_override
    server.start(timeout_seconds=TIMEOUT_SERVER_START)
    res = server.make_request("POST", "/chat/completions", data={
    res = server.make_request("POST", "/v1/chat/completions", data={
        "max_tokens": n_predict,
        "messages": [
            {"role": "user", "content": "What's the sum of 102 and 7?"},
@@ -476,7 +515,7 @@ def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none']

    content = choice["message"].get("content")
    if expect_content is None:
        assert content is None, f'Expected no content in {choice["message"]}'
        assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
    else:
        assert re.match(expect_content, content), f'Expected {expect_content}, got {content}'

@@ -488,46 +527,46 @@ def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none']


@pytest.mark.slow
@pytest.mark.parametrize("expected_arguments_override,hf_repo,template_override", [
    (None,                 "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
    # (None,                 "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", "chatml"),
@pytest.mark.parametrize("hf_repo,template_override", [
    ("bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),

    (None,                 "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      None),
    (None,                 "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      "chatml"),
    ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      None),
    ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",      "chatml"),

    (None,                 "bartowski/functionary-small-v3.2-GGUF:Q8_0",       ("meetkai-functionary-medium-v3.2", None)),
    (None,                 "bartowski/functionary-small-v3.2-GGUF:Q8_0",       "chatml"),
    ("bartowski/functionary-small-v3.2-GGUF:Q8_0",       ("meetkai-functionary-medium-v3.2", None)),
    ("bartowski/functionary-small-v3.2-GGUF:Q8_0",       "chatml"),

    ('{"code":"print("}',  "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
    (None,                 "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
    # ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
    ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),

    (None,                 "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      ("meta-llama-Llama-3.2-3B-Instruct", None)),
    (None,                 "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      "chatml"),
    ("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      ("meta-llama-Llama-3.2-3B-Instruct", None)),
    ("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M",      None),

    ('{"code":"print("}',  "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      ("meta-llama-Llama-3.2-3B-Instruct", None)),
    (None,                 "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      "chatml"),
    ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      ("meta-llama-Llama-3.2-3B-Instruct", None)),
    ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M",      None),

    (None,                 "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        None),
    (None,                 "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        "chatml"),
    ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        None),
    ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M",        "chatml"),

    (None,                 "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M",    ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
    (None,                 "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M",    "chatml"),
    ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M",    ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
    ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M",    "chatml"),

    (None,                 "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",      ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")),
    (None,                 "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",      "chatml"),
    ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",      ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")),
    ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M",      "chatml"),

    (None,                 "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
    (None,                 "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
    ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
    ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),

    # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
    (None,                 "bartowski/gemma-2-2b-it-GGUF:Q4_K_M",              None),
    ("bartowski/gemma-2-2b-it-GGUF:Q4_K_M",              None),
    ("bartowski/gemma-2-2b-it-GGUF:Q4_K_M",              "chatml"),
])
def test_hello_world(expected_arguments_override: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None):
def test_hello_world(hf_repo: str, template_override: str | Tuple[str, str | None] | None):
    global server
    n_predict = 512 # High because of DeepSeek R1
    server.n_slots = 1
    server.jinja = True
    server.n_ctx = 8192
    server.n_predict = 512 # High because of DeepSeek R1
    server.n_predict = n_predict
    server.model_hf_repo = hf_repo
    server.model_hf_file = None
    if isinstance(template_override, tuple):
@@ -537,31 +576,28 @@ def test_hello_world(expected_arguments_override: str | None, hf_repo: str, temp
    elif isinstance(template_override, str):
        server.chat_template = template_override
    server.start(timeout_seconds=TIMEOUT_SERVER_START)
    res = server.make_request("POST", "/chat/completions", data={
        "max_tokens": 256,

    do_test_hello_world(server, max_tokens=n_predict)


def do_test_hello_world(server: ServerProcess, **kwargs):
    res = server.make_request("POST", "/v1/chat/completions", data={
        "messages": [
            {"role": "system", "content": "You are a coding assistant."},
            {"role": "system", "content": "You are a tool-calling agent."},
            {"role": "user", "content": "say hello world with python"},
        ],
        "tools": [PYTHON_TOOL],
        # Note: without these greedy params, Functionary v3.2 writes `def hello_world():\n    print("Hello, World!")\nhello_world()` which is correct but a pain to test.
        "temperature": 0.0,
        "top_k": 1,
        "top_p": 1.0,
        **kwargs,
    }, timeout=TIMEOUT_HTTP_REQUEST)
    assert res.status_code == 200, f"Expected status code 200, got {res.status_code}"
    choice = res.body["choices"][0]
    tool_calls = choice["message"].get("tool_calls")
    assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}'
    tool_call = tool_calls[0]
    assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}'
    # assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}'
    assert tool_call["function"]["name"] == PYTHON_TOOL["function"]["name"]
    actual_arguments = tool_call["function"]["arguments"]
    if expected_arguments_override is not None:
        assert actual_arguments == expected_arguments_override
    else:
        actual_arguments = json.loads(actual_arguments)
        assert 'code' in actual_arguments, f"code not found in {json.dumps(actual_arguments)}"
        code = actual_arguments["code"]
        assert isinstance(code, str), f"Expected code to be a string, got {type(code)}: {json.dumps(code)}"
        assert re.match(r'''print\(("[Hh]ello,? [Ww]orld!?"|'[Hh]ello,? [Ww]orld!?')\)''', code), f'Expected hello world, got {code}'
    actual_arguments = json.loads(tool_call["function"]["arguments"])
    assert 'code' in actual_arguments, f"code not found in {json.dumps(actual_arguments)}"
    code = actual_arguments["code"]
    assert isinstance(code, str), f"Expected code to be a string, got {type(code)}: {json.dumps(code)}"
    assert re.match(r'''print\(("[Hh]ello,? [Ww]orld!?"|'[Hh]ello,? [Ww]orld!?')\)''', code), f'Expected hello world, got {code}'