	server : add /apply-template endpoint for additional use cases of Minja functionality (#11489)
* add /apply-template endpoint to server
* remove unnecessary line
* add /apply-template documentation
* return only "prompt" field in /apply-template
* use suggested idea instead of my overly verbose way
@@ -576,6 +576,14 @@ With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k
 
 `tokens`: Set the tokens to detokenize.
 
+### POST `/apply-template`: Apply chat template to a conversation
+
+Uses the server's prompt template formatting functionality to convert chat messages to a single string expected by a chat model as input, but does not perform inference. Instead, the prompt string is returned in the `prompt` field of the JSON response. The prompt can then be modified as desired (for example, to insert "Sure!" at the beginning of the model's response) before sending to `/completion` to generate the chat response.
+
+*Options:*
+
+`messages`: (Required) Chat turns in the same format as `/v1/chat/completions`.
+
 ### POST `/embedding`: Generate embedding of a given text
 
 > [!IMPORTANT]
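To make the workflow in the new README section concrete, here is a rough sketch of the round trip it describes: fetch the formatted prompt from `/apply-template`, edit it, and feed it to `/completion`. This is not part of the commit; it assumes a `llama-server` instance listening on `http://localhost:8080` (the default), and only the endpoint names and the `prompt` field come from this change.

```python
# Sketch of the /apply-template -> /completion round trip described above.
# Assumes a server at http://localhost:8080; messages and parameters are illustrative.
import requests

base_url = "http://localhost:8080"

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the best book?"},
]

# 1. Apply the server's chat template; no inference is performed here.
res = requests.post(f"{base_url}/apply-template", json={"messages": messages})
prompt = res.json()["prompt"]

# 2. Modify the returned prompt, e.g. pre-fill the start of the model's reply.
prompt += "Sure!"

# 3. Generate the chat response from the edited prompt.
res = requests.post(f"{base_url}/completion", json={"prompt": prompt, "n_predict": 128})
print(res.json()["content"])
```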
@@ -4124,6 +4124,14 @@ int main(int argc, char ** argv) {
         res_ok(res, root);
     };
 
+    const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
+        auto body = json::parse(req.body);
+        const auto & chat_template = body.contains("tools") && ctx_server.chat_templates.template_tool_use ? *ctx_server.chat_templates.template_tool_use : *ctx_server.chat_templates.template_default;
+        json data = oaicompat_completion_params_parse(body, chat_template, params.use_jinja);
+
+        res_ok(res, {{ "prompt", data.at("prompt") }});
+    };
+
     const auto handle_embeddings = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) {
         handle_embeddings_impl(req, res, OAICOMPAT_TYPE_NONE);
     };
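Since the handler above reuses `oaicompat_completion_params_parse`, the request body follows the same OpenAI-compatible shape as `/v1/chat/completions`, and a `tools` array (when the model ships a tool-use template) makes it format with `template_tool_use` instead of `template_default`. A hedged sketch of such a request follows; the tool definition is a made-up example, not something from this commit.

```python
# Illustrative request exercising the tool-use branch of handle_apply_template:
# the presence of "tools" selects the tool-use chat template when one exists.
# The get_weather tool below is a hypothetical example in the OpenAI-compatible format.
import requests

res = requests.post("http://localhost:8080/apply-template", json={
    "messages": [
        {"role": "user", "content": "What's the weather in Paris?"},
    ],
    "tools": [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Look up the current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }],
})
print(res.json()["prompt"])  # tool-aware prompt string; no inference is performed
```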
@@ -4300,6 +4308,7 @@ int main(int argc, char ** argv) {
     svr->Post("/v1/reranking",        handle_rerank);
     svr->Post("/tokenize",            handle_tokenize);
     svr->Post("/detokenize",          handle_detokenize);
+    svr->Post("/apply-template",      handle_apply_template);
     // LoRA adapters hotswap
     svr->Get ("/lora-adapters",       handle_lora_adapters_list);
     svr->Post("/lora-adapters",       handle_lora_adapters_apply);
@@ -121,6 +121,21 @@ def test_chat_template():
     assert res.body["__verbose"]["prompt"] == "<s> <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
 
 
+def test_apply_chat_template():
+    global server
+    server.chat_template = "command-r"
+    server.start()
+    res = server.make_request("POST", "/apply-template", data={
+        "messages": [
+            {"role": "system", "content": "You are a test."},
+            {"role": "user", "content": "Hi there"},
+        ]
+    })
+    assert res.status_code == 200
+    assert "prompt" in res.body
+    assert res.body["prompt"] == "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a test.<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
+
+
 @pytest.mark.parametrize("response_format,n_predicted,re_content", [
     ({"type": "json_object", "schema": {"const": "42"}}, 6, "\"42\""),
     ({"type": "json_object", "schema": {"items": [{"type": "integer"}]}}, 10, "[ -3000 ]"),
Nigel Bosch