Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-28 08:31:25 +00:00)
	server: Add "tokens per second" information in the backend (#10548)
* add cmake rvv support
* add timings
* remove space
* update readme
* fix
* fix code
* remove empty line
* add test

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
@@ -133,6 +133,7 @@ struct common_params_sampling {
     bool    penalize_nl        = false; // consider newlines as a repeatable token
     bool    ignore_eos         = false;
     bool    no_perf            = false; // disable performance metrics
+    bool    timing_per_token   = false;

     std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"};     // default sequence breakers for DRY

@@ -416,6 +416,8 @@ node index.js

     `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.

+    `timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false`
+
 **Response format**

 - Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion.

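Not part of the diff itself: a minimal client-side sketch of the new option, assuming a llama-server instance listening on http://localhost:8080 and its OpenAI-compatible `/v1/chat/completions` endpoint. The `timings` field names (`prompt_per_second`, `predicted_per_second`, `predicted_n`) are the ones asserted by the new test further down.

```python
import json
import requests

# Stream a short chat completion with timings_per_token enabled.
resp = requests.post(
    "http://localhost:8080/v1/chat/completions",
    json={
        "messages": [{"role": "user", "content": "test"}],
        "max_tokens": 10,
        "stream": True,
        "timings_per_token": True,
    },
    stream=True,
)

for line in resp.iter_lines():
    # SSE stream: each payload line is prefixed with "data: "; the stream ends with "[DONE]".
    if not line or not line.startswith(b"data: ") or line == b"data: [DONE]":
        continue
    chunk = json.loads(line[len(b"data: "):])
    timings = chunk.get("timings")
    if timings:  # attached to each chunk when timings_per_token is set
        print(f"prompt: {timings['prompt_per_second']:.1f} tok/s, "
              f"generation: {timings['predicted_per_second']:.1f} tok/s "
              f"({timings['predicted_n']} tokens so far)")
```
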
@@ -177,6 +177,8 @@ struct server_slot {
     bool stopped_word   = false;
     bool stopped_limit  = false;

+    bool timings_per_token = false;
+
     bool oaicompat = false;

     std::string oaicompat_model;
@@ -882,6 +884,8 @@ struct server_context {
             slot.oaicompat_model = "";
         }

+        slot.timings_per_token       = json_value(data, "timings_per_token",  false);
+
         slot.params.stream           = json_value(data, "stream",             false);
         slot.params.cache_prompt     = json_value(data, "cache_prompt",       true);
         slot.params.n_predict        = json_value(data, "n_predict",          json_value(data, "max_tokens", defaults.n_predict));
@@ -1279,6 +1283,7 @@ struct server_context {
             {"speculative.n_max",         slot.params.speculative.n_max},
             {"speculative.n_min",         slot.params.speculative.n_min},
             {"speculative.p_min",         slot.params.speculative.p_min},
+            {"timings_per_token",         slot.timings_per_token},
         };
     }

@@ -1336,6 +1341,10 @@ struct server_context {
             res.data["model"] = slot.oaicompat_model;
         }

+        if (slot.timings_per_token) {
+            res.data["timings"] = slot.get_formated_timings();
+        }
+
         queue_results.send(res);
     }

@@ -2274,12 +2283,17 @@ struct server_context {
                 common_sampler_accept(slot.smpl, id, true);

                 slot.n_decoded += 1;
+
+                const int64_t t_current = ggml_time_us();
+
                 if (slot.n_decoded == 1) {
-                    slot.t_start_generation = ggml_time_us();
+                    slot.t_start_generation = t_current;
                     slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
                     metrics.on_prompt_eval(slot);
                 }

+                slot.t_token_generation = (t_current - slot.t_start_generation) / 1e3;
+
                 completion_token_output result;
                 result.tok = id;

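For reference, `ggml_time_us()` returns microseconds, so the divisions by `1e3` above keep `t_prompt_processing` and `t_token_generation` in milliseconds. A small worked sketch of the arithmetic, assuming the per-second figures in the formatted timings are derived as token count over elapsed milliseconds scaled back to seconds (the values below are made up for illustration):

```python
# tokens per second = tokens / (elapsed_ms / 1000)
def per_second(n_tokens: int, elapsed_ms: float) -> float:
    return 1e3 * n_tokens / elapsed_ms if elapsed_ms > 0 else 0.0

t_start_process_prompt = 0                              # us, prompt processing begins
t_start_generation     = 250_000                        # us, first token decoded
t_current              = t_start_generation + 450_000   # us, 10th token decoded

t_prompt_processing = (t_start_generation - t_start_process_prompt) / 1e3  # 250.0 ms
t_token_generation  = (t_current - t_start_generation) / 1e3               # 450.0 ms

print(per_second(8,  t_prompt_processing))  # 32.0  tok/s for 8 prompt tokens (assumed count)
print(per_second(10, t_token_generation))   # ~22.2 tok/s for 10 generated tokens
```
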
@@ -146,3 +146,20 @@ def test_invalid_chat_completion_req(messages):
     })
     assert res.status_code == 400 or res.status_code == 500
     assert "error" in res.body
+
+
+def test_chat_completion_with_timings_per_token():
+    global server
+    server.start()
+    res = server.make_stream_request("POST", "/chat/completions", data={
+        "max_tokens": 10,
+        "messages": [{"role": "user", "content": "test"}],
+        "stream": True,
+        "timings_per_token": True,
+    })
+    for data in res:
+        assert "timings" in data
+        assert "prompt_per_second" in data["timings"]
+        assert "predicted_per_second" in data["timings"]
+        assert "predicted_n" in data["timings"]
+        assert data["timings"]["predicted_n"] <= 10

@@ -650,6 +650,10 @@ static json format_final_response_oaicompat(const json & request, const json & r
         res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
     }

+    if (result.contains("timings")) {
+        res.push_back({"timings", json_value(result, "timings", json::object())});
+    }
+
     return res;
 }

@@ -740,6 +744,11 @@ static std::vector<json> format_partial_response_oaicompat(const json & result,
         {"model",   modelname},
         {"object",  "chat.completion.chunk"}
     };
+
+    if (result.contains("timings")) {
+        ret.push_back({"timings", json_value(result, "timings", json::object())});
+    }
+
     if (!finish_reason.empty()) {
         int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
         int num_prompt_tokens    = json_value(result, "tokens_evaluated", 0);