Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)

server : include usage statistics only when the user requests them (#16052)

* server : include usage statistics only when the user requests them
When serving the OpenAI-compatible API, we should check whether
{"stream_options": {"include_usage": true}} is set in the request when
deciding whether to send usage statistics.

closes: #16048
* add unit test
			
			
Radoslav Gerganov committed by GitHub

parent e58174cecb
commit 2b6b55a59f
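As the commit message notes, the streamed usage chunk is now sent only when the request opts in via "stream_options". Below is a minimal client-side sketch of opting in, assuming a llama-server instance listening on http://localhost:8080 and exposing the OpenAI-compatible /v1/chat/completions endpoint (host, port and prompt are illustrative):

import json
import requests

resp = requests.post(
    "http://localhost:8080/v1/chat/completions",  # hypothetical local server
    json={
        "messages": [{"role": "user", "content": "Hello"}],
        "stream": True,
        # Without this option the server now omits the usage chunk entirely.
        "stream_options": {"include_usage": True},
    },
    stream=True,
)

usage = None
for line in resp.iter_lines():
    # Server-sent events: payload lines are prefixed with "data: ".
    if not line.startswith(b"data: "):
        continue
    payload = line[len(b"data: "):]
    if payload == b"[DONE]":
        break
    chunk = json.loads(payload)
    # The usage chunk carries an empty "choices" array and a "usage" object.
    if chunk.get("usage"):
        usage = chunk["usage"]

print(usage)  # e.g. {"completion_tokens": ..., "prompt_tokens": ..., "total_tokens": ...}

The official OpenAI Python client exposes the same switch as its stream_options argument, so OpenAI-based code can opt in without raw HTTP.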
				
@@ -111,6 +111,7 @@ static bool server_task_type_need_logits(server_task_type task_type) {
 
 struct slot_params {
     bool stream          = true;
+    bool include_usage   = false;
     bool cache_prompt    = true; // remember the prompt to avoid reprocessing all prompt
     bool return_tokens   = false;
     bool return_progress = false;
@@ -310,17 +311,19 @@ struct server_task {
         params.verbose           = params_base.verbosity > 9;
         params.timings_per_token = json_value(data, "timings_per_token", false);
 
-        params.stream           = json_value(data, "stream",             false);
-        params.cache_prompt     = json_value(data, "cache_prompt",       true);
-        params.return_tokens    = json_value(data, "return_tokens",      false);
-        params.return_progress  = json_value(data, "return_progress",    false);
-        params.n_predict        = json_value(data, "n_predict",          json_value(data, "max_tokens", defaults.n_predict));
-        params.n_indent         = json_value(data, "n_indent",           defaults.n_indent);
-        params.n_keep           = json_value(data, "n_keep",             defaults.n_keep);
-        params.n_discard        = json_value(data, "n_discard",          defaults.n_discard);
-      //params.t_max_prompt_ms  = json_value(data, "t_max_prompt_ms",    defaults.t_max_prompt_ms); // TODO: implement
-        params.t_max_predict_ms = json_value(data, "t_max_predict_ms",   defaults.t_max_predict_ms);
-        params.response_fields  = json_value(data, "response_fields",   std::vector<std::string>());
+        params.stream           = json_value(data,       "stream",             false);
+        auto stream_opt         = json_value(data,       "stream_options",     json::object());
+        params.include_usage    = json_value(stream_opt, "include_usage",      false);
+        params.cache_prompt     = json_value(data,       "cache_prompt",       true);
+        params.return_tokens    = json_value(data,       "return_tokens",      false);
+        params.return_progress  = json_value(data,       "return_progress",    false);
+        params.n_predict        = json_value(data,       "n_predict",          json_value(data, "max_tokens", defaults.n_predict));
+        params.n_indent         = json_value(data,       "n_indent",           defaults.n_indent);
+        params.n_keep           = json_value(data,       "n_keep",             defaults.n_keep);
+        params.n_discard        = json_value(data,       "n_discard",          defaults.n_discard);
+      //params.t_max_prompt_ms  = json_value(data,       "t_max_prompt_ms",    defaults.t_max_prompt_ms); // TODO: implement
+        params.t_max_predict_ms = json_value(data,       "t_max_predict_ms",   defaults.t_max_predict_ms);
+        params.response_fields  = json_value(data,       "response_fields",   std::vector<std::string>());
 
         params.sampling.top_k              = json_value(data, "top_k",              defaults.sampling.top_k);
         params.sampling.top_p              = json_value(data, "top_p",              defaults.sampling.top_p);
@@ -775,6 +778,7 @@ struct server_task_result_cmpl_final : server_task_result {
     llama_tokens tokens;
 
     bool stream;
+    bool include_usage;
     result_timings timings;
     std::string prompt;
 
@@ -982,21 +986,23 @@ struct server_task_result_cmpl_final : server_task_result {
             {"object",             "chat.completion.chunk"},
         });
 
-        // OpenAI API spec for chat.completion.chunks specifies an empty `choices` array for the last chunk when including usage
-        // https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices
-        deltas.push_back({
-            {"choices", json::array()},
-            {"created",            t},
-            {"id",                 oaicompat_cmpl_id},
-            {"model",              oaicompat_model},
-            {"system_fingerprint", build_info},
-            {"object",             "chat.completion.chunk"},
-            {"usage", json {
-                {"completion_tokens", n_decoded},
-                {"prompt_tokens",     n_prompt_tokens},
-                {"total_tokens",      n_decoded + n_prompt_tokens},
-            }},
-        });
+        if (include_usage) {
+            // OpenAI API spec for chat.completion.chunks specifies an empty `choices` array for the last chunk when including usage
+            // https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices
+            deltas.push_back({
+                {"choices", json::array()},
+                {"created",            t},
+                {"id",                 oaicompat_cmpl_id},
+                {"model",              oaicompat_model},
+                {"system_fingerprint", build_info},
+                {"object",             "chat.completion.chunk"},
+                {"usage", json {
+                    {"completion_tokens", n_decoded},
+                    {"prompt_tokens",     n_prompt_tokens},
+                    {"total_tokens",      n_decoded + n_prompt_tokens},
+                }},
+            });
+        }
 
         if (timings.prompt_n >= 0) {
             deltas.back().push_back({"timings", timings.to_json()});
@@ -2815,6 +2821,7 @@ struct server_context {
 
         res->verbose               = slot.params.verbose;
         res->stream                = slot.params.stream;
+        res->include_usage         = slot.params.include_usage;
         res->oaicompat             = slot.params.oaicompat;
         res->oaicompat_model       = slot.params.oaicompat_model;
         res->oaicompat_cmpl_id     = slot.params.oaicompat_cmpl_id;
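Based on the fields assembled in server_task_result_cmpl_final above, the extra chunk that an opted-in client receives looks roughly like the sketch below; every value is a placeholder, only the keys come from the code:

# Illustrative shape of the gated usage chunk (placeholder values).
usage_chunk = {
    "choices": [],                          # empty, per the OpenAI streaming spec
    "created": 1726000000,                  # unix timestamp
    "id": "chatcmpl-abc123",                # hypothetical completion id
    "model": "some-model",                  # echoes the requested model name
    "system_fingerprint": "b1234-0000000",  # hypothetical build_info string
    "object": "chat.completion.chunk",
    "usage": {
        "completion_tokens": 10,
        "prompt_tokens": 8,
        "total_tokens": 18,
    },
}
# If timings are available, a "timings" object is appended to this last chunk
# (see the timings.prompt_n check above and the test below).

Clients that relied on the previously unconditional usage chunk now have to pass "stream_options": {"include_usage": true}, which is exactly what the updated unit test below does.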
@@ -271,8 +271,10 @@ def test_chat_completion_with_timings_per_token():
         "max_tokens": 10,
         "messages": [{"role": "user", "content": "test"}],
         "stream": True,
+        "stream_options": {"include_usage": True},
         "timings_per_token": True,
     })
+    stats_received = False
     for i, data in enumerate(res):
         if i == 0:
             # Check first role message for stream=True
@@ -288,6 +290,8 @@ def test_chat_completion_with_timings_per_token():
                 assert "predicted_per_second" in data["timings"]
                 assert "predicted_n" in data["timings"]
                 assert data["timings"]["predicted_n"] <= 10
+                stats_received = True
+    assert stats_received
 
 
 def test_logprobs():
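Conversely, the new default can be checked by repeating the streaming request without "stream_options" and asserting that no chunk carries usage statistics. A sketch in the same spirit as the client above, with the same hypothetical endpoint:

import json
import requests

resp = requests.post(
    "http://localhost:8080/v1/chat/completions",  # hypothetical local server
    json={"messages": [{"role": "user", "content": "Hello"}], "stream": True},
    stream=True,
)
chunks = [
    json.loads(line[len(b"data: "):])
    for line in resp.iter_lines()
    if line.startswith(b"data: ") and line != b"data: [DONE]"
]
# With include_usage left unset, no streamed chunk should contain a usage object.
assert not any(chunk.get("usage") for chunk in chunks)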