Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-30 08:42:00 +00:00)
	server : export max observed n_past value (#15361)
Add tracking of the high-watermark cache usage, i.e. the largest observed n_past value, and make it available in the /metrics endpoint. Use case: tracking the largest cache usage needed under a realistic workload, to better understand memory requirements and to adjust cache size/quantization for the model/cache accordingly.
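To put the exported value to use as described above, the endpoint can be polled while the server handles a representative workload. A minimal sketch, assuming the server was started with the --metrics flag and is listening on the default http://localhost:8080 (adjust host/port as needed):

    # poll the Prometheus-style metrics endpoint and filter for the high-watermark value
    curl -s http://localhost:8080/metrics | grep n_past_max

Since the tracked value is only ever raised via std::max (see the diff below), it acts as a high watermark rather than a per-request figure.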
Author: Oleksandr Kuvshynov (committed by GitHub)
Parent: 21c17b5bef
Commit: e5155e6986
@@ -1201,6 +1201,8 @@ struct server_task_result_metrics : server_task_result {
     uint64_t n_tokens_predicted_total        = 0;
     uint64_t t_tokens_generation_total       = 0;
 
+    uint64_t n_past_max = 0;
+
     uint64_t n_prompt_tokens_processed = 0;
     uint64_t t_prompt_processing       = 0;
 
@@ -1226,6 +1228,8 @@ struct server_task_result_metrics : server_task_result {
             { "n_tokens_predicted_total",        n_tokens_predicted_total },
             { "t_prompt_processing_total",       t_prompt_processing_total },
 
+            { "n_past_max",                      n_past_max },
+
             { "n_prompt_tokens_processed",       n_prompt_tokens_processed },
             { "t_prompt_processing",             t_prompt_processing },
             { "n_tokens_predicted",              n_tokens_predicted },
@@ -1587,6 +1591,8 @@ struct server_metrics {
     uint64_t n_tokens_predicted_total        = 0;
     uint64_t t_tokens_generation_total       = 0;
 
+    uint64_t n_past_max = 0;
+
     uint64_t n_prompt_tokens_processed = 0;
     uint64_t t_prompt_processing       = 0;
 
@@ -1605,6 +1611,10 @@ struct server_metrics {
         n_prompt_tokens_processed       += slot.n_prompt_tokens_processed;
         t_prompt_processing             += slot.t_prompt_processing;
         t_prompt_processing_total       += slot.t_prompt_processing;
+
+        if (slot.n_past > 0) {
+            n_past_max = std::max(n_past_max, (uint64_t) slot.n_past);
+        }
     }
 
     void on_prediction(const server_slot & slot) {
@@ -1620,6 +1630,9 @@ struct server_metrics {
             if (slot.is_processing()) {
                 n_busy_slots_total++;
             }
+            if (slot.n_past > 0) {
+                n_past_max = std::max(n_past_max, (uint64_t) slot.n_past);
+            }
         }
     }
 
@@ -2875,6 +2888,8 @@ struct server_context {
                     res->n_tokens_predicted_total        = metrics.n_tokens_predicted_total;
                     res->t_tokens_generation_total       = metrics.t_tokens_generation_total;
 
+                    res->n_past_max = metrics.n_past_max;
+
                     res->n_prompt_tokens_processed = metrics.n_prompt_tokens_processed;
                     res->t_prompt_processing       = metrics.t_prompt_processing;
                     res->n_tokens_predicted        = metrics.n_tokens_predicted;
@@ -4077,6 +4092,10 @@ int main(int argc, char ** argv) {
                     {"name",  "n_decode_total"},
                     {"help",  "Total number of llama_decode() calls"},
                     {"value",  res_metrics->n_decode_total}
+            }, {
+                    {"name",  "n_past_max"},
+                    {"help",  "Largest observed n_past."},
+                    {"value",  res_metrics->n_past_max}
             }, {
                     {"name",  "n_busy_slots_per_decode"},
                     {"help",  "Average number of busy slots per llama_decode() call"},
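For reference, and assuming the server's usual llamacpp: prefix for exported metrics, the new entry in the /metrics output would look roughly like this (the numeric value is purely illustrative):

    # HELP llamacpp:n_past_max Largest observed n_past.
    llamacpp:n_past_max 4096

This makes it straightforward to chart the high watermark in Prometheus/Grafana alongside the other exported server metrics.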