	server : fix incorrect num_tokens_predicted (#3480)
Author: Jhen-Jie Hong
@@ -504,9 +504,11 @@ struct llama_server_context
                                             });
         }
 
+        bool tg = true;
         while (n_past < embd.size())
         {
             int n_eval = (int)embd.size() - n_past;
+            tg = n_eval == 1;
             if (n_eval > params.n_batch)
             {
                 n_eval = params.n_batch;
@@ -633,8 +635,10 @@ struct llama_server_context
 
             last_n_tokens.erase(last_n_tokens.begin());
             last_n_tokens.push_back(result.tok);
-            num_tokens_predicted++;
+            if (tg) {
+                num_tokens_predicted++;
+            }
         }
 
         // add it to the context
         embd.push_back(result.tok);
@@ -1124,8 +1128,6 @@ static json format_timings(llama_server_context &llama)
 {
     const auto timings = llama_get_timings(llama.ctx);
 
-    assert(timings.n_eval == ptrdiff_t(llama.num_tokens_predicted));
-
     return json{
         {"prompt_n", timings.n_p_eval},
         {"prompt_ms", timings.t_p_eval_ms},
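For context, here is a minimal, self-contained sketch of the counting logic this commit introduces. It is not the server's actual code: fake_eval stands in for llama_eval, result_tok for the sampled token, and the prompt/generation flow is collapsed into main. Only tg, n_eval, n_past, n_batch, and num_tokens_predicted come from the diff above. The idea: prompt tokens are evaluated in batches of up to params.n_batch, while each generated token is fed back and evaluated alone, so an eval of exactly one token (n_eval == 1) marks token generation, and only those evals increment num_tokens_predicted.

// Hedged sketch of the tg/num_tokens_predicted logic; fake_eval and the
// loop structure are stand-ins, not the server's real control flow.
#include <cstddef>
#include <cstdio>
#include <vector>

// Stand-in for llama_eval: just reports the batch size it was given.
static void fake_eval(const int * /*tokens*/, int n_eval, int /*n_past*/) {
    std::printf("eval batch of %d token(s)\n", n_eval);
}

int main() {
    const int n_batch = 8;              // params.n_batch in the server
    std::vector<int> embd(36, 0);       // a 36-token prompt queued for evaluation
    std::size_t n_past = 0;             // tokens already evaluated
    int num_tokens_predicted = 0;

    for (int step = 0; step < 4; step++) {  // sample four tokens
        bool tg = true;
        while (n_past < embd.size()) {
            int n_eval = (int) embd.size() - (int) n_past;
            tg = n_eval == 1;           // single-token eval => token generation;
                                        // larger evals are prompt processing
            if (n_eval > n_batch) {
                n_eval = n_batch;
            }
            fake_eval(embd.data() + n_past, n_eval, (int) n_past);
            n_past += n_eval;
        }

        const int result_tok = 42;      // stand-in for the sampled token
        if (tg) {
            num_tokens_predicted++;     // counted only when the eval was generation
        }
        embd.push_back(result_tok);     // feed the token back for the next eval
    }

    // The 36 prompt tokens ran as batched evals and were not counted; only
    // the generated tokens that were actually evaluated were (3 of the 4
    // sampled here, since the last one is still queued).
    std::printf("num_tokens_predicted = %d\n", num_tokens_predicted);
    return 0;
}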