diff --git a/tools/server/README.md b/tools/server/README.md
index 6828ef7382..8fd478eb32 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -512,7 +512,7 @@ These words will not be included in the completion, so make sure to add them to
 
 `timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false`
 
-`return_progress`: Include prompt processing progress in `stream` mode. The progress will be contained inside `prompt_progress` with 3 values: `total`, `cache` and `processed`. The overall progress is `processed/total`, while the actual timed progress is `(processed-cache)/(total-cache)`. Default: `false`
+`return_progress`: Include prompt processing progress in `stream` mode. The progress will be contained inside `prompt_progress` with 4 values: `total`, `cache`, `processed`, and `time_ms`. The overall progress is `processed/total`, while the actual timed progress is `(processed-cache)/(total-cache)`. The `time_ms` field contains the elapsed time in milliseconds since prompt processing started. Default: `false`
 
 `post_sampling_probs`: Returns the probabilities of top `n_probs` tokens after applying sampling chain.
 
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 164e8cf4e7..9d91e32d1f 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -3078,7 +3078,7 @@ struct server_context {
             res->progress.total     = slot.task->n_tokens();
             res->progress.cache     = slot.n_prompt_tokens_cache;
             res->progress.processed = slot.prompt.tokens.size();
-            res->progress.time_ms   = (ggml_time_us() - slot.t_start_process_prompt / 1000);
+            res->progress.time_ms   = (ggml_time_us() - slot.t_start_process_prompt) / 1000;
         } else {
             res->content = tkn.text_to_send;
             res->tokens  = { tkn.tok };
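
Note on the `server.cpp` hunk: this is an operator-precedence fix. In the old line, `/ 1000` bound only to `slot.t_start_process_prompt`, so `time_ms` reported a near-raw microsecond timestamp rather than elapsed time; the added parentheses compute the full `ggml_time_us() - slot.t_start_process_prompt` difference first, then convert it to milliseconds.

To illustrate the documented behavior, a streamed chunk carrying `prompt_progress` might look like the sketch below; the field values are made up for illustration and the surrounding chunk fields are abbreviated. With these numbers, the overall progress is `processed/total = 512/1024 = 0.5`, while the actual timed progress is `(processed-cache)/(total-cache) = (512-256)/(1024-256) ≈ 0.33`.

```json
{
  "content": "",
  "stop": false,
  "prompt_progress": {
    "total": 1024,
    "cache": 256,
    "processed": 512,
    "time_ms": 350
  }
}
```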