common : more accurate sampling timing (#17382)

* common : more accurate sampling timing

* eval-callback : minor fixes

* cont : add time_meas impl

* cont : fix log msg [no ci]

* cont : fix multiple definitions of time_meas

* llama-cli : exclude chat template init from time measurement

* cont : print percentage of unaccounted time

* cont : do not reset timings
This commit is contained in:
Georgi Gerganov
2025-11-20 13:40:10 +02:00
committed by GitHub
parent 5088b435d4
commit 196f5083ef
7 changed files with 102 additions and 33 deletions

View File

@@ -147,11 +147,15 @@ int main(int argc, char ** argv) {
return 1;
}
-auto * mem = llama_get_memory(ctx);
+llama_memory_t mem = llama_get_memory(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
// note: the time for chat template initialization is not negligible:
auto chat_templates = common_chat_templates_init(model, params.chat_template);
// start measuring performance timings from here
llama_perf_context_reset(ctx);
LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);