common : more accurate sampling timing (#17382)

* common : more accurate sampling timing
* eval-callback : minor fixes
* cont : add time_meas impl
* cont : fix log msg [no ci]
* cont : fix multiple definitions of time_meas
* llama-cli : exclude chat template init from time measurement
* cont : print percentage of unaccounted time
* cont : do not reset timings
2025-11-21 12:16:57 +00:00 · 2025-11-20 13:40:10 +02:00
parent 5088b435d4
commit 196f5083ef
7 changed files with 102 additions and 33 deletions
--- a/tools/main/main.cpp
+++ b/tools/main/main.cpp
@@ -147,11 +147,15 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    auto * mem = llama_get_memory(ctx);
-
+    llama_memory_t mem = llama_get_memory(ctx);
    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    // note: the time for chat template initialization is not negligible:
    auto chat_templates = common_chat_templates_init(model, params.chat_template);

+    // start measuring performance timings from here
+    llama_perf_context_reset(ctx);
+
    LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);

    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);