common : more accurate sampling timing (#17382)

* common : more accurate sampling timing

* eval-callback : minor fixes

* cont : add time_meas impl

* cont : fix log msg [no ci]

* cont : fix multiple definitions of time_meas

* llama-cli : exclude chat template init from time measurement

* cont : print percentage of unaccounted time

* cont : do not reset timings
Author: Georgi Gerganov
Date: 2025-11-20 13:40:10 +02:00
Committed by: GitHub
Parent: 5088b435d4
Commit: 196f5083ef

7 changed files with 102 additions and 33 deletions


@@ -26,7 +26,6 @@
 #include <sstream>
 #include <string>
 #include <thread>
-#include <unordered_map>
 #include <unordered_set>
 #include <vector>
@@ -60,6 +59,14 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+common_time_meas::common_time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
+
+common_time_meas::~common_time_meas() {
+    if (t_start_us >= 0) {
+        t_acc += ggml_time_us() - t_start_us;
+    }
+}
+
 //
 // CPU utils
 //


@@ -2,17 +2,15 @@
 #pragma once
 
+#include "ggml-opt.h"
+#include "llama-cpp.h"
+
 #include <set>
 #include <sstream>
 #include <string>
 #include <string_view>
 #include <vector>
 #include <map>
-#include <sstream>
-#include <cmath>
-
-#include "ggml-opt.h"
-#include "llama-cpp.h"
 
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -30,6 +28,15 @@
 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
+struct common_time_meas {
+    common_time_meas(int64_t & t_acc, bool disable = false);
+    ~common_time_meas();
+
+    const int64_t t_start_us;
+
+    int64_t & t_acc;
+};
+
 struct common_adapter_lora_info {
     std::string path;
     float scale;
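
For reference, common_time_meas is a small RAII scope timer: the constructor stores ggml_time_us() (or -1 when disabled) and the destructor adds the elapsed microseconds to the accumulator passed in by reference. A minimal usage sketch, where t_acc_us, no_perf and do_work() are hypothetical placeholders and not part of this patch:

    // illustrative sketch only: t_acc_us and do_work() are hypothetical placeholders
    static int64_t t_acc_us = 0; // accumulator, analogous to common_sampler::t_total_us

    static void timed_section(bool no_perf) {
        const common_time_meas tm(t_acc_us, no_perf); // records ggml_time_us() unless no_perf is set

        do_work(); // everything up to the end of this scope is measured

        // tm is destroyed here and adds the elapsed microseconds to t_acc_us
    }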


@@ -3,9 +3,10 @@
 #include "common.h"
 #include "log.h"
 
-#include <cmath>
-#include <unordered_map>
 #include <algorithm>
+#include <cmath>
+#include <cstring>
+#include <unordered_map>
 
 // the ring buffer works similarly to std::deque, but with a fixed capacity
 // TODO: deduplicate with llama-impl.h
@@ -112,6 +113,13 @@ struct common_sampler {
     llama_token_data_array cur_p;
 
+    void reset() {
+        prev.clear();
+
+        llama_sampler_reset(grmr);
+        llama_sampler_reset(chain);
+    }
+
     void set_logits(struct llama_context * ctx, int idx) {
         const auto * logits = llama_get_logits_ith(ctx, idx);
@@ -128,6 +136,12 @@
         cur_p = { cur.data(), cur.size(), -1, false };
     }
+
+    common_time_meas tm() {
+        return common_time_meas(t_total_us, params.no_perf);
+    }
+
+    mutable int64_t t_total_us = 0;
 };
 
 std::string common_params_sampling::print() const {
@@ -298,6 +312,8 @@
 }
 
 void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
+    const auto tm = gsmpl->tm();
+
     if (accept_grammar) {
         llama_sampler_accept(gsmpl->grmr, token);
     }
@@ -308,9 +324,7 @@ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
 }
 
 void common_sampler_reset(struct common_sampler * gsmpl) {
-    llama_sampler_reset(gsmpl->grmr);
-    llama_sampler_reset(gsmpl->chain);
+    gsmpl->reset();
 }
 
 struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
@@ -327,16 +341,54 @@ struct common_sampler * common_sampler_clone(struct common_sampler * gsmpl) {
 void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
     // TODO: measure grammar performance
 
+    const double t_sampling_ms = gsmpl ? 1e-3*gsmpl->t_total_us : 0;
+
+    llama_perf_sampler_data data_smpl;
+    llama_perf_context_data data_ctx;
+
+    memset(&data_smpl, 0, sizeof(data_smpl));
+    memset(&data_ctx, 0, sizeof(data_ctx));
+
     if (gsmpl) {
-        llama_perf_sampler_print(gsmpl->chain);
+        auto & data = data_smpl;
+
+        data = llama_perf_sampler(gsmpl->chain);
+
+        // note: the sampling time includes the samplers time + extra time spent in common/sampling
+        LOG_INF("%s: sampling time = %10.2f ms\n", __func__, t_sampling_ms);
+        LOG_INF("%s: samplers time = %10.2f ms / %5d tokens\n", __func__, data.t_sample_ms, data.n_sample);
     }
 
     if (ctx) {
-        llama_perf_context_print(ctx);
+        auto & data = data_ctx;
+
+        data = llama_perf_context(ctx);
+
+        const double t_end_ms = 1e-3 * ggml_time_us();
+
+        const double t_total_ms = t_end_ms - data.t_start_ms;
+        const double t_unacc_ms = t_total_ms - (t_sampling_ms + data.t_p_eval_ms + data.t_eval_ms);
+        const double t_unacc_pc = 100.0 * t_unacc_ms / t_total_ms;
+
+        LOG_INF("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
+        LOG_INF("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+                __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
+        LOG_INF("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+                __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
+        LOG_INF("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
+        LOG_INF("%s: unaccounted time = %10.2f ms / %5.1f %% (total - sampling - prompt eval - eval) / (total)\n", __func__, t_unacc_ms, t_unacc_pc);
+        LOG_INF("%s: graphs reused = %10d\n", __func__, data.n_reused);
 
         llama_memory_breakdown_print(ctx);
     }
 }
 
 llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+    llama_synchronize(ctx);
+
+    // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
+    const auto tm = gsmpl->tm();
+
     gsmpl->set_logits(ctx, idx);
 
     auto & grmr = gsmpl->grmr;
@@ -428,6 +480,8 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
 // helpers
 
 llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
+    const auto tm = gsmpl->tm();
+
     auto * res = &gsmpl->cur_p;
 
     if (do_sort && !res->sorted) {
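
To make the new unaccounted-time report in common_perf_print above concrete, with purely illustrative numbers: if t_total_ms = 2000.0, t_sampling_ms = 150.0, data.t_p_eval_ms = 450.0 and data.t_eval_ms = 1300.0, then t_unacc_ms = 2000.0 - (150.0 + 450.0 + 1300.0) = 100.0 ms and t_unacc_pc = 100.0 * 100.0 / 2000.0 = 5.0 %. Everything that falls outside the three measured buckets (for example tokenization, printing, or application logic) ends up in this figure.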


@@ -4,10 +4,10 @@
 #include "llama.h"
 #include "ggml.h"
 
+#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
-#include <numeric>
 
 /**
  * This the arbitrary data which will be passed to each callback.
@@ -37,23 +37,23 @@ static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
     return u.f;
 }
 
-static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
+static float ggml_get_float_value(const uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
     size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
     float v;
     if (type == GGML_TYPE_F16) {
-        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
+        v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
     } else if (type == GGML_TYPE_F32) {
-        v = *(float *) &data[i];
+        v = *(const float *) &data[i];
     } else if (type == GGML_TYPE_I64) {
-        v = (float) *(int64_t *) &data[i];
+        v = (float) *(const int64_t *) &data[i];
     } else if (type == GGML_TYPE_I32) {
-        v = (float) *(int32_t *) &data[i];
+        v = (float) *(const int32_t *) &data[i];
     } else if (type == GGML_TYPE_I16) {
-        v = (float) *(int16_t *) &data[i];
+        v = (float) *(const int16_t *) &data[i];
     } else if (type == GGML_TYPE_I8) {
-        v = (float) *(int8_t *) &data[i];
+        v = (float) *(const int8_t *) &data[i];
     } else if (type == GGML_TYPE_BF16) {
-        v = ggml_compute_bf16_to_fp32(*(ggml_bf16_t *) &data[i]);
+        v = ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
     } else {
         GGML_ABORT("fatal error");
     }


@@ -23,7 +23,7 @@ time_meas::~time_meas() {
     if (t_start_us >= 0) {
         t_acc += ggml_time_us() - t_start_us;
     }
 }
 
 void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     ggml_log_set(log_callback, user_data);


@@ -472,9 +472,6 @@ static void llama_sampler_chain_reset(struct llama_sampler * smpl) {
     for (auto * smpl : chain->samplers) {
         llama_sampler_reset(smpl);
     }
-
-    chain->t_sample_us = 0;
-    chain->n_sample = 0;
 }
 
 static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampler * smpl) {
@@ -2670,8 +2667,7 @@ struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * chain) {
 void llama_perf_sampler_print(const struct llama_sampler * chain) {
     const auto data = llama_perf_sampler(chain);
 
-    LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
+    LLAMA_LOG_INFO("%s: samplers time = %10.2f ms / %5d runs\n", __func__, data.t_sample_ms, data.n_sample);
 }
 
 void llama_perf_sampler_reset(struct llama_sampler * chain) {
@@ -2681,5 +2677,6 @@ void llama_perf_sampler_reset(struct llama_sampler * chain) {
     auto * ctx = (struct llama_sampler_chain *) chain->ctx;
 
-    ctx->t_sample_us = ctx->n_sample = 0;
+    ctx->t_sample_us = 0;
+    ctx->n_sample = 0;
 }


@@ -147,11 +147,15 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    auto * mem = llama_get_memory(ctx);
+    llama_memory_t mem = llama_get_memory(ctx);
 
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
+    // note: the time for chat template initialization is not negligible:
     auto chat_templates = common_chat_templates_init(model, params.chat_template);
 
+    // start measuring performance timings from here
+    llama_perf_context_reset(ctx);
+
     LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
 
     auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
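
Note on the llama-cli hunk above: because common_chat_templates_init() can take a noticeable amount of time, llama_perf_context_reset(ctx) is called right after it, so the performance counters start counting from that point. As a result, the total time reported by common_perf_print, and with it the new unaccounted-time percentage, no longer includes the chat template initialization.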