mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-11-21 12:16:57 +00:00
common : more accurate sampling timing (#17382)
* common : more accurate sampling timing * eval-callback : minor fixes * cont : add time_meas impl * cont : fix log msg [no ci] * cont : fix multiple definitions of time_meas * llama-cli : exclude chat template init from time measurement * cont : print percentage of unaccounted time * cont : do not reset timings
This commit is contained in:
@@ -26,7 +26,6 @@
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
@@ -60,6 +59,14 @@
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
common_time_meas::common_time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
|
||||
|
||||
common_time_meas::~common_time_meas() {
|
||||
if (t_start_us >= 0) {
|
||||
t_acc += ggml_time_us() - t_start_us;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// CPU utils
|
||||
//
|
||||
|
||||
@@ -2,17 +2,15 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ggml-opt.h"
|
||||
#include "llama-cpp.h"
|
||||
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <sstream>
|
||||
#include <cmath>
|
||||
|
||||
#include "ggml-opt.h"
|
||||
#include "llama-cpp.h"
|
||||
|
||||
#ifdef _WIN32
|
||||
#define DIRECTORY_SEPARATOR '\\'
|
||||
@@ -30,6 +28,15 @@
|
||||
|
||||
#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
|
||||
|
||||
struct common_time_meas {
|
||||
common_time_meas(int64_t & t_acc, bool disable = false);
|
||||
~common_time_meas();
|
||||
|
||||
const int64_t t_start_us;
|
||||
|
||||
int64_t & t_acc;
|
||||
};
|
||||
|
||||
struct common_adapter_lora_info {
|
||||
std::string path;
|
||||
float scale;
|
||||
|
||||
@@ -3,9 +3,10 @@
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <unordered_map>
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <unordered_map>
|
||||
|
||||
// the ring buffer works similarly to std::deque, but with a fixed capacity
|
||||
// TODO: deduplicate with llama-impl.h
|
||||
@@ -112,6 +113,13 @@ struct common_sampler {
|
||||
|
||||
llama_token_data_array cur_p;
|
||||
|
||||
void reset() {
|
||||
prev.clear();
|
||||
|
||||
llama_sampler_reset(grmr);
|
||||
llama_sampler_reset(chain);
|
||||
}
|
||||
|
||||
void set_logits(struct llama_context * ctx, int idx) {
|
||||
const auto * logits = llama_get_logits_ith(ctx, idx);
|
||||
|
||||
@@ -128,6 +136,12 @@ struct common_sampler {
|
||||
|
||||
cur_p = { cur.data(), cur.size(), -1, false };
|
||||
}
|
||||
|
||||
common_time_meas tm() {
|
||||
return common_time_meas(t_total_us, params.no_perf);
|
||||
}
|
||||
|
||||
mutable int64_t t_total_us = 0;
|
||||
};
|
||||
|
||||
std::string common_params_sampling::print() const {
|
||||
@@ -298,6 +312,8 @@ void common_sampler_free(struct common_sampler * gsmpl) {
|
||||
}
|
||||
|
||||
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
|
||||
const auto tm = gsmpl->tm();
|
||||
|
||||
if (accept_grammar) {
|
||||
llama_sampler_accept(gsmpl->grmr, token);
|
||||
}
|
||||
@@ -308,9 +324,7 @@ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, boo
|
||||
}
|
||||
|
||||
void common_sampler_reset(struct common_sampler * gsmpl) {
|
||||
llama_sampler_reset(gsmpl->grmr);
|
||||
|
||||
llama_sampler_reset(gsmpl->chain);
|
||||
gsmpl->reset();
|
||||
}
|
||||
|
||||
struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
|
||||
@@ -327,16 +341,54 @@ struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
|
||||
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
|
||||
// TODO: measure grammar performance
|
||||
|
||||
const double t_sampling_ms = gsmpl ? 1e-3*gsmpl->t_total_us : 0;
|
||||
|
||||
llama_perf_sampler_data data_smpl;
|
||||
llama_perf_context_data data_ctx;
|
||||
|
||||
memset(&data_smpl, 0, sizeof(data_smpl));
|
||||
memset(&data_ctx, 0, sizeof(data_ctx));
|
||||
|
||||
if (gsmpl) {
|
||||
llama_perf_sampler_print(gsmpl->chain);
|
||||
auto & data = data_smpl;
|
||||
|
||||
data = llama_perf_sampler(gsmpl->chain);
|
||||
|
||||
// note: the sampling time includes the samplers time + extra time spent in common/sampling
|
||||
LOG_INF("%s: sampling time = %10.2f ms\n", __func__, t_sampling_ms);
|
||||
LOG_INF("%s: samplers time = %10.2f ms / %5d tokens\n", __func__, data.t_sample_ms, data.n_sample);
|
||||
}
|
||||
|
||||
if (ctx) {
|
||||
llama_perf_context_print(ctx);
|
||||
auto & data = data_ctx;
|
||||
|
||||
data = llama_perf_context(ctx);
|
||||
|
||||
const double t_end_ms = 1e-3 * ggml_time_us();
|
||||
|
||||
const double t_total_ms = t_end_ms - data.t_start_ms;
|
||||
const double t_unacc_ms = t_total_ms - (t_sampling_ms + data.t_p_eval_ms + data.t_eval_ms);
|
||||
const double t_unacc_pc = 100.0 * t_unacc_ms / t_total_ms;
|
||||
|
||||
LOG_INF("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
|
||||
LOG_INF("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
|
||||
LOG_INF("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
|
||||
LOG_INF("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
|
||||
LOG_INF("%s: unaccounted time = %10.2f ms / %5.1f %% (total - sampling - prompt eval - eval) / (total)\n", __func__, t_unacc_ms, t_unacc_pc);
|
||||
LOG_INF("%s: graphs reused = %10d\n", __func__, data.n_reused);
|
||||
|
||||
llama_memory_breakdown_print(ctx);
|
||||
}
|
||||
}
|
||||
|
||||
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
|
||||
llama_synchronize(ctx);
|
||||
|
||||
// start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
|
||||
const auto tm = gsmpl->tm();
|
||||
|
||||
gsmpl->set_logits(ctx, idx);
|
||||
|
||||
auto & grmr = gsmpl->grmr;
|
||||
@@ -428,6 +480,8 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
|
||||
// helpers
|
||||
|
||||
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
|
||||
const auto tm = gsmpl->tm();
|
||||
|
||||
auto * res = &gsmpl->cur_p;
|
||||
|
||||
if (do_sort && !res->sorted) {
|
||||
|
||||
@@ -4,10 +4,10 @@
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <numeric>
|
||||
|
||||
/**
|
||||
* This the arbitrary data which will be passed to each callback.
|
||||
@@ -37,23 +37,23 @@ static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
|
||||
return u.f;
|
||||
}
|
||||
|
||||
static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
|
||||
static float ggml_get_float_value(const uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
|
||||
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
|
||||
float v;
|
||||
if (type == GGML_TYPE_F16) {
|
||||
v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
|
||||
v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
|
||||
} else if (type == GGML_TYPE_F32) {
|
||||
v = *(float *) &data[i];
|
||||
v = *(const float *) &data[i];
|
||||
} else if (type == GGML_TYPE_I64) {
|
||||
v = (float) *(int64_t *) &data[i];
|
||||
v = (float) *(const int64_t *) &data[i];
|
||||
} else if (type == GGML_TYPE_I32) {
|
||||
v = (float) *(int32_t *) &data[i];
|
||||
v = (float) *(const int32_t *) &data[i];
|
||||
} else if (type == GGML_TYPE_I16) {
|
||||
v = (float) *(int16_t *) &data[i];
|
||||
v = (float) *(const int16_t *) &data[i];
|
||||
} else if (type == GGML_TYPE_I8) {
|
||||
v = (float) *(int8_t *) &data[i];
|
||||
v = (float) *(const int8_t *) &data[i];
|
||||
} else if (type == GGML_TYPE_BF16) {
|
||||
v = ggml_compute_bf16_to_fp32(*(ggml_bf16_t *) &data[i]);
|
||||
v = ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
|
||||
} else {
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
@@ -20,10 +20,10 @@ static llama_logger_state g_logger_state;
|
||||
time_meas::time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
|
||||
|
||||
time_meas::~time_meas() {
|
||||
if (t_start_us >= 0) {
|
||||
t_acc += ggml_time_us() - t_start_us;
|
||||
}
|
||||
if (t_start_us >= 0) {
|
||||
t_acc += ggml_time_us() - t_start_us;
|
||||
}
|
||||
}
|
||||
|
||||
void llama_log_set(ggml_log_callback log_callback, void * user_data) {
|
||||
ggml_log_set(log_callback, user_data);
|
||||
|
||||
@@ -472,9 +472,6 @@ static void llama_sampler_chain_reset(struct llama_sampler * smpl) {
|
||||
for (auto * smpl : chain->samplers) {
|
||||
llama_sampler_reset(smpl);
|
||||
}
|
||||
|
||||
chain->t_sample_us = 0;
|
||||
chain->n_sample = 0;
|
||||
}
|
||||
|
||||
static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampler * smpl) {
|
||||
@@ -2670,8 +2667,7 @@ struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * c
|
||||
void llama_perf_sampler_print(const struct llama_sampler * chain) {
|
||||
const auto data = llama_perf_sampler(chain);
|
||||
|
||||
LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||
__func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
|
||||
LLAMA_LOG_INFO("%s: samplers time = %10.2f ms / %5d runs\n", __func__, data.t_sample_ms, data.n_sample);
|
||||
}
|
||||
|
||||
void llama_perf_sampler_reset(struct llama_sampler * chain) {
|
||||
@@ -2681,5 +2677,6 @@ void llama_perf_sampler_reset(struct llama_sampler * chain) {
|
||||
|
||||
auto * ctx = (struct llama_sampler_chain *) chain->ctx;
|
||||
|
||||
ctx->t_sample_us = ctx->n_sample = 0;
|
||||
ctx->t_sample_us = 0;
|
||||
ctx->n_sample = 0;
|
||||
}
|
||||
|
||||
@@ -147,11 +147,15 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
auto * mem = llama_get_memory(ctx);
|
||||
|
||||
llama_memory_t mem = llama_get_memory(ctx);
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
// note: the time for chat template initialization is not negligible:
|
||||
auto chat_templates = common_chat_templates_init(model, params.chat_template);
|
||||
|
||||
// start measuring performance timings from here
|
||||
llama_perf_context_reset(ctx);
|
||||
|
||||
LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
|
||||
|
||||
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
|
||||
Reference in New Issue
Block a user