mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-11-07 09:57:00 +00:00)
graph : add llama_graph_result
ggml-ci
@@ -246,31 +246,48 @@ void llama_context::init() {
     uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
     llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
 
+    int n_splits_pp = -1;
+    int n_nodes_pp = -1;
+
+    int n_splits_tg = -1;
+    int n_nodes_tg = -1;
+
     // reserve pp graph first so that buffers are only allocated once
-    llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
-    ggml_cgraph * gf_pp = build_graph(ubatch_pp, true);
-    if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) {
-        LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__);
-        throw std::runtime_error("failed to allocate compute buffers");
+    {
+        llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
+        auto res_pp = graph_build(ubatch_pp, true);
+        auto & gf_pp = res_pp.gf;
+        if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) {
+            LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__);
+            throw std::runtime_error("failed to allocate compute buffers");
+        }
+
+        n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
+        n_nodes_pp = ggml_graph_n_nodes(gf_pp);
     }
-    int n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
-    int n_nodes_pp = ggml_graph_n_nodes(gf_pp);
 
     // reserve with tg graph to get the number of splits and nodes
-    llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
-    ggml_cgraph * gf_tg = build_graph(ubatch_tg, true);
-    if (!ggml_backend_sched_reserve(sched.get(), gf_tg)) {
-        LLAMA_LOG_ERROR("%s: failed to allocate compute tg buffers\n", __func__);
-        throw std::runtime_error("failed to allocate compute buffers");
+    {
+        llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
+        auto res_tg = graph_build(ubatch_tg, true);
+        auto & gf_tg = res_tg.gf;
+        if (!ggml_backend_sched_reserve(sched.get(), gf_tg)) {
+            LLAMA_LOG_ERROR("%s: failed to allocate compute tg buffers\n", __func__);
+            throw std::runtime_error("failed to allocate compute buffers");
+        }
+        n_splits_tg = ggml_backend_sched_get_n_splits(sched.get());
+        n_nodes_tg = ggml_graph_n_nodes(gf_tg);
     }
-    int n_splits_tg = ggml_backend_sched_get_n_splits(sched.get());
-    int n_nodes_tg = ggml_graph_n_nodes(gf_tg);
 
     // reserve again with pp graph to avoid ggml-alloc reallocations during inference
-    gf_pp = build_graph(ubatch_pp, true);
-    if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) {
-        LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__);
-        throw std::runtime_error("failed to allocate compute buffers");
+    {
+        llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
+        auto res_pp = graph_build(ubatch_pp, true);
+        auto & gf_pp = res_pp.gf;
+        if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) {
+            LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__);
+            throw std::runtime_error("failed to allocate compute buffers");
+        }
     }
 
     for (size_t i = 0; i < backend_ptrs.size(); ++i) {
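
For reference, a minimal sketch (not part of the commit) of the reserve-once pattern the new scoped blocks implement: scheduler buffers are reserved against the worst-case graph up front so later decodes do not trigger ggml-alloc reallocations. The helper name and parameters are illustrative; the ggml calls are the same ones used in the hunk above.

#include "ggml-backend.h"

#include <stdexcept>

// Illustrative helper: reserve compute buffers for the largest graph we expect to run.
static void reserve_worst_case(ggml_backend_sched_t sched, ggml_cgraph * gf_worst) {
    if (!ggml_backend_sched_reserve(sched, gf_worst)) {
        throw std::runtime_error("failed to allocate compute buffers");
    }

    // informational only: how the scheduler split the graph across backends
    const int n_splits = ggml_backend_sched_get_n_splits(sched);
    const int n_nodes  = ggml_graph_n_nodes(gf_worst);
    (void) n_splits;
    (void) n_nodes;
}
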
@@ -890,7 +907,7 @@ void llama_context::build_cb(
     }
 }
 
-ggml_cgraph * llama_context::build_graph(const llama_ubatch & ubatch, bool worst_case) {
+llama_graph_result llama_context::graph_build(const llama_ubatch & ubatch, bool worst_case) {
     return model.build_graph(*this, cparams, ubatch, graph_init(), worst_case);
 }
 
@@ -1814,11 +1831,11 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) {
             llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
             llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
 
-            ggml_cgraph * gf = build_graph(ubatch, true);
+            auto res = graph_build(ubatch, true);
 
             // initialize scheduler with the worst-case graph
             ggml_backend_sched_reset(sched.get());
-            if (!ggml_backend_sched_reserve(sched.get(), gf)) {
+            if (!ggml_backend_sched_reserve(sched.get(), res.gf)) {
                 LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
             }
 
@@ -1828,7 +1845,9 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) {
         ggml_backend_sched_reset(sched.get());
         ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
 
-        ggml_cgraph * gf = build_graph(ubatch, false);
+        auto res = graph_build(ubatch, false);
+
+        auto & gf = res.gf;
 
         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
@@ -2073,7 +2092,9 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) {
         ggml_backend_sched_reset(sched.get());
         ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
 
-        ggml_cgraph * gf = build_graph(ubatch, false);
+        auto res = graph_build(ubatch, false);
+
+        auto & gf = res.gf;
 
         ggml_backend_sched_alloc_graph(sched.get(), gf);
 
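
The decode/encode hunks above keep the same compute sequence; the graph now simply arrives inside a result object. A hedged sketch of that sequence, with the helper name being illustrative rather than anything defined by the commit:

#include "ggml-backend.h"

// Illustrative helper: allocate backend memory for a freshly built graph and run it.
static enum ggml_status run_graph(ggml_backend_sched_t sched, ggml_cgraph * gf) {
    if (!ggml_backend_sched_alloc_graph(sched, gf)) {
        return GGML_STATUS_ALLOC_FAILED;
    }
    return ggml_backend_sched_graph_compute(sched, gf);
}
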
@@ -95,6 +95,9 @@ struct llama_context : public llama_graph_i {
     // zero-out inputs and create ggml_context
     virtual ggml_context_ptr graph_init();
 
+    // TODO: add encode/decode graphs
+    virtual llama_graph_result graph_build(const llama_ubatch & ubatch, bool worst_case);
+
     // returns the result of ggml_backend_sched_graph_compute_async execution
     virtual enum ggml_status graph_compute(
             ggml_cgraph * graph,
@@ -145,9 +148,6 @@ struct llama_context : public llama_graph_i {
             const llama_ubatch & ubatch,
             int il);
 
-    // TODO: add encode/decode graphs
-    virtual ggml_cgraph * build_graph(const llama_ubatch & ubatch, bool worst_case);
-
     // apply control vector for layer il
     virtual ggml_tensor * build_cvec(
             ggml_context * ctx0,
@@ -10,6 +10,13 @@ struct ggml_context;
 struct ggml_tensor;
 struct llama_ubatch;
 
+struct llama_graph_result {
+    ggml_cgraph * gf = nullptr;
+
+    ggml_tensor * t_logits = nullptr;
+    ggml_tensor * t_embd = nullptr;
+};
+
 // TODO: can become more granular in the future
 class llama_graph_i {
 public:
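
A hedged sketch of how a caller might consume this struct once the graph has been computed; the point of returning t_logits/t_embd alongside gf is that output tensors no longer have to be located inside the graph. The helper, n_vocab, and the assumption that llama_graph_result is visible here are illustrative; ggml_backend_tensor_get is the standard ggml API for copying tensor data to host memory.

#include "ggml-backend.h"

#include <cstdint>
#include <vector>

// Illustrative helper: copy logits produced by an already-computed graph back to host memory.
static std::vector<float> read_logits(const llama_graph_result & res, int64_t n_vocab) {
    std::vector<float> logits;
    if (res.t_logits == nullptr) {
        return logits; // this graph produces no logits (e.g. embeddings-only)
    }
    logits.resize(n_vocab);
    ggml_backend_tensor_get(res.t_logits, logits.data(), 0, n_vocab*sizeof(float));
    return logits;
}
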
File diff suppressed because it is too large
@@ -16,6 +16,7 @@ class llama_graph_i;
 struct llama_cparams;
 struct llama_ubatch;
 struct llama_model_loader;
+struct llama_graph_result;
 
 // available models
 enum llm_type {
@@ -368,8 +369,7 @@ struct llama_model {
     const struct ggml_tensor * get_tensor(const char * name) const;
 
     // TODO: add encode/decode graphs
-    // TODO: return a struct containing the graph and the output tensors, such as logits, embeddings, etc.
-    ggml_cgraph * build_graph(
+    llama_graph_result build_graph(
             llama_graph_i & lgf,
             const llama_cparams & cparams,
             const llama_ubatch & ubatch,