cont : migrate the rest of the inputs out of llama_context

ggml-ci
Author: Georgi Gerganov
Date:   2025-02-28 18:01:25 +02:00
Parent: 7f02ee562e
Commit: 9cab53c7dd

5 changed files with 645 additions and 577 deletions

File diff suppressed because it is too large.


@@ -248,24 +248,6 @@ protected:
virtual int64_t n_pos_per_token() const; // vision
// when the compute graph is built, it creates the input tensors that it needs
// the contents of the input tensors are set by the input_set() function
// TODO: remove, replace by llama_graph_input_i->set_input()
virtual void input_set(const llama_ubatch & ubatch);
private:
// TODO: remove, implement as llama_graph_input_xxx
struct {
// base input tensors
ggml_tensor * pos; // I32 [n_batch]
ggml_tensor * pos_bucket; // I32 [n_batch, n_batch]
ggml_tensor * out_ids; // I32 [n_outputs]
ggml_tensor * mean; // F32 [n_batch, n_batch]
ggml_tensor * cls; // I32 [n_batch]
} inp;
protected:
//
// output
//
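
The block removed in this hunk is the old monolithic input path: llama_context owned an anonymous `inp` struct and filled all of its tensors in one input_set() call. The TODOs point at the replacement: one llama_graph_input_xxx object per input, each implementing llama_graph_input_i::set_input(). A minimal sketch of that shape, assuming the interface looks roughly like the TODO describes (the llama_graph_input_pos name and the exact signature are illustrative, not taken from this commit):

    // Sketch only: each input owns its tensor and knows how to fill it,
    // so llama_context no longer needs a central inp struct / input_set().
    struct llama_graph_input_i {
        virtual ~llama_graph_input_i() = default;
        // called right before graph compute, once the ubatch is known
        virtual void set_input(const llama_ubatch & ubatch) = 0;
    };

    // hypothetical per-input implementation for `pos` (I32 [n_batch])
    class llama_graph_input_pos : public llama_graph_input_i {
    public:
        ggml_tensor * pos = nullptr;

        void set_input(const llama_ubatch & ubatch) override {
            if (pos != nullptr && ubatch.pos != nullptr) {
                const size_t nb = ubatch.n_tokens*ggml_element_size(pos);
                ggml_backend_tensor_set(pos, ubatch.pos, 0, nb);
            }
        }
    };
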
@@ -309,35 +291,35 @@ public:
ggml_tensor * cur,
const char * name,
const llama_ubatch & ubatch,
int il) override;
int il) const override;
// apply control vector for layer il
ggml_tensor * build_cvec(
ggml_context * ctx0,
ggml_tensor * cur,
int il) override;
int il) const override;
// do mat_mul, while optionally apply lora
ggml_tensor * build_lora_mm(
ggml_context * ctx0,
ggml_tensor * w,
ggml_tensor * cur) override;
ggml_tensor * cur) const override;
// do mat_mul_id, while optionally apply lora
ggml_tensor * build_lora_mm_id(
ggml_context * ctx0,
ggml_tensor * w, // struct ggml_tensor * as
ggml_tensor * cur, // struct ggml_tensor * b
ggml_tensor * ids) override;
ggml_tensor * ids) const override;
ggml_tensor * build_rope_factors(int il) override;
ggml_tensor * build_rope_factors(int il) const override;
ggml_tensor * build_rope_shift(
ggml_context * ctx0,
ggml_tensor * cur,
ggml_tensor * shift,
ggml_tensor * factors,
ggml_backend_buffer * bbuf) override;
ggml_backend_buffer * bbuf) const override;
ggml_tensor * build_inp_embd(
llama_graph_result * res,
@@ -346,23 +328,28 @@ public:
const llama_ubatch & ubatch) const override;
ggml_tensor * build_inp_pos(
llama_graph_result * res,
ggml_context * ctx0,
int32_t n_tokens) override;
int32_t n_tokens) const override;
ggml_tensor * build_inp_pos_bucket(
llama_graph_result * res,
ggml_context * ctx0,
int32_t n_tokens) override;
int32_t n_tokens) const override;
ggml_tensor * build_inp_out_ids(
ggml_context * ctx0) override;
llama_graph_result * res,
ggml_context * ctx0) const override;
ggml_tensor * build_inp_mean(
llama_graph_result * res,
ggml_context * ctx0,
int32_t n_tokens) override;
int32_t n_tokens) const override;
ggml_tensor * build_inp_cls(
llama_graph_result * res,
ggml_context * ctx0,
int32_t n_tokens) override;
int32_t n_tokens) const override;
llama_graph_input_attn_ptr build_attn_inp(
llama_graph_result * res,
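
With the new `res` parameter and the const qualifier, a builder such as build_inp_pos can create its tensor, hand ownership of the input object to the per-graph result, and return the tensor without touching any llama_context member. A minimal sketch of that pattern, assuming llama_graph_result exposes something like an add_input() that takes ownership (that method name, and the llama_graph_input_pos type from the sketch above, are assumptions):

    // Sketch of a const builder that registers its input on the result
    // instead of storing it in the context (res->add_input is assumed).
    ggml_tensor * build_inp_pos_sketch(
            llama_graph_result * res,
            ggml_context * ctx0,
            int32_t n_tokens) {
        auto inp = std::make_unique<llama_graph_input_pos>(); // needs <memory>

        inp->pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
        ggml_set_input(inp->pos);

        ggml_tensor * cur = inp->pos;

        res->add_input(std::move(inp)); // result now owns the input object

        return cur;
    }
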
@@ -394,18 +381,6 @@ protected:
bool v_trans,
float kq_scale) const;
virtual ggml_tensor * build_inp_self_k_shift(
ggml_context * ctx0);
virtual void build_kv_self_shift(
ggml_context * ctx0,
ggml_cgraph * gf);
// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
virtual void build_kv_self_defrag(
ggml_context * ctx0,
ggml_cgraph * gf);
public:
//
// perf
@@ -552,19 +527,6 @@ public:
int encode(llama_batch & inp_batch) override;
int decode(llama_batch & inp_batch) override;
protected:
//
// input
//
void input_set(const llama_ubatch & ubatch) override;
private:
struct {
ggml_tensor * self_pos_bucket; // I32 [n_kv, n_batch]
ggml_tensor * self_k_shift; // I32 [kv_size]
} inp;
protected:
//
// graph
@@ -578,8 +540,9 @@ public:
//
ggml_tensor * build_inp_pos_bucket(
llama_graph_result * res,
ggml_context * ctx0,
int32_t n_tokens) override;
int32_t n_tokens) const override;
llama_graph_input_attn_ptr build_attn_inp(
llama_graph_result * res,
@@ -600,16 +563,14 @@ public:
int il) const override;
protected:
ggml_tensor * build_inp_self_k_shift(ggml_context * ctx0) override;
void build_kv_self_shift(
llama_graph_result_ptr graph_build_kv_self_shift(
ggml_context * ctx0,
ggml_cgraph * gf) override;
ggml_cgraph * gf) const;
// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
void build_kv_self_defrag(
llama_graph_result_ptr graph_build_kv_self_defrag(
ggml_context * ctx0,
ggml_cgraph * gf) override;
ggml_cgraph * gf) const;
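
build_kv_self_shift and build_kv_self_defrag also leave the generic graph interface: they become graph_build_kv_self_shift / graph_build_kv_self_defrag on the KV-self context, are const, and return a llama_graph_result_ptr so the inputs they create travel with the graph instead of living on the context. A rough sketch of the new shape (the result type's internals and the commented add_input call are assumptions, not this commit's code):

    // Sketch: build the K-shift graph into its own result object.
    llama_graph_result_ptr graph_build_kv_self_shift_sketch(
            ggml_context * ctx0,
            ggml_cgraph  * gf,
            int64_t        kv_size) {
        auto res = std::make_unique<llama_graph_result>(); // assumed constructible

        // the K-shift input (I32 [kv_size]) now belongs to the result,
        // not to a member inp struct on the context
        ggml_tensor * k_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, kv_size);
        ggml_set_input(k_shift);

        // per-layer rope-shift ops would be appended to gf here via
        // build_rope_shift(...) and ggml_build_forward_expand(gf, ...)
        GGML_UNUSED(gf);

        // hypothetical: register the input so it can be filled before compute
        // res->add_input(std::make_unique<llama_graph_input_k_shift>(k_shift));

        return res;
    }
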
//
// state save/load
@@ -651,19 +612,6 @@ public:
int encode(llama_batch & inp_batch) override;
int decode(llama_batch & inp_batch) override;
protected:
//
// input
//
void input_set(const llama_ubatch & ubatch) override;
private:
struct {
ggml_tensor * s_copy; // I32 [kv_size]
ggml_tensor * s_mask; // F32 [1, n_kv]
} inp;
protected:
//
// graph
@@ -677,10 +625,12 @@ public:
//
ggml_tensor * build_inp_s_copy(
ggml_context * ctx0) override;
llama_graph_result * res,
ggml_context * ctx0) const override;
ggml_tensor * build_inp_s_mask(
ggml_context * ctx0) override;
llama_graph_result * res,
ggml_context * ctx0) const override;
ggml_tensor * build_copy_mask_state(
ggml_context * ctx0,
@@ -689,7 +639,7 @@ public:
ggml_tensor * state_copy,
ggml_tensor * state_mask,
int32_t n_state,
int32_t n_seqs) override;
int32_t n_seqs) const override;
ggml_tensor * build_mamba_layer(
ggml_context * ctx0,
@@ -698,7 +648,7 @@ public:
ggml_tensor * state_copy,
ggml_tensor * state_mask,
const llama_ubatch & ubatch,
int il) override;
int il) const override;
ggml_tensor * build_rwkv_token_shift_load(
ggml_context * ctx0,
@@ -706,13 +656,13 @@ public:
ggml_tensor * state_copy,
ggml_tensor * state_mask,
const llama_ubatch & ubatch,
int il) override;
int il) const override;
ggml_tensor * build_rwkv_token_shift_store(
ggml_context * ctx0,
ggml_tensor * token_shift,
const llama_ubatch & ubatch,
int il) override;
int il) const override;
ggml_tensor * build_rwkv6_time_mix(
ggml_context * ctx0,
@@ -722,7 +672,7 @@ public:
ggml_tensor * state_copy,
ggml_tensor * state_mask,
const llama_ubatch & ubatch,
int il) override;
int il) const override;
protected:
//
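
The recurrent context follows the same pattern: its s_copy (I32 [kv_size]) and s_mask (F32 [1, n_kv]) tensors leave the private inp struct, build_inp_s_copy / build_inp_s_mask take the result and become const, and the state builders (build_copy_mask_state, build_mamba_layer, the rwkv helpers) now only need const access. A hypothetical input object for the copy indices, mirroring the sketch further above:

    // Hypothetical counterpart for the removed inp.s_copy / inp.s_mask:
    class llama_graph_input_s_copy : public llama_graph_input_i {
    public:
        ggml_tensor * s_copy = nullptr; // I32 [kv_size]

        void set_input(const llama_ubatch & ubatch) override {
            GGML_UNUSED(ubatch);
            if (s_copy == nullptr) {
                return;
            }
            // would write, for each cell of the recurrent KV cache, the index
            // of the cell to copy its state from -- what the old input_set()
            // did when it filled inp.s_copy
        }
    };
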
@@ -774,18 +724,6 @@ public:
protected:
void reserve() override;
//
// input
//
void input_set(const llama_ubatch & ubatch) override;
private:
struct {
ggml_tensor * cross_embd; // F32 [n_embd, n_outputs_enc]
} inp;
protected:
//
// graph
//
@@ -793,7 +731,8 @@ protected:
ggml_cgraph * graph_init() override;
ggml_tensor * build_inp_cross_embd(
ggml_context * ctx0) override;
llama_graph_result * res,
ggml_context * ctx0) const override;
llama_graph_input_attn_ptr build_attn_inp(
llama_graph_result * res,


@@ -68,25 +68,19 @@ ggml_tensor * llama_graph_i::build_attn_cross(
}
ggml_tensor * llama_graph_i::build_inp_cross_embd(
ggml_context * ctx0) {
llama_graph_result * res,
ggml_context * ctx0) const {
GGML_UNUSED(res);
GGML_UNUSED(ctx0);
LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
return nullptr;
}
ggml_tensor * llama_graph_i::build_inp_cross_kq_mask(
ggml_context * ctx0,
int32_t n_tokens) {
GGML_UNUSED(ctx0);
GGML_UNUSED(n_tokens);
LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
return nullptr;
}
ggml_tensor * llama_graph_i::build_inp_s_copy (
ggml_context * ctx0) {
llama_graph_result * res,
ggml_context * ctx0) const {
GGML_UNUSED(res);
GGML_UNUSED(ctx0);
LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
@@ -95,7 +89,9 @@ ggml_tensor * llama_graph_i::build_inp_s_copy (
}
ggml_tensor * llama_graph_i::build_inp_s_mask(
ggml_context * ctx0) {
llama_graph_result * res,
ggml_context * ctx0) const {
GGML_UNUSED(res);
GGML_UNUSED(ctx0);
LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
@@ -104,13 +100,13 @@ ggml_tensor * llama_graph_i::build_inp_s_mask(
}
ggml_tensor * llama_graph_i::build_copy_mask_state(
ggml_context * ctx0,
ggml_cgraph * gf,
ggml_tensor * s,
ggml_tensor * state_copy,
ggml_tensor * state_mask,
int32_t n_state,
int32_t n_seqs) {
ggml_context * ctx0,
ggml_cgraph * gf,
ggml_tensor * s,
ggml_tensor * state_copy,
ggml_tensor * state_mask,
int32_t n_state,
int32_t n_seqs) const {
GGML_UNUSED(ctx0);
GGML_UNUSED(gf);
GGML_UNUSED(s);
@@ -125,13 +121,13 @@ ggml_tensor * llama_graph_i::build_copy_mask_state(
}
ggml_tensor * llama_graph_i::build_mamba_layer(
ggml_context * ctx0,
ggml_cgraph * gf,
ggml_tensor * cur,
ggml_tensor * state_copy,
ggml_tensor * state_mask,
const llama_ubatch & ubatch,
int il) {
ggml_context * ctx0,
ggml_cgraph * gf,
ggml_tensor * cur,
ggml_tensor * state_copy,
ggml_tensor * state_mask,
const llama_ubatch & ubatch,
int il) const {
GGML_UNUSED(ctx0);
GGML_UNUSED(gf);
GGML_UNUSED(cur);
@@ -146,12 +142,12 @@ ggml_tensor * llama_graph_i::build_mamba_layer(
}
ggml_tensor * llama_graph_i::build_rwkv_token_shift_load(
ggml_context * ctx0,
ggml_cgraph * gf,
ggml_tensor * state_copy,
ggml_tensor * state_mask,
const llama_ubatch & ubatch,
int il) {
ggml_context * ctx0,
ggml_cgraph * gf,
ggml_tensor * state_copy,
ggml_tensor * state_mask,
const llama_ubatch & ubatch,
int il) const {
GGML_UNUSED(ctx0);
GGML_UNUSED(gf);
GGML_UNUSED(state_copy);
@@ -165,10 +161,10 @@ ggml_tensor * llama_graph_i::build_rwkv_token_shift_load(
}
ggml_tensor * llama_graph_i::build_rwkv_token_shift_store(
ggml_context * ctx0,
ggml_tensor * token_shift,
const llama_ubatch & ubatch,
int il) {
ggml_context * ctx0,
ggml_tensor * token_shift,
const llama_ubatch & ubatch,
int il) const {
GGML_UNUSED(ctx0);
GGML_UNUSED(token_shift);
GGML_UNUSED(ubatch);
@@ -180,14 +176,14 @@ ggml_tensor * llama_graph_i::build_rwkv_token_shift_store(
}
ggml_tensor * llama_graph_i::build_rwkv6_time_mix(
ggml_context * ctx0,
ggml_cgraph * gf,
ggml_tensor * cur,
ggml_tensor * x_prev,
ggml_tensor * state_copy,
ggml_tensor * state_mask,
const llama_ubatch & ubatch,
int il) {
ggml_context * ctx0,
ggml_cgraph * gf,
ggml_tensor * cur,
ggml_tensor * x_prev,
ggml_tensor * state_copy,
ggml_tensor * state_mask,
const llama_ubatch & ubatch,
int il) const {
GGML_UNUSED(ctx0);
GGML_UNUSED(gf);
GGML_UNUSED(cur);


@@ -93,6 +93,7 @@ public:
//
// TODO: can become more granular in the future
// TODO: move all methods that do not require things from llama_context to llm_build_context
class llama_graph_i {
public:
llama_graph_i(llama_graph_type type);
@@ -109,28 +110,28 @@ public:
ggml_tensor * cur,
const char * name,
const llama_ubatch & ubatch,
int il) = 0;
int il) const = 0;
// apply control vector for layer il
virtual ggml_tensor * build_cvec(
ggml_context * ctx0,
ggml_tensor * cur,
int il) = 0;
int il) const = 0;
// do mat_mul, while optionally apply lora
virtual ggml_tensor * build_lora_mm(
ggml_context * ctx0,
ggml_tensor * w,
ggml_tensor * cur) = 0;
ggml_tensor * cur) const = 0;
// do mat_mul_id, while optionally apply lora
virtual ggml_tensor * build_lora_mm_id(
ggml_context * ctx0,
ggml_tensor * w, // struct ggml_tensor * as
ggml_tensor * cur, // struct ggml_tensor * b
ggml_tensor * ids) = 0;
ggml_tensor * ids) const = 0;
virtual ggml_tensor * build_rope_factors(int il) = 0;
virtual ggml_tensor * build_rope_factors(int il) const = 0;
// note: optionally set the backend to be the same as the bbuf's backend
virtual ggml_tensor * build_rope_shift(
@@ -138,7 +139,7 @@ public:
ggml_tensor * cur,
ggml_tensor * shift,
ggml_tensor * factors,
ggml_backend_buffer * bbuf) = 0;
ggml_backend_buffer * bbuf) const = 0;
// graph build API (context-specific)
@@ -146,26 +147,31 @@ public:
llama_graph_result * res,
ggml_context * ctx0,
ggml_tensor * tok_embd,
const llama_ubatch & ubatch) const = 0; // note these methods will become const, i.e. they don't mutate the llama_context that implements them
const llama_ubatch & ubatch) const = 0;
virtual ggml_tensor * build_inp_pos(
llama_graph_result * res,
ggml_context * ctx0,
int32_t n_tokens) = 0;
int32_t n_tokens) const = 0;
virtual ggml_tensor * build_inp_pos_bucket(
llama_graph_result * res,
ggml_context * ctx0,
int32_t n_tokens) = 0;
int32_t n_tokens) const = 0;
virtual ggml_tensor * build_inp_out_ids(
ggml_context * ctx0) = 0;
llama_graph_result * res,
ggml_context * ctx0) const = 0;
virtual ggml_tensor * build_inp_mean(
llama_graph_result * res,
ggml_context * ctx0,
int32_t n_tokens) = 0;
int32_t n_tokens) const = 0;
virtual ggml_tensor * build_inp_cls(
llama_graph_result * res,
ggml_context * ctx0,
int32_t n_tokens) = 0;
int32_t n_tokens) const = 0;
virtual llama_graph_input_attn_ptr build_attn_inp(
llama_graph_result * res,
@@ -197,17 +203,16 @@ public:
int il) const;
virtual ggml_tensor * build_inp_cross_embd(
ggml_context * ctx0);
virtual ggml_tensor * build_inp_cross_kq_mask(
ggml_context * ctx0,
int32_t n_tokens);
llama_graph_result * res,
ggml_context * ctx0) const;
virtual ggml_tensor * build_inp_s_copy(
ggml_context * ctx0);
llama_graph_result * res,
ggml_context * ctx0) const;
virtual ggml_tensor * build_inp_s_mask(
ggml_context * ctx0);
llama_graph_result * res,
ggml_context * ctx0) const;
virtual ggml_tensor * build_copy_mask_state(
ggml_context * ctx0,
@@ -216,7 +221,7 @@ public:
ggml_tensor * state_copy,
ggml_tensor * state_mask,
int32_t n_state,
int32_t n_seqs);
int32_t n_seqs) const;
virtual ggml_tensor * build_mamba_layer(
ggml_context * ctx0,
@@ -225,7 +230,7 @@ public:
ggml_tensor * state_copy,
ggml_tensor * state_mask,
const llama_ubatch & ubatch,
int il);
int il) const;
virtual ggml_tensor * build_rwkv_token_shift_load(
ggml_context * ctx0,
@@ -233,13 +238,13 @@ public:
ggml_tensor * state_copy,
ggml_tensor * state_mask,
const llama_ubatch & ubatch,
int il);
int il) const;
virtual ggml_tensor * build_rwkv_token_shift_store(
ggml_context * ctx0,
ggml_tensor * token_shift,
const llama_ubatch & ubatch,
int il);
int il) const;
virtual ggml_tensor * build_rwkv6_time_mix(
ggml_context * ctx0,
@@ -249,5 +254,5 @@ public:
ggml_tensor * state_copy,
ggml_tensor * state_mask,
const llama_ubatch & ubatch,
int il);
int il) const;
};
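
Taken together with the llm_build_context changes below, the consumer side stays thin: the build context holds a pointer to the (now const-usable) llama_graph_i plus a per-graph result, and every inp_* wrapper just forwards res.get(). A condensed sketch of that relationship (member and type names other than lgf, res, ctx0 and the build_inp_* calls are inferred from the call sites, not spelled out in this hunk):

    // Sketch of the consumer side after the change: all graph-input state
    // flows through `res`, none of it through the llama_context.
    struct llm_build_context_sketch {
        const llama_graph_i    * lgf;  // const access is now sufficient
        llama_graph_result_ptr   res;  // owns the inputs created by lgf
        ggml_context           * ctx0;
        int32_t                  n_tokens;

        ggml_tensor * build_inp_pos() {
            return lgf->build_inp_pos(res.get(), ctx0, n_tokens);
        }

        ggml_tensor * build_inp_out_ids() {
            return lgf->build_inp_out_ids(res.get(), ctx0);
        }
    };
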


@@ -3910,7 +3910,7 @@ struct llm_build_context {
// TODO: tmp
struct ggml_tensor * build_inp_pos() {
ggml_tensor * cur = lgf->build_inp_pos(ctx0, n_tokens);
ggml_tensor * cur = lgf->build_inp_pos(res.get(), ctx0, n_tokens);
cb(cur, "inp_pos", -1);
return cur;
@@ -3918,7 +3918,7 @@ struct llm_build_context {
// TODO: tmp
struct ggml_tensor * build_inp_out_ids() {
ggml_tensor * cur = lgf->build_inp_out_ids(ctx0);
ggml_tensor * cur = lgf->build_inp_out_ids(res.get(), ctx0);
cb(cur, "inp_out_ids", -1);
return cur;
@@ -3926,7 +3926,7 @@ struct llm_build_context {
// TODO: tmp
struct ggml_tensor * build_inp_mean() {
ggml_tensor * cur = lgf->build_inp_mean(ctx0, n_tokens);
ggml_tensor * cur = lgf->build_inp_mean(res.get(), ctx0, n_tokens);
cb(cur, "inp_mean", -1);
return cur;
@@ -3934,7 +3934,7 @@ struct llm_build_context {
// TODO: tmp
struct ggml_tensor * build_inp_cls() {
ggml_tensor * cur = lgf->build_inp_cls(ctx0, n_tokens);
ggml_tensor * cur = lgf->build_inp_cls(res.get(), ctx0, n_tokens);
cb(cur, "inp_cls", -1);
return cur;
@@ -3957,7 +3957,7 @@ struct llm_build_context {
// TODO: tmp
struct ggml_tensor * build_pos_bucket() {
ggml_tensor * cur = lgf->build_inp_pos_bucket(ctx0, n_tokens);
ggml_tensor * cur = lgf->build_inp_pos_bucket(res.get(), ctx0, n_tokens);
cb(cur, "pos_bucket", -1);
return cur;
@@ -3965,20 +3965,12 @@ struct llm_build_context {
// TODO: tmp
struct ggml_tensor * build_inp_cross_embd() {
ggml_tensor * cur = lgf->build_inp_cross_embd(ctx0);
ggml_tensor * cur = lgf->build_inp_cross_embd(res.get(), ctx0);
cb(cur, "embd_enc", -1);
return cur;
}
// TODO: tmp
struct ggml_tensor * build_inp_cross_kq_mask() {
ggml_tensor * cur = lgf->build_inp_cross_kq_mask(ctx0, n_tokens);
cb(cur, "KQ_mask_cross", -1);
return cur;
}
struct ggml_tensor * build_norm(
struct ggml_tensor * cur,
struct ggml_tensor * mw,
@@ -3986,8 +3978,8 @@ struct llm_build_context {
llm_norm_type type,
int il) {
switch (type) {
case LLM_NORM: cur = ggml_norm (ctx0, cur, hparams.f_norm_eps); break;
case LLM_NORM_RMS: cur = ggml_rms_norm (ctx0, cur, hparams.f_norm_rms_eps); break;
case LLM_NORM: cur = ggml_norm (ctx0, cur, hparams.f_norm_eps); break;
case LLM_NORM_RMS: cur = ggml_rms_norm(ctx0, cur, hparams.f_norm_rms_eps); break;
case LLM_NORM_GROUP:
{
cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], 1, cur->ne[1]);
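
Only whitespace changes in this switch, but since it is the dispatch point for the normalization ops: ggml_norm and ggml_rms_norm differ in whether the mean is subtracted. A scalar reference for the RMS case, reflecting my understanding of the ggml semantics (not code from this commit):

    // ggml_norm     : y = (x - mean(x)) / sqrt(var(x) + eps)   per row
    // ggml_rms_norm : y = x / sqrt(mean(x*x) + eps)            per row
    #include <cmath>

    static void rms_norm_ref(const float * x, float * y, int n, float eps) {
        float ss = 0.0f;
        for (int i = 0; i < n; ++i) {
            ss += x[i]*x[i];
        }
        const float scale = 1.0f/std::sqrt(ss/n + eps);
        for (int i = 0; i < n; ++i) {
            y[i] = x[i]*scale;
        }
    }
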
@@ -8070,8 +8062,8 @@ struct llm_build_context {
// {n_embd, n_tokens}
inpL = build_inp_embd(model.tok_embd);
struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0);
struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0);
struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0);
struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0);
for (int il = 0; il < n_layer; ++il) {
// norm
@@ -10443,8 +10435,8 @@ struct llm_build_context {
inpL = build_inp_embd(model.tok_embd);
inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0);
struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0);
struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0);
struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0);
const auto n_embd = hparams.n_embd;
const auto n_seq_tokens = ubatch.n_seq_tokens;
@@ -10535,8 +10527,8 @@ struct llm_build_context {
inpL = build_inp_embd(model.tok_embd);
struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0);
struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0);
struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0);
struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0);
const auto n_embd = hparams.n_embd;
const auto n_seq_tokens = ubatch.n_seq_tokens;