mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-11-10 10:27:03 +00:00)
cont : migrate the rest of the inputs out of llama_context
ggml-ci
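In short: the per-context input structs (`inp`) and the monolithic `input_set()` overrides are removed; the `build_inp_*` builders now receive the `llama_graph_result * res` produced by the graph build and become `const`, so the inputs are owned by the graph result rather than by a mutating `llama_context`. Below is a minimal sketch of the ownership pattern this enables, with simplified stand-in types; only the `set_input()` idea comes from the removed TODO ("replace by llama_graph_input_i->set_input()"), the rest is hypothetical glue for illustration:

#include <memory>
#include <vector>

struct ggml_tensor_stub {};              // stand-in for ggml_tensor
struct ubatch_stub { const int * pos; }; // stand-in for llama_ubatch

// stand-in for llama_graph_input_i: each input fills its own tensor
struct graph_input_i {
    virtual ~graph_input_i() = default;
    virtual void set_input(const ubatch_stub * ubatch) = 0;
};

struct graph_input_pos : graph_input_i {
    ggml_tensor_stub * pos = nullptr; // I32 [n_batch]
    void set_input(const ubatch_stub * ubatch) override {
        (void) ubatch; // here: copy ubatch->pos into the pos tensor
    }
};

// stand-in for llama_graph_result: owns the inputs created during build
struct graph_result {
    std::vector<std::unique_ptr<graph_input_i>> inputs;
};

// the builder can be const: it writes into res, not into the context
struct graph_builder {
    ggml_tensor_stub * build_inp_pos(graph_result * res, int n_tokens) const {
        (void) n_tokens; // would size the tensor, e.g. via ggml_new_tensor_1d(...)
        auto inp = std::make_unique<graph_input_pos>();
        inp->pos = new ggml_tensor_stub();
        ggml_tensor_stub * cur = inp->pos;
        res->inputs.push_back(std::move(inp));
        return cur;
    }
};

Setting the inputs for a micro-batch then becomes a loop over the inputs registered on the result (see the sketch after the diff) instead of one context-wide `input_set()`.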
[File diff suppressed because it is too large]
@@ -248,24 +248,6 @@ protected:
 
     virtual int64_t n_pos_per_token() const; // vision
 
-    // when the compute graph is built, it creates the input tensors that it needs
-    // the contents of the input tensors are set by the input_set() function
-
-    // TODO: remove, replace by llama_graph_input_i->set_input()
-    virtual void input_set(const llama_ubatch & ubatch);
-
-private:
-    // TODO: remove, implement as llama_graph_input_xxx
-    struct {
-        // base input tensors
-        ggml_tensor * pos;        // I32 [n_batch]
-        ggml_tensor * pos_bucket; // I32 [n_batch, n_batch]
-        ggml_tensor * out_ids;    // I32 [n_outputs]
-        ggml_tensor * mean;       // F32 [n_batch, n_batch]
-        ggml_tensor * cls;        // I32 [n_batch]
-    } inp;
-
-protected:
     //
     // output
     //
@@ -309,35 +291,35 @@ public:
             ggml_tensor * cur,
             const char * name,
             const llama_ubatch & ubatch,
-            int il) override;
+            int il) const override;
 
     // apply control vector for layer il
     ggml_tensor * build_cvec(
             ggml_context * ctx0,
             ggml_tensor * cur,
-            int il) override;
+            int il) const override;
 
     // do mat_mul, while optionally apply lora
     ggml_tensor * build_lora_mm(
             ggml_context * ctx0,
             ggml_tensor * w,
-            ggml_tensor * cur) override;
+            ggml_tensor * cur) const override;
 
     // do mat_mul_id, while optionally apply lora
     ggml_tensor * build_lora_mm_id(
             ggml_context * ctx0,
             ggml_tensor * w,   // struct ggml_tensor * as
             ggml_tensor * cur, // struct ggml_tensor * b
-            ggml_tensor * ids) override;
+            ggml_tensor * ids) const override;
 
-    ggml_tensor * build_rope_factors(int il) override;
+    ggml_tensor * build_rope_factors(int il) const override;
 
     ggml_tensor * build_rope_shift(
             ggml_context * ctx0,
             ggml_tensor * cur,
             ggml_tensor * shift,
             ggml_tensor * factors,
-            ggml_backend_buffer * bbuf) override;
+            ggml_backend_buffer * bbuf) const override;
 
     ggml_tensor * build_inp_embd(
             llama_graph_result * res,
@@ -346,23 +328,28 @@ public:
             const llama_ubatch & ubatch) const override;
 
     ggml_tensor * build_inp_pos(
+            llama_graph_result * res,
             ggml_context * ctx0,
-            int32_t n_tokens) override;
+            int32_t n_tokens) const override;
 
     ggml_tensor * build_inp_pos_bucket(
+            llama_graph_result * res,
             ggml_context * ctx0,
-            int32_t n_tokens) override;
+            int32_t n_tokens) const override;
 
     ggml_tensor * build_inp_out_ids(
-            ggml_context * ctx0) override;
+            llama_graph_result * res,
+            ggml_context * ctx0) const override;
 
     ggml_tensor * build_inp_mean(
+            llama_graph_result * res,
             ggml_context * ctx0,
-            int32_t n_tokens) override;
+            int32_t n_tokens) const override;
 
     ggml_tensor * build_inp_cls(
+            llama_graph_result * res,
             ggml_context * ctx0,
-            int32_t n_tokens) override;
+            int32_t n_tokens) const override;
 
     llama_graph_input_attn_ptr build_attn_inp(
             llama_graph_result * res,
@@ -394,18 +381,6 @@ protected:
             bool v_trans,
             float kq_scale) const;
 
-    virtual ggml_tensor * build_inp_self_k_shift(
-            ggml_context * ctx0);
-
-    virtual void build_kv_self_shift(
-            ggml_context * ctx0,
-            ggml_cgraph * gf);
-
-    // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
-    virtual void build_kv_self_defrag(
-            ggml_context * ctx0,
-            ggml_cgraph * gf);
-
 public:
     //
     // perf
@@ -552,19 +527,6 @@ public:
     int encode(llama_batch & inp_batch) override;
     int decode(llama_batch & inp_batch) override;
 
-protected:
-    //
-    // input
-    //
-
-    void input_set(const llama_ubatch & ubatch) override;
-
-private:
-    struct {
-        ggml_tensor * self_pos_bucket; // I32 [n_kv, n_batch]
-        ggml_tensor * self_k_shift;    // I32 [kv_size]
-    } inp;
-
 protected:
     //
     // graph
@@ -578,8 +540,9 @@ public:
     //
 
     ggml_tensor * build_inp_pos_bucket(
+            llama_graph_result * res,
             ggml_context * ctx0,
-            int32_t n_tokens) override;
+            int32_t n_tokens) const override;
 
     llama_graph_input_attn_ptr build_attn_inp(
             llama_graph_result * res,
@@ -600,16 +563,14 @@ public:
             int il) const override;
 
 protected:
-    ggml_tensor * build_inp_self_k_shift(ggml_context * ctx0) override;
-
-    void build_kv_self_shift(
+    llama_graph_result_ptr graph_build_kv_self_shift(
             ggml_context * ctx0,
-            ggml_cgraph * gf) override;
+            ggml_cgraph * gf) const;
 
     // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
-    void build_kv_self_defrag(
+    llama_graph_result_ptr graph_build_kv_self_defrag(
             ggml_context * ctx0,
-            ggml_cgraph * gf) override;
+            ggml_cgraph * gf) const;
 
     //
     // state save/load
@@ -651,19 +612,6 @@ public:
     int encode(llama_batch & inp_batch) override;
     int decode(llama_batch & inp_batch) override;
 
-protected:
-    //
-    // input
-    //
-
-    void input_set(const llama_ubatch & ubatch) override;
-
-private:
-    struct {
-        ggml_tensor * s_copy; // I32 [kv_size]
-        ggml_tensor * s_mask; // F32 [1, n_kv]
-    } inp;
-
 protected:
     //
     // graph
@@ -677,10 +625,12 @@ public:
     //
 
     ggml_tensor * build_inp_s_copy(
-            ggml_context * ctx0) override;
+            llama_graph_result * res,
+            ggml_context * ctx0) const override;
 
     ggml_tensor * build_inp_s_mask(
-            ggml_context * ctx0) override;
+            llama_graph_result * res,
+            ggml_context * ctx0) const override;
 
     ggml_tensor * build_copy_mask_state(
             ggml_context * ctx0,
@@ -689,7 +639,7 @@ public:
             ggml_tensor * state_copy,
             ggml_tensor * state_mask,
             int32_t n_state,
-            int32_t n_seqs) override;
+            int32_t n_seqs) const override;
 
     ggml_tensor * build_mamba_layer(
             ggml_context * ctx0,
@@ -698,7 +648,7 @@ public:
             ggml_tensor * state_copy,
             ggml_tensor * state_mask,
             const llama_ubatch & ubatch,
-            int il) override;
+            int il) const override;
 
     ggml_tensor * build_rwkv_token_shift_load(
             ggml_context * ctx0,
@@ -706,13 +656,13 @@ public:
             ggml_tensor * state_copy,
             ggml_tensor * state_mask,
             const llama_ubatch & ubatch,
-            int il) override;
+            int il) const override;
 
     ggml_tensor * build_rwkv_token_shift_store(
             ggml_context * ctx0,
             ggml_tensor * token_shift,
             const llama_ubatch & ubatch,
-            int il) override;
+            int il) const override;
 
     ggml_tensor * build_rwkv6_time_mix(
             ggml_context * ctx0,
@@ -722,7 +672,7 @@ public:
             ggml_tensor * state_copy,
             ggml_tensor * state_mask,
             const llama_ubatch & ubatch,
-            int il) override;
+            int il) const override;
 
 protected:
     //
@@ -774,18 +724,6 @@ public:
 protected:
     void reserve() override;
 
-    //
-    // input
-    //
-
-    void input_set(const llama_ubatch & ubatch) override;
-
-private:
-    struct {
-        ggml_tensor * cross_embd; // F32 [n_embd, n_outputs_enc]
-    } inp;
-
-protected:
     //
     // graph
     //
@@ -793,7 +731,8 @@ protected:
     ggml_cgraph * graph_init() override;
 
     ggml_tensor * build_inp_cross_embd(
-            ggml_context * ctx0) override;
+            llama_graph_result * res,
+            ggml_context * ctx0) const override;
 
     llama_graph_input_attn_ptr build_attn_inp(
             llama_graph_result * res,

@@ -68,25 +68,19 @@ ggml_tensor * llama_graph_i::build_attn_cross(
 }
 
 ggml_tensor * llama_graph_i::build_inp_cross_embd(
-        ggml_context * ctx0) {
+        llama_graph_result * res,
+        ggml_context * ctx0) const {
+    GGML_UNUSED(res);
     GGML_UNUSED(ctx0);
 
     LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
     return nullptr;
 }
 
-ggml_tensor * llama_graph_i::build_inp_cross_kq_mask(
-        ggml_context * ctx0,
-        int32_t n_tokens) {
-    GGML_UNUSED(ctx0);
-    GGML_UNUSED(n_tokens);
-
-    LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
-    return nullptr;
-}
-
 ggml_tensor * llama_graph_i::build_inp_s_copy (
-        ggml_context * ctx0) {
+        llama_graph_result * res,
+        ggml_context * ctx0) const {
+    GGML_UNUSED(res);
     GGML_UNUSED(ctx0);
 
     LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
@@ -95,7 +89,9 @@ ggml_tensor * llama_graph_i::build_inp_s_copy (
 }
 
 ggml_tensor * llama_graph_i::build_inp_s_mask(
-        ggml_context * ctx0) {
+        llama_graph_result * res,
+        ggml_context * ctx0) const {
+    GGML_UNUSED(res);
     GGML_UNUSED(ctx0);
 
     LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
@@ -110,7 +106,7 @@ ggml_tensor * llama_graph_i::build_copy_mask_state(
         ggml_tensor * state_copy,
         ggml_tensor * state_mask,
         int32_t n_state,
-        int32_t n_seqs) {
+        int32_t n_seqs) const {
     GGML_UNUSED(ctx0);
     GGML_UNUSED(gf);
     GGML_UNUSED(s);
@@ -131,7 +127,7 @@ ggml_tensor * llama_graph_i::build_mamba_layer(
         ggml_tensor * state_copy,
         ggml_tensor * state_mask,
         const llama_ubatch & ubatch,
-        int il) {
+        int il) const {
     GGML_UNUSED(ctx0);
     GGML_UNUSED(gf);
     GGML_UNUSED(cur);
@@ -151,7 +147,7 @@ ggml_tensor * llama_graph_i::build_rwkv_token_shift_load(
         ggml_tensor * state_copy,
         ggml_tensor * state_mask,
         const llama_ubatch & ubatch,
-        int il) {
+        int il) const {
     GGML_UNUSED(ctx0);
     GGML_UNUSED(gf);
     GGML_UNUSED(state_copy);
@@ -168,7 +164,7 @@ ggml_tensor * llama_graph_i::build_rwkv_token_shift_store(
         ggml_context * ctx0,
         ggml_tensor * token_shift,
         const llama_ubatch & ubatch,
-        int il) {
+        int il) const {
     GGML_UNUSED(ctx0);
     GGML_UNUSED(token_shift);
     GGML_UNUSED(ubatch);
@@ -187,7 +183,7 @@ ggml_tensor * llama_graph_i::build_rwkv6_time_mix(
         ggml_tensor * state_copy,
         ggml_tensor * state_mask,
         const llama_ubatch & ubatch,
-        int il) {
+        int il) const {
     GGML_UNUSED(ctx0);
     GGML_UNUSED(gf);
     GGML_UNUSED(cur);

@@ -93,6 +93,7 @@ public:
 //
 
 // TODO: can become more granular in the future
+// TODO: move all methods that do not require things from llama_context to llm_build_context
 class llama_graph_i {
 public:
     llama_graph_i(llama_graph_type type);
@@ -109,28 +110,28 @@ public:
             ggml_tensor * cur,
             const char * name,
             const llama_ubatch & ubatch,
-            int il) = 0;
+            int il) const = 0;
 
     // apply control vector for layer il
     virtual ggml_tensor * build_cvec(
             ggml_context * ctx0,
             ggml_tensor * cur,
-            int il) = 0;
+            int il) const = 0;
 
     // do mat_mul, while optionally apply lora
     virtual ggml_tensor * build_lora_mm(
             ggml_context * ctx0,
             ggml_tensor * w,
-            ggml_tensor * cur) = 0;
+            ggml_tensor * cur) const = 0;
 
     // do mat_mul_id, while optionally apply lora
     virtual ggml_tensor * build_lora_mm_id(
             ggml_context * ctx0,
             ggml_tensor * w,   // struct ggml_tensor * as
             ggml_tensor * cur, // struct ggml_tensor * b
-            ggml_tensor * ids) = 0;
+            ggml_tensor * ids) const = 0;
 
-    virtual ggml_tensor * build_rope_factors(int il) = 0;
+    virtual ggml_tensor * build_rope_factors(int il) const = 0;
 
     // note: optionally set the backend to be the same as the bbuf's backend
     virtual ggml_tensor * build_rope_shift(
@@ -138,7 +139,7 @@ public:
             ggml_tensor * cur,
             ggml_tensor * shift,
             ggml_tensor * factors,
-            ggml_backend_buffer * bbuf) = 0;
+            ggml_backend_buffer * bbuf) const = 0;
 
     // graph build API (context-specific)
 
@@ -146,26 +147,31 @@ public:
             llama_graph_result * res,
             ggml_context * ctx0,
             ggml_tensor * tok_embd,
-            const llama_ubatch & ubatch) const = 0; // note these methods will become const, i.e. they don't mutate the llama_context that implements them
+            const llama_ubatch & ubatch) const = 0;
 
     virtual ggml_tensor * build_inp_pos(
+            llama_graph_result * res,
             ggml_context * ctx0,
-            int32_t n_tokens) = 0;
+            int32_t n_tokens) const = 0;
 
     virtual ggml_tensor * build_inp_pos_bucket(
+            llama_graph_result * res,
             ggml_context * ctx0,
-            int32_t n_tokens) = 0;
+            int32_t n_tokens) const = 0;
 
     virtual ggml_tensor * build_inp_out_ids(
-            ggml_context * ctx0) = 0;
+            llama_graph_result * res,
+            ggml_context * ctx0) const = 0;
 
     virtual ggml_tensor * build_inp_mean(
+            llama_graph_result * res,
             ggml_context * ctx0,
-            int32_t n_tokens) = 0;
+            int32_t n_tokens) const = 0;
 
     virtual ggml_tensor * build_inp_cls(
+            llama_graph_result * res,
             ggml_context * ctx0,
-            int32_t n_tokens) = 0;
+            int32_t n_tokens) const = 0;
 
     virtual llama_graph_input_attn_ptr build_attn_inp(
             llama_graph_result * res,
@@ -197,17 +203,16 @@ public:
             int il) const;
 
     virtual ggml_tensor * build_inp_cross_embd(
-            ggml_context * ctx0);
-
-    virtual ggml_tensor * build_inp_cross_kq_mask(
-            ggml_context * ctx0,
-            int32_t n_tokens);
+            llama_graph_result * res,
+            ggml_context * ctx0) const;
 
     virtual ggml_tensor * build_inp_s_copy(
-            ggml_context * ctx0);
+            llama_graph_result * res,
+            ggml_context * ctx0) const;
 
     virtual ggml_tensor * build_inp_s_mask(
-            ggml_context * ctx0);
+            llama_graph_result * res,
+            ggml_context * ctx0) const;
 
     virtual ggml_tensor * build_copy_mask_state(
             ggml_context * ctx0,
@@ -216,7 +221,7 @@ public:
             ggml_tensor * state_copy,
             ggml_tensor * state_mask,
             int32_t n_state,
-            int32_t n_seqs);
+            int32_t n_seqs) const;
 
     virtual ggml_tensor * build_mamba_layer(
             ggml_context * ctx0,
@@ -225,7 +230,7 @@ public:
             ggml_tensor * state_copy,
             ggml_tensor * state_mask,
             const llama_ubatch & ubatch,
-            int il);
+            int il) const;
 
     virtual ggml_tensor * build_rwkv_token_shift_load(
             ggml_context * ctx0,
@@ -233,13 +238,13 @@ public:
             ggml_tensor * state_copy,
             ggml_tensor * state_mask,
             const llama_ubatch & ubatch,
-            int il);
+            int il) const;
 
     virtual ggml_tensor * build_rwkv_token_shift_store(
             ggml_context * ctx0,
             ggml_tensor * token_shift,
             const llama_ubatch & ubatch,
-            int il);
+            int il) const;
 
     virtual ggml_tensor * build_rwkv6_time_mix(
             ggml_context * ctx0,
@@ -249,5 +254,5 @@ public:
             ggml_tensor * state_copy,
             ggml_tensor * state_mask,
             const llama_ubatch & ubatch,
-            int il);
+            int il) const;
 };

@@ -3910,7 +3910,7 @@ struct llm_build_context {
 
     // TODO: tmp
     struct ggml_tensor * build_inp_pos() {
-        ggml_tensor * cur = lgf->build_inp_pos(ctx0, n_tokens);
+        ggml_tensor * cur = lgf->build_inp_pos(res.get(), ctx0, n_tokens);
         cb(cur, "inp_pos", -1);
 
         return cur;
@@ -3918,7 +3918,7 @@ struct llm_build_context {
 
     // TODO: tmp
     struct ggml_tensor * build_inp_out_ids() {
-        ggml_tensor * cur = lgf->build_inp_out_ids(ctx0);
+        ggml_tensor * cur = lgf->build_inp_out_ids(res.get(), ctx0);
         cb(cur, "inp_out_ids", -1);
 
         return cur;
@@ -3926,7 +3926,7 @@ struct llm_build_context {
 
     // TODO: tmp
    struct ggml_tensor * build_inp_mean() {
-        ggml_tensor * cur = lgf->build_inp_mean(ctx0, n_tokens);
+        ggml_tensor * cur = lgf->build_inp_mean(res.get(), ctx0, n_tokens);
         cb(cur, "inp_mean", -1);
 
         return cur;
@@ -3934,7 +3934,7 @@ struct llm_build_context {
 
     // TODO: tmp
     struct ggml_tensor * build_inp_cls() {
-        ggml_tensor * cur = lgf->build_inp_cls(ctx0, n_tokens);
+        ggml_tensor * cur = lgf->build_inp_cls(res.get(), ctx0, n_tokens);
         cb(cur, "inp_cls", -1);
 
         return cur;
@@ -3957,7 +3957,7 @@ struct llm_build_context {
 
     // TODO: tmp
     struct ggml_tensor * build_pos_bucket() {
-        ggml_tensor * cur = lgf->build_inp_pos_bucket(ctx0, n_tokens);
+        ggml_tensor * cur = lgf->build_inp_pos_bucket(res.get(), ctx0, n_tokens);
         cb(cur, "pos_bucket", -1);
 
         return cur;
@@ -3965,20 +3965,12 @@ struct llm_build_context {
 
     // TODO: tmp
     struct ggml_tensor * build_inp_cross_embd() {
-        ggml_tensor * cur = lgf->build_inp_cross_embd(ctx0);
+        ggml_tensor * cur = lgf->build_inp_cross_embd(res.get(), ctx0);
         cb(cur, "embd_enc", -1);
 
         return cur;
     }
 
-    // TODO: tmp
-    struct ggml_tensor * build_inp_cross_kq_mask() {
-        ggml_tensor * cur = lgf->build_inp_cross_kq_mask(ctx0, n_tokens);
-        cb(cur, "KQ_mask_cross", -1);
-
-        return cur;
-    }
-
     struct ggml_tensor * build_norm(
             struct ggml_tensor * cur,
             struct ggml_tensor * mw,
@@ -3987,7 +3979,7 @@ struct llm_build_context {
             int il) {
         switch (type) {
             case LLM_NORM:       cur = ggml_norm     (ctx0, cur, hparams.f_norm_eps);     break;
-            case LLM_NORM_RMS:   cur = ggml_rms_norm (ctx0, cur, hparams.f_norm_rms_eps); break;
+            case LLM_NORM_RMS:   cur = ggml_rms_norm(ctx0, cur, hparams.f_norm_rms_eps); break;
             case LLM_NORM_GROUP:
                 {
                     cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], 1, cur->ne[1]);
@@ -8070,8 +8062,8 @@ struct llm_build_context {
         // {n_embd, n_tokens}
         inpL = build_inp_embd(model.tok_embd);
 
-        struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0);
-        struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0);
+        struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0);
+        struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0);
 
         for (int il = 0; il < n_layer; ++il) {
             // norm
@@ -10443,8 +10435,8 @@ struct llm_build_context {
         inpL = build_inp_embd(model.tok_embd);
         inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
 
-        struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0);
-        struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0);
+        struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0);
+        struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0);
 
         const auto n_embd = hparams.n_embd;
         const auto n_seq_tokens = ubatch.n_seq_tokens;
@@ -10535,8 +10527,8 @@ struct llm_build_context {
 
         inpL = build_inp_embd(model.tok_embd);
 
-        struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0);
-        struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0);
+        struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0);
+        struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0);
 
         const auto n_embd = hparams.n_embd;
         const auto n_seq_tokens = ubatch.n_seq_tokens;
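With the `inp` structs and `input_set()` overrides gone, filling the input tensors for a micro-batch presumably reduces to iterating the inputs collected on the graph result. A hedged sketch, reusing the stand-in types from the sketch above (the driver name and `inputs` container are assumptions; only `set_input()` is suggested by the TODO comments in the diff):

// hypothetical driver: fill every input registered during graph build,
// standing in for the removed llama_context::input_set(ubatch) overrides
void set_graph_inputs(graph_result * res, const ubatch_stub & ubatch) {
    for (auto & inp : res->inputs) {
        inp->set_input(&ubatch); // each input fills its own tensor(s)
    }
}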