cont : migrate the rest of the inputs out of llama_context
ggml-ci
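In short, this commit continues moving the per-graph input tensors out of llama_context: the context no longer keeps an `inp` struct and an `input_set()` override per context type; instead each `build_inp_*` method receives the caller's `llama_graph_result *` and becomes `const`. A condensed before/after illustration of the signature change, drawn from the hunks below:

// before: the context owned the input tensor and filled it later in input_set()
ggml_tensor * build_inp_pos(ggml_context * ctx0, int32_t n_tokens) override;

// after: the input is attached to the caller's result, and the method no longer mutates the context
ggml_tensor * build_inp_pos(llama_graph_result * res, ggml_context * ctx0, int32_t n_tokens) const override;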
File diff suppressed because it is too large
@@ -248,24 +248,6 @@ protected:

    virtual int64_t n_pos_per_token() const; // vision

    // when the compute graph is built, it creates the input tensors that it needs
    // the contents of the input tensors are set by the input_set() function

    // TODO: remove, replace by llama_graph_input_i->set_input()
    virtual void input_set(const llama_ubatch & ubatch);

private:
    // TODO: remove, implement as llama_graph_input_xxx
    struct {
        // base input tensors
        ggml_tensor * pos;        // I32 [n_batch]
        ggml_tensor * pos_bucket; // I32 [n_batch, n_batch]
        ggml_tensor * out_ids;    // I32 [n_outputs]
        ggml_tensor * mean;       // F32 [n_batch, n_batch]
        ggml_tensor * cls;        // I32 [n_batch]
    } inp;

protected:
    //
    // output
    //
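The TODOs above point at the replacement mechanism: per-input objects that know how to fill their own tensor, instead of one monolithic `input_set()`. A minimal sketch of what such a `llama_graph_input_xxx` class could look like (the class layout and the exact members are assumptions taken from the TODO names, not the definitions in the tree):

// hypothetical sketch, not the actual llama.cpp definitions
class llama_graph_input_i {
public:
    virtual ~llama_graph_input_i() = default;

    // fill the tensor contents for the current ubatch
    virtual void set_input(const llama_ubatch & ubatch) = 0;
};

class llama_graph_input_pos : public llama_graph_input_i {
public:
    explicit llama_graph_input_pos(ggml_tensor * pos) : pos(pos) {}

    void set_input(const llama_ubatch & ubatch) override {
        // copy the per-token positions into the I32 [n_batch] tensor
        ggml_backend_tensor_set(pos, ubatch.pos, 0, ubatch.n_tokens*ggml_element_size(pos));
    }

    ggml_tensor * pos; // I32 [n_batch]
};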
@@ -309,35 +291,35 @@ public:
            ggml_tensor * cur,
            const char * name,
            const llama_ubatch & ubatch,
            int il) override;
            int il) const override;

    // apply control vector for layer il
    ggml_tensor * build_cvec(
            ggml_context * ctx0,
            ggml_tensor * cur,
            int il) override;
            int il) const override;

    // do mat_mul, while optionally apply lora
    ggml_tensor * build_lora_mm(
            ggml_context * ctx0,
            ggml_tensor * w,
            ggml_tensor * cur) override;
            ggml_tensor * cur) const override;

    // do mat_mul_id, while optionally apply lora
    ggml_tensor * build_lora_mm_id(
            ggml_context * ctx0,
            ggml_tensor * w,   // struct ggml_tensor * as
            ggml_tensor * cur, // struct ggml_tensor * b
            ggml_tensor * ids) override;
            ggml_tensor * ids) const override;

    ggml_tensor * build_rope_factors(int il) override;
    ggml_tensor * build_rope_factors(int il) const override;

    ggml_tensor * build_rope_shift(
            ggml_context * ctx0,
            ggml_tensor * cur,
            ggml_tensor * shift,
            ggml_tensor * factors,
            ggml_backend_buffer * bbuf) override;
            ggml_backend_buffer * bbuf) const override;

    ggml_tensor * build_inp_embd(
            llama_graph_result * res,
@@ -346,23 +328,28 @@ public:
            const llama_ubatch & ubatch) const override;

    ggml_tensor * build_inp_pos(
            llama_graph_result * res,
            ggml_context * ctx0,
            int32_t n_tokens) override;
            int32_t n_tokens) const override;

    ggml_tensor * build_inp_pos_bucket(
            llama_graph_result * res,
            ggml_context * ctx0,
            int32_t n_tokens) override;
            int32_t n_tokens) const override;

    ggml_tensor * build_inp_out_ids(
            ggml_context * ctx0) override;
            llama_graph_result * res,
            ggml_context * ctx0) const override;

    ggml_tensor * build_inp_mean(
            llama_graph_result * res,
            ggml_context * ctx0,
            int32_t n_tokens) override;
            int32_t n_tokens) const override;

    ggml_tensor * build_inp_cls(
            llama_graph_result * res,
            ggml_context * ctx0,
            int32_t n_tokens) override;
            int32_t n_tokens) const override;

    llama_graph_input_attn_ptr build_attn_inp(
            llama_graph_result * res,
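In the overriding context class, each `build_inp_*` now takes the `llama_graph_result` and is `const`, so the created tensor can be recorded on the result instead of in a private `inp` member. A hedged sketch of what such an implementation might look like after this change (the `add_input` helper and the input class are assumptions for illustration; the real bodies are in the suppressed .cpp diff):

// illustrative only; assumes the llama_graph_input_pos sketch above and the usual includes
ggml_tensor * build_inp_pos_impl(
        llama_graph_result * res,
        ggml_context * ctx0,
        int32_t n_tokens) {
    ggml_tensor * cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
    ggml_set_input(cur);

    // register an input object on the result so it can be filled per ubatch,
    // replacing the old inp.pos member + input_set() path
    res->add_input(std::make_unique<llama_graph_input_pos>(cur)); // add_input is assumed

    return cur;
}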
@@ -394,18 +381,6 @@ protected:
            bool v_trans,
            float kq_scale) const;

    virtual ggml_tensor * build_inp_self_k_shift(
            ggml_context * ctx0);

    virtual void build_kv_self_shift(
            ggml_context * ctx0,
            ggml_cgraph * gf);

    // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
    virtual void build_kv_self_defrag(
            ggml_context * ctx0,
            ggml_cgraph * gf);

public:
    //
    // perf
@@ -552,19 +527,6 @@ public:
    int encode(llama_batch & inp_batch) override;
    int decode(llama_batch & inp_batch) override;

protected:
    //
    // input
    //

    void input_set(const llama_ubatch & ubatch) override;

private:
    struct {
        ggml_tensor * self_pos_bucket; // I32 [n_kv, n_batch]
        ggml_tensor * self_k_shift;    // I32 [kv_size]
    } inp;

protected:
    //
    // graph
@@ -578,8 +540,9 @@ public:
    //

    ggml_tensor * build_inp_pos_bucket(
            llama_graph_result * res,
            ggml_context * ctx0,
            int32_t n_tokens) override;
            int32_t n_tokens) const override;

    llama_graph_input_attn_ptr build_attn_inp(
            llama_graph_result * res,
@@ -600,16 +563,14 @@ public:
            int il) const override;

protected:
    ggml_tensor * build_inp_self_k_shift(ggml_context * ctx0) override;

    void build_kv_self_shift(
    llama_graph_result_ptr graph_build_kv_self_shift(
            ggml_context * ctx0,
            ggml_cgraph * gf) override;
            ggml_cgraph * gf) const;

    // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
    void build_kv_self_defrag(
    llama_graph_result_ptr graph_build_kv_self_defrag(
            ggml_context * ctx0,
            ggml_cgraph * gf) override;
            ggml_cgraph * gf) const;

    //
    // state save/load
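The KV shift/defrag helpers change shape here too: instead of mutating state through `void build_kv_self_*` overrides, the renamed `graph_build_kv_self_*` methods are `const` and hand back a `llama_graph_result_ptr` that carries the graph's inputs. A rough sketch of the call pattern this enables (not verbatim library code; the surrounding update logic lives in the suppressed .cpp diff):

// illustrative fragment only
auto res = graph_build_kv_self_shift(ctx0, gf);   // const: builds the graph and its input tensors
// fill the inputs carried by res (e.g. the K-shift deltas), then run the graph as usual
ggml_backend_sched_graph_compute(sched, gf);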
@@ -651,19 +612,6 @@ public:
    int encode(llama_batch & inp_batch) override;
    int decode(llama_batch & inp_batch) override;

protected:
    //
    // input
    //

    void input_set(const llama_ubatch & ubatch) override;

private:
    struct {
        ggml_tensor * s_copy; // I32 [kv_size]
        ggml_tensor * s_mask; // F32 [1, n_kv]
    } inp;

protected:
    //
    // graph
@@ -677,10 +625,12 @@ public:
    //

    ggml_tensor * build_inp_s_copy(
            ggml_context * ctx0) override;
            llama_graph_result * res,
            ggml_context * ctx0) const override;

    ggml_tensor * build_inp_s_mask(
            ggml_context * ctx0) override;
            llama_graph_result * res,
            ggml_context * ctx0) const override;

    ggml_tensor * build_copy_mask_state(
            ggml_context * ctx0,
@@ -689,7 +639,7 @@ public:
            ggml_tensor * state_copy,
            ggml_tensor * state_mask,
            int32_t n_state,
            int32_t n_seqs) override;
            int32_t n_seqs) const override;

    ggml_tensor * build_mamba_layer(
            ggml_context * ctx0,
@@ -698,7 +648,7 @@ public:
            ggml_tensor * state_copy,
            ggml_tensor * state_mask,
            const llama_ubatch & ubatch,
            int il) override;
            int il) const override;

    ggml_tensor * build_rwkv_token_shift_load(
            ggml_context * ctx0,
@@ -706,13 +656,13 @@ public:
            ggml_tensor * state_copy,
            ggml_tensor * state_mask,
            const llama_ubatch & ubatch,
            int il) override;
            int il) const override;

    ggml_tensor * build_rwkv_token_shift_store(
            ggml_context * ctx0,
            ggml_tensor * token_shift,
            const llama_ubatch & ubatch,
            int il) override;
            int il) const override;

    ggml_tensor * build_rwkv6_time_mix(
            ggml_context * ctx0,
@@ -722,7 +672,7 @@ public:
            ggml_tensor * state_copy,
            ggml_tensor * state_mask,
            const llama_ubatch & ubatch,
            int il) override;
            int il) const override;

protected:
    //
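The recurrent-state context follows the same migration: `s_copy`/`s_mask` leave the private `inp` struct, and `build_inp_s_copy`/`build_inp_s_mask` register the tensors on the graph result instead. A hedged sketch of the corresponding input object, mirroring the `llama_graph_input_xxx` TODO (class layout assumed, not the definition in the tree):

// illustrative sketch; the real class is defined in the suppressed parts of the diff
class llama_graph_input_s_copy : public llama_graph_input_i {
public:
    explicit llama_graph_input_s_copy(ggml_tensor * s_copy) : s_copy(s_copy) {}

    void set_input(const llama_ubatch & ubatch) override {
        GGML_UNUSED(ubatch);
        // fill s_copy (I32 [kv_size]) with the per-cell copy sources,
        // taken from the recurrent KV cache rather than from llama_context
    }

    ggml_tensor * s_copy; // I32 [kv_size]
};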
@@ -774,18 +724,6 @@ public:
protected:
    void reserve() override;

    //
    // input
    //

    void input_set(const llama_ubatch & ubatch) override;

private:
    struct {
        ggml_tensor * cross_embd; // F32 [n_embd, n_outputs_enc]
    } inp;

protected:
    //
    // graph
    //
@@ -793,7 +731,8 @@ protected:
    ggml_cgraph * graph_init() override;

    ggml_tensor * build_inp_cross_embd(
            ggml_context * ctx0) override;
            llama_graph_result * res,
            ggml_context * ctx0) const override;

    llama_graph_input_attn_ptr build_attn_inp(
            llama_graph_result * res,

@@ -68,25 +68,19 @@ ggml_tensor * llama_graph_i::build_attn_cross(
}

ggml_tensor * llama_graph_i::build_inp_cross_embd(
        ggml_context * ctx0) {
        llama_graph_result * res,
        ggml_context * ctx0) const {
    GGML_UNUSED(res);
    GGML_UNUSED(ctx0);

    LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
    return nullptr;
}

ggml_tensor * llama_graph_i::build_inp_cross_kq_mask(
        ggml_context * ctx0,
        int32_t n_tokens) {
    GGML_UNUSED(ctx0);
    GGML_UNUSED(n_tokens);

    LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
    return nullptr;
}

ggml_tensor * llama_graph_i::build_inp_s_copy (
        ggml_context * ctx0) {
        llama_graph_result * res,
        ggml_context * ctx0) const {
    GGML_UNUSED(res);
    GGML_UNUSED(ctx0);

    LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
@@ -95,7 +89,9 @@ ggml_tensor * llama_graph_i::build_inp_s_copy (
}

ggml_tensor * llama_graph_i::build_inp_s_mask(
        ggml_context * ctx0) {
        llama_graph_result * res,
        ggml_context * ctx0) const {
    GGML_UNUSED(res);
    GGML_UNUSED(ctx0);

    LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
@@ -104,13 +100,13 @@ ggml_tensor * llama_graph_i::build_inp_s_mask(
}

ggml_tensor * llama_graph_i::build_copy_mask_state(
        ggml_context * ctx0,
        ggml_cgraph * gf,
        ggml_tensor * s,
        ggml_tensor * state_copy,
        ggml_tensor * state_mask,
        int32_t n_state,
        int32_t n_seqs) {
        ggml_context * ctx0,
        ggml_cgraph * gf,
        ggml_tensor * s,
        ggml_tensor * state_copy,
        ggml_tensor * state_mask,
        int32_t n_state,
        int32_t n_seqs) const {
    GGML_UNUSED(ctx0);
    GGML_UNUSED(gf);
    GGML_UNUSED(s);
@@ -125,13 +121,13 @@ ggml_tensor * llama_graph_i::build_copy_mask_state(
}

ggml_tensor * llama_graph_i::build_mamba_layer(
        ggml_context * ctx0,
        ggml_cgraph * gf,
        ggml_tensor * cur,
        ggml_tensor * state_copy,
        ggml_tensor * state_mask,
        const llama_ubatch & ubatch,
        int il) {
        ggml_context * ctx0,
        ggml_cgraph * gf,
        ggml_tensor * cur,
        ggml_tensor * state_copy,
        ggml_tensor * state_mask,
        const llama_ubatch & ubatch,
        int il) const {
    GGML_UNUSED(ctx0);
    GGML_UNUSED(gf);
    GGML_UNUSED(cur);
@@ -146,12 +142,12 @@ ggml_tensor * llama_graph_i::build_mamba_layer(
}

ggml_tensor * llama_graph_i::build_rwkv_token_shift_load(
        ggml_context * ctx0,
        ggml_cgraph * gf,
        ggml_tensor * state_copy,
        ggml_tensor * state_mask,
        const llama_ubatch & ubatch,
        int il) {
        ggml_context * ctx0,
        ggml_cgraph * gf,
        ggml_tensor * state_copy,
        ggml_tensor * state_mask,
        const llama_ubatch & ubatch,
        int il) const {
    GGML_UNUSED(ctx0);
    GGML_UNUSED(gf);
    GGML_UNUSED(state_copy);
@@ -165,10 +161,10 @@ ggml_tensor * llama_graph_i::build_rwkv_token_shift_load(
}

ggml_tensor * llama_graph_i::build_rwkv_token_shift_store(
        ggml_context * ctx0,
        ggml_tensor * token_shift,
        const llama_ubatch & ubatch,
        int il) {
        ggml_context * ctx0,
        ggml_tensor * token_shift,
        const llama_ubatch & ubatch,
        int il) const {
    GGML_UNUSED(ctx0);
    GGML_UNUSED(token_shift);
    GGML_UNUSED(ubatch);
@@ -180,14 +176,14 @@ ggml_tensor * llama_graph_i::build_rwkv_token_shift_store(
}

ggml_tensor * llama_graph_i::build_rwkv6_time_mix(
        ggml_context * ctx0,
        ggml_cgraph * gf,
        ggml_tensor * cur,
        ggml_tensor * x_prev,
        ggml_tensor * state_copy,
        ggml_tensor * state_mask,
        const llama_ubatch & ubatch,
        int il) {
        ggml_context * ctx0,
        ggml_cgraph * gf,
        ggml_tensor * cur,
        ggml_tensor * x_prev,
        ggml_tensor * state_copy,
        ggml_tensor * state_mask,
        const llama_ubatch & ubatch,
        int il) const {
    GGML_UNUSED(ctx0);
    GGML_UNUSED(gf);
    GGML_UNUSED(cur);
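These base-class bodies are deliberately stubs: `llama_graph_i` logs "not implemented" and returns `nullptr` for inputs that only specific context types provide (cross-attention, recurrent state, and so on). Callers therefore cannot assume the returned tensor exists. A hedged sketch of the kind of guard a graph builder might use (purely illustrative; `llm_build_context` in the tree may handle this differently):

// illustrative guard only
ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0);
GGML_ASSERT(state_copy != nullptr && "this context type does not provide s_copy");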
@@ -93,6 +93,7 @@ public:
    //

// TODO: can become more granular in the future
// TODO: move all methods that do not require things from llama_context to llm_build_context
class llama_graph_i {
public:
    llama_graph_i(llama_graph_type type);
@@ -109,28 +110,28 @@ public:
            ggml_tensor * cur,
            const char * name,
            const llama_ubatch & ubatch,
            int il) = 0;
            int il) const = 0;

    // apply control vector for layer il
    virtual ggml_tensor * build_cvec(
            ggml_context * ctx0,
            ggml_tensor * cur,
            int il) = 0;
            int il) const = 0;

    // do mat_mul, while optionally apply lora
    virtual ggml_tensor * build_lora_mm(
            ggml_context * ctx0,
            ggml_tensor * w,
            ggml_tensor * cur) = 0;
            ggml_tensor * cur) const = 0;

    // do mat_mul_id, while optionally apply lora
    virtual ggml_tensor * build_lora_mm_id(
            ggml_context * ctx0,
            ggml_tensor * w,   // struct ggml_tensor * as
            ggml_tensor * cur, // struct ggml_tensor * b
            ggml_tensor * ids) = 0;
            ggml_tensor * ids) const = 0;

    virtual ggml_tensor * build_rope_factors(int il) = 0;
    virtual ggml_tensor * build_rope_factors(int il) const = 0;

    // note: optionally set the backend to be the same as the bbuf's backend
    virtual ggml_tensor * build_rope_shift(
@@ -138,7 +139,7 @@ public:
            ggml_tensor * cur,
            ggml_tensor * shift,
            ggml_tensor * factors,
            ggml_backend_buffer * bbuf) = 0;
            ggml_backend_buffer * bbuf) const = 0;

    // graph build API (context-specific)

@@ -146,26 +147,31 @@ public:
            llama_graph_result * res,
            ggml_context * ctx0,
            ggml_tensor * tok_embd,
            const llama_ubatch & ubatch) const = 0; // note these methods will become const, i.e. they don't mutate the llama_context that implements them
            const llama_ubatch & ubatch) const = 0;

    virtual ggml_tensor * build_inp_pos(
            llama_graph_result * res,
            ggml_context * ctx0,
            int32_t n_tokens) = 0;
            int32_t n_tokens) const = 0;

    virtual ggml_tensor * build_inp_pos_bucket(
            llama_graph_result * res,
            ggml_context * ctx0,
            int32_t n_tokens) = 0;
            int32_t n_tokens) const = 0;

    virtual ggml_tensor * build_inp_out_ids(
            ggml_context * ctx0) = 0;
            llama_graph_result * res,
            ggml_context * ctx0) const = 0;

    virtual ggml_tensor * build_inp_mean(
            llama_graph_result * res,
            ggml_context * ctx0,
            int32_t n_tokens) = 0;
            int32_t n_tokens) const = 0;

    virtual ggml_tensor * build_inp_cls(
            llama_graph_result * res,
            ggml_context * ctx0,
            int32_t n_tokens) = 0;
            int32_t n_tokens) const = 0;

    virtual llama_graph_input_attn_ptr build_attn_inp(
            llama_graph_result * res,
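On the interface side the pattern is now uniform: every context-specific `build_inp_*` takes the `llama_graph_result *` first and is `const`. A hedged sketch of what the result type could hold so that input filling happens outside `llama_context` (the members, `llama_graph_input_ptr`, and `set_inputs` are assumptions for illustration, building on the input-object sketches above):

// illustrative sketch of a result object that owns the graph inputs
// (assumes <vector>, <memory>, and a llama_graph_input_ptr alias for std::unique_ptr<llama_graph_input_i>)
struct llama_graph_result {
    std::vector<llama_graph_input_ptr> inputs;

    void add_input(llama_graph_input_ptr input) {
        inputs.emplace_back(std::move(input));
    }

    // called once per ubatch, replacing llama_context::input_set()
    void set_inputs(const llama_ubatch & ubatch) {
        for (auto & input : inputs) {
            input->set_input(ubatch);
        }
    }
};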
@@ -197,17 +203,16 @@ public:
            int il) const;

    virtual ggml_tensor * build_inp_cross_embd(
            ggml_context * ctx0);

    virtual ggml_tensor * build_inp_cross_kq_mask(
            ggml_context * ctx0,
            int32_t n_tokens);
            llama_graph_result * res,
            ggml_context * ctx0) const;

    virtual ggml_tensor * build_inp_s_copy(
            ggml_context * ctx0);
            llama_graph_result * res,
            ggml_context * ctx0) const;

    virtual ggml_tensor * build_inp_s_mask(
            ggml_context * ctx0);
            llama_graph_result * res,
            ggml_context * ctx0) const;

    virtual ggml_tensor * build_copy_mask_state(
            ggml_context * ctx0,
@@ -216,7 +221,7 @@ public:
            ggml_tensor * state_copy,
            ggml_tensor * state_mask,
            int32_t n_state,
            int32_t n_seqs);
            int32_t n_seqs) const;

    virtual ggml_tensor * build_mamba_layer(
            ggml_context * ctx0,
@@ -225,7 +230,7 @@ public:
            ggml_tensor * state_copy,
            ggml_tensor * state_mask,
            const llama_ubatch & ubatch,
            int il);
            int il) const;

    virtual ggml_tensor * build_rwkv_token_shift_load(
            ggml_context * ctx0,
@@ -233,13 +238,13 @@ public:
            ggml_tensor * state_copy,
            ggml_tensor * state_mask,
            const llama_ubatch & ubatch,
            int il);
            int il) const;

    virtual ggml_tensor * build_rwkv_token_shift_store(
            ggml_context * ctx0,
            ggml_tensor * token_shift,
            const llama_ubatch & ubatch,
            int il);
            int il) const;

    virtual ggml_tensor * build_rwkv6_time_mix(
            ggml_context * ctx0,
@@ -249,5 +254,5 @@ public:
            ggml_tensor * state_copy,
            ggml_tensor * state_mask,
            const llama_ubatch & ubatch,
            int il);
            int il) const;
};

@@ -3910,7 +3910,7 @@ struct llm_build_context {

    // TODO: tmp
    struct ggml_tensor * build_inp_pos() {
        ggml_tensor * cur = lgf->build_inp_pos(ctx0, n_tokens);
        ggml_tensor * cur = lgf->build_inp_pos(res.get(), ctx0, n_tokens);
        cb(cur, "inp_pos", -1);

        return cur;
@@ -3918,7 +3918,7 @@ struct llm_build_context {

    // TODO: tmp
    struct ggml_tensor * build_inp_out_ids() {
        ggml_tensor * cur = lgf->build_inp_out_ids(ctx0);
        ggml_tensor * cur = lgf->build_inp_out_ids(res.get(), ctx0);
        cb(cur, "inp_out_ids", -1);

        return cur;
@@ -3926,7 +3926,7 @@ struct llm_build_context {

    // TODO: tmp
    struct ggml_tensor * build_inp_mean() {
        ggml_tensor * cur = lgf->build_inp_mean(ctx0, n_tokens);
        ggml_tensor * cur = lgf->build_inp_mean(res.get(), ctx0, n_tokens);
        cb(cur, "inp_mean", -1);

        return cur;
@@ -3934,7 +3934,7 @@ struct llm_build_context {

    // TODO: tmp
    struct ggml_tensor * build_inp_cls() {
        ggml_tensor * cur = lgf->build_inp_cls(ctx0, n_tokens);
        ggml_tensor * cur = lgf->build_inp_cls(res.get(), ctx0, n_tokens);
        cb(cur, "inp_cls", -1);

        return cur;
@@ -3957,7 +3957,7 @@ struct llm_build_context {

    // TODO: tmp
    struct ggml_tensor * build_pos_bucket() {
        ggml_tensor * cur = lgf->build_inp_pos_bucket(ctx0, n_tokens);
        ggml_tensor * cur = lgf->build_inp_pos_bucket(res.get(), ctx0, n_tokens);
        cb(cur, "pos_bucket", -1);

        return cur;
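The `llm_build_context` wrappers above show the caller side of the change: each helper threads `res.get()` into the corresponding `lgf->build_inp_*` call, so the graph interface can attach the created input to the result that the build context already owns. A hedged sketch of the surrounding state this implies (how `res` is held here is an assumption, not shown in these hunks):

// illustrative only: the pieces used by the wrappers above
struct llm_build_context {
    llama_graph_i          * lgf;  // the context implementing the graph-build interface
    llama_graph_result_ptr   res;  // collects the input tensors created during the build
    ggml_context           * ctx0; // graph allocation context

    // ... build_inp_pos(), build_inp_out_ids(), etc. call into lgf with res.get()
};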
@@ -3965,20 +3965,12 @@ struct llm_build_context {

    // TODO: tmp
    struct ggml_tensor * build_inp_cross_embd() {
        ggml_tensor * cur = lgf->build_inp_cross_embd(ctx0);
        ggml_tensor * cur = lgf->build_inp_cross_embd(res.get(), ctx0);
        cb(cur, "embd_enc", -1);

        return cur;
    }

    // TODO: tmp
    struct ggml_tensor * build_inp_cross_kq_mask() {
        ggml_tensor * cur = lgf->build_inp_cross_kq_mask(ctx0, n_tokens);
        cb(cur, "KQ_mask_cross", -1);

        return cur;
    }

    struct ggml_tensor * build_norm(
            struct ggml_tensor * cur,
            struct ggml_tensor * mw,
@@ -3986,8 +3978,8 @@ struct llm_build_context {
            llm_norm_type type,
            int il) {
        switch (type) {
            case LLM_NORM:     cur = ggml_norm     (ctx0, cur, hparams.f_norm_eps);     break;
            case LLM_NORM_RMS: cur = ggml_rms_norm (ctx0, cur, hparams.f_norm_rms_eps); break;
            case LLM_NORM:     cur = ggml_norm    (ctx0, cur, hparams.f_norm_eps);      break;
            case LLM_NORM_RMS: cur = ggml_rms_norm(ctx0, cur, hparams.f_norm_rms_eps);  break;
            case LLM_NORM_GROUP:
                {
                    cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], 1, cur->ne[1]);
@@ -8070,8 +8062,8 @@ struct llm_build_context {
        // {n_embd, n_tokens}
        inpL = build_inp_embd(model.tok_embd);

        struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0);
        struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0);
        struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0);
        struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0);

        for (int il = 0; il < n_layer; ++il) {
            // norm
@@ -10443,8 +10435,8 @@ struct llm_build_context {
        inpL = build_inp_embd(model.tok_embd);
        inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);

        struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0);
        struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0);
        struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0);
        struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0);

        const auto n_embd = hparams.n_embd;
        const auto n_seq_tokens = ubatch.n_seq_tokens;
@@ -10535,8 +10527,8 @@ struct llm_build_context {

        inpL = build_inp_embd(model.tok_embd);

        struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0);
        struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0);
        struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0);
        struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0);

        const auto n_embd = hparams.n_embd;
        const auto n_seq_tokens = ubatch.n_seq_tokens;
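In the mamba/RWKV model builders, `state_copy` and `state_mask` are now obtained through the result-aware `build_inp_s_copy`/`build_inp_s_mask` and consumed exactly as before. A hedged sketch of the downstream use, mirroring `build_copy_mask_state` from the interface above (the state tensor and dimension variables are the builder's locals; the names here are assumed):

// illustrative continuation of the pattern shown in these hunks
struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0);
struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0);

// gather/mask the recurrent state per sequence before it is used in the layer
ggml_tensor * ssm = lgf->build_copy_mask_state(
        ctx0, gf, ssm_states, state_copy, state_mask, n_state, n_seqs); // ssm_states etc. assumed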