model : pass llama_graph_i as ptr
ggml-ci
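In short: `llama_graph_i` is now threaded through the graph-build path as a pointer instead of a reference. `llama_context::graph_build` passes `this` rather than `*this`, the `lgf` member of `llm_build_context` becomes a `nullptr`-initialized pointer declared next to `ctx0`, and every `lgf.` call site becomes `lgf->`. A minimal sketch of the pattern, using simplified stand-in types rather than the real llama.cpp interfaces:

    #include <cstdio>

    struct graph_iface {                    // stand-in for llama_graph_i
        virtual void build_attn_inp() = 0;
        virtual ~graph_iface() = default;
    };

    struct noop_graph : graph_iface {
        void build_attn_inp() override { std::puts("build_attn_inp"); }
    };

    struct build_ctx {                      // stand-in for llm_build_context
        // before: graph_iface & gf;        // reference member: must be bound in the
        //                                  // initializer list, blocks default init
        graph_iface * gf = nullptr;         // after: pointer member with a default

        explicit build_ctx(graph_iface * gf) : gf(gf) {}

        void step() {
            gf->build_attn_inp();           // call sites change from gf. to gf->
        }
    };

    int main() {
        noop_graph g;
        build_ctx ctx(&g);                  // callers now pass an address (or `this`)
        ctx.step();
    }

The trade-off: a pointer member can be default-initialized and declared wherever it reads best, but callers must guarantee it is non-null before any `lgf->` call. The full diff follows.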
@@ -576,7 +576,7 @@ llama_graph_result llama_context::graph_build(
         ggml_context * ctx,
         const llama_ubatch & ubatch,
         bool worst_case) {
-    return model.build_graph(ctx, *this, cparams, ubatch, worst_case);
+    return model.build_graph(ctx, this, cparams, ubatch, worst_case);
 }

 enum ggml_status llama_context::graph_compute(
@@ -3804,7 +3804,6 @@ enum llm_norm_type {
 };

 struct llm_build_context {
-    llama_graph_i & lgf;
     const llama_model & model;
     const llama_hparams & hparams;
     const llama_cparams & cparams;
@@ -3842,18 +3841,18 @@ struct llm_build_context {
     const enum llama_rope_type rope_type;

     ggml_context * ctx0 = nullptr;
+    llama_graph_i * lgf = nullptr;

     llama_graph_result res;

     // TODO: consider making the entire interface noexcept
     llm_build_context(
             ggml_context * ctx,
-            llama_graph_i & lgf,
+            llama_graph_i * lgf,
             const llama_model & model,
             const llama_cparams & cparams,
             const llama_ubatch & ubatch,
             bool worst_case) :
-        lgf (lgf),
         model (model),
         hparams (model.hparams),
         cparams (cparams),
@@ -3884,17 +3883,18 @@ struct llm_build_context {
         flash_attn (cparams.flash_attn),
         pooling_type (cparams.pooling_type),
         rope_type (hparams.rope_type),
-        ctx0 (ctx) {
+        ctx0 (ctx),
+        lgf (lgf) {
     }

     // TODO: tmp
     void cb(struct ggml_tensor * cur, const char * name, int il) {
-        lgf.build_cb(cur, name, ubatch, il);
+        lgf->build_cb(cur, name, ubatch, il);
     }

     // TODO: tmp
     struct ggml_tensor * build_inp_embd(struct ggml_tensor * tok_embd) {
-        struct ggml_tensor * inpL = lgf.build_inp_embd(ctx0, tok_embd, ubatch);
+        struct ggml_tensor * inpL = lgf->build_inp_embd(ctx0, tok_embd, ubatch);
         cb(inpL, "inp_embd", -1);

         return inpL;
@@ -3904,7 +3904,7 @@ struct llm_build_context {
     struct ggml_tensor * build_lora_mm(
             struct ggml_tensor * w,
             struct ggml_tensor * cur) {
-        return lgf.build_lora_mm(ctx0, w, cur);
+        return lgf->build_lora_mm(ctx0, w, cur);
     }

     // TODO: tmp
@@ -3912,7 +3912,7 @@ struct llm_build_context {
             struct ggml_tensor * w, // struct ggml_tensor * as
             struct ggml_tensor * cur, // struct ggml_tensor * b
             struct ggml_tensor * ids) {
-        return lgf.build_lora_mm_id(ctx0, w, cur, ids);
+        return lgf->build_lora_mm_id(ctx0, w, cur, ids);
     }

     struct ggml_tensor * build_norm(
@@ -4211,12 +4211,12 @@ struct llm_build_context {
         ggml_build_forward_expand(graph, v_cur);

         //build_kv_store(graph, k_cur, v_cur, il);
-        lgf.build_attn_kv_store(ctx0, graph, k_cur, v_cur, n_tokens, il, worst_case);
+        lgf->build_attn_kv_store(ctx0, graph, k_cur, v_cur, n_tokens, il, worst_case);

         struct ggml_tensor * cur;

         //cur = build_kqv(graph, wo, wo_b, q_cur, kq_mask, kq_scale, il);
-        cur = lgf.build_attn_qkv(ctx0, graph, wo, wo_b, q_cur, n_tokens, kq_scale, il, worst_case);
+        cur = lgf->build_attn_qkv(ctx0, graph, wo, wo_b, q_cur, n_tokens, kq_scale, il, worst_case);
         cb(cur, "kqv_out", il);

         return cur;
@@ -4252,28 +4252,28 @@ struct llm_build_context {
     }

     struct ggml_tensor * build_inp_pos() {
-        ggml_tensor * cur = lgf.build_inp_pos(ctx0, n_tokens);
+        ggml_tensor * cur = lgf->build_inp_pos(ctx0, n_tokens);
         cb(cur, "inp_pos", -1);

         return cur;
     }

     struct ggml_tensor * build_inp_out_ids() {
-        ggml_tensor * cur = lgf.build_inp_out_ids(ctx0, n_tokens, worst_case);
+        ggml_tensor * cur = lgf->build_inp_out_ids(ctx0, n_tokens, worst_case);
         cb(cur, "inp_out_ids", -1);

         return cur;
     }

     struct ggml_tensor * build_inp_mean() {
-        ggml_tensor * cur = lgf.build_inp_mean(ctx0, n_tokens);
+        ggml_tensor * cur = lgf->build_inp_mean(ctx0, n_tokens);
         cb(cur, "inp_mean", -1);

         return cur;
     }

     struct ggml_tensor * build_inp_cls() {
-        ggml_tensor * cur = lgf.build_inp_cls(ctx0, n_tokens);
+        ggml_tensor * cur = lgf->build_inp_cls(ctx0, n_tokens);
         cb(cur, "inp_cls", -1);

         return cur;
@@ -4378,14 +4378,14 @@ struct llm_build_context {
     //}

     struct ggml_tensor * build_inp_embd_enc() {
-        ggml_tensor * cur = lgf.build_inp_embd_enc(ctx0, n_tokens, worst_case);
+        ggml_tensor * cur = lgf->build_inp_embd_enc(ctx0, n_tokens, worst_case);
         cb(cur, "embd_enc", -1);

         return cur;
     }

     struct ggml_tensor * build_inp_KQ_mask_cross() {
-        ggml_tensor * cur = lgf.build_inp_KQ_mask_cross(ctx0, n_tokens, worst_case);
+        ggml_tensor * cur = lgf->build_inp_KQ_mask_cross(ctx0, n_tokens, worst_case);
         cb(cur, "KQ_mask_cross", -1);

         return cur;
@@ -4405,7 +4405,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
         for (int il = 0; il < n_layer; ++il) {
@@ -4420,7 +4420,7 @@ struct llm_build_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                struct ggml_tensor * rope_factors = lgf.build_rope_factors(il);
+                struct ggml_tensor * rope_factors = lgf->build_rope_factors(il);

                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -4522,7 +4522,7 @@ struct llm_build_context {
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -4566,7 +4566,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
         for (int il = 0; il < n_layer; ++il) {
@@ -4592,7 +4592,7 @@ struct llm_build_context {
             } else if (n_head > 0) {
                 // self-attention
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                struct ggml_tensor * rope_factors = lgf.build_rope_factors(il);
+                struct ggml_tensor * rope_factors = lgf->build_rope_factors(il);

                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -4678,7 +4678,7 @@ struct llm_build_context {
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -4722,7 +4722,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr;

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -4799,7 +4799,7 @@ struct llm_build_context {

             cur = ggml_add(ctx0, cur, ffn_inp);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -4838,7 +4838,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -4905,7 +4905,7 @@ struct llm_build_context {

             cur = ggml_add(ctx0, cur, ffn_inp);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -4943,7 +4943,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * attn_norm;
@@ -5023,7 +5023,7 @@ struct llm_build_context {
             cur = ggml_add(ctx0, cur, ffn_inp);
             cur = ggml_add(ctx0, cur, inpL);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -5066,7 +5066,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -5173,7 +5173,7 @@ struct llm_build_context {
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -5218,7 +5218,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -5301,7 +5301,7 @@ struct llm_build_context {
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -5340,7 +5340,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
         cb(pos, "pos_embd", -1);
@@ -5408,7 +5408,7 @@ struct llm_build_context {

             cur = ggml_add(ctx0, cur, ffn_inp);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -5441,7 +5441,7 @@ struct llm_build_context {

         inpL = build_inp_embd(model.tok_embd);

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -5501,7 +5501,7 @@ struct llm_build_context {

             cur = ggml_add(ctx0, cur, ffn_inp);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -5555,7 +5555,7 @@ struct llm_build_context {
         inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
         cb(inpL, "inp_norm", -1);

-        lgf.build_attn_inp(ctx0, n_tokens, false, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, false, false, worst_case);

         // iterate layers
         for (int il = 0; il < n_layer; ++il) {
@@ -5626,7 +5626,7 @@ struct llm_build_context {
             cb(kq, "kq", il);

             //kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
-            kq = lgf.build_attn_soft_max(ctx0, kq, 1.0f/sqrtf(float(n_embd_head)));
+            kq = lgf->build_attn_soft_max(ctx0, kq, 1.0f/sqrtf(float(n_embd_head)));
             cb(kq, "kq_soft_max_ext", il);

             struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
@@ -5728,7 +5728,7 @@ struct llm_build_context {

         inpL = build_inp_embd(model.tok_embd);

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         inpL = build_norm(inpL,
                 model.tok_norm,
@@ -5796,7 +5796,7 @@ struct llm_build_context {

             cur = ggml_add(ctx0, cur, ffn_inp);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -5831,7 +5831,7 @@ struct llm_build_context {

         inpL = build_inp_embd(model.tok_embd);

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         if (model.pos_embd) {
             // inp_pos - contains the positions
@@ -5935,7 +5935,7 @@ struct llm_build_context {

             cur = ggml_add(ctx0, cur, ffn_inp);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -5973,7 +5973,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {

@@ -6085,7 +6085,7 @@ struct llm_build_context {

             cur = ggml_add(ctx0, cur, ffn_inp);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -6124,7 +6124,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -6199,7 +6199,7 @@ struct llm_build_context {

             cur = ggml_add(ctx0, cur, ffn_inp);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -6238,7 +6238,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -6312,7 +6312,7 @@ struct llm_build_context {

             cur = ggml_add(ctx0, cur, ffn_inp);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -6351,7 +6351,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         int sections[4];
         std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
@@ -6430,7 +6430,7 @@ struct llm_build_context {

             cur = ggml_add(ctx0, cur, ffn_inp);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -6469,7 +6469,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -6575,7 +6575,7 @@ struct llm_build_context {

             cur = ggml_add(ctx0, cur, ffn_inp);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -6616,7 +6616,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             attn_norm_output = build_norm(inpL,
@@ -6698,7 +6698,7 @@ struct llm_build_context {
             cur = ggml_add(ctx0, cur, ffn_output);
             cur = ggml_add(ctx0, cur, inpL);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -6739,7 +6739,7 @@ struct llm_build_context {
         struct ggml_tensor * inp_pos = build_inp_pos();

         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        lgf.build_attn_inp(ctx0, n_tokens, true, true, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, true, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             auto * residual = inpL;
@@ -6747,7 +6747,7 @@ struct llm_build_context {
             // self-attention
             {
                 // rope freq factors for 128k context
-                struct ggml_tensor * rope_factors = lgf.build_rope_factors(il);
+                struct ggml_tensor * rope_factors = lgf->build_rope_factors(il);

                 struct ggml_tensor* attn_norm_output = build_norm(inpL,
                         model.layers[il].attn_norm,
@@ -6841,7 +6841,7 @@ struct llm_build_context {

             cur = ggml_add(ctx0, residual, cur);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -6883,7 +6883,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {

@@ -6949,7 +6949,7 @@ struct llm_build_context {
             cur = ggml_add(ctx0, cur, sa_out);
             cur = ggml_add(ctx0, cur, inpL);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -6989,7 +6989,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
         cb(pos, "pos_embd", -1);
@@ -7057,7 +7057,7 @@ struct llm_build_context {

             cur = ggml_add(ctx0, cur, ffn_inp);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -7095,7 +7095,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             cur = build_norm(inpL,
@@ -7169,7 +7169,7 @@ struct llm_build_context {

             cur = ggml_add(ctx0, cur, ffn_inp);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -7206,7 +7206,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -7286,7 +7286,7 @@ struct llm_build_context {

             cur = ggml_add(ctx0, cur, ffn_inp);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -7325,7 +7325,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -7405,7 +7405,7 @@ struct llm_build_context {

             cur = ggml_add(ctx0, cur, ffn_inp);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -7453,12 +7453,12 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

-            struct ggml_tensor * rope_factors = lgf.build_rope_factors(il);
+            struct ggml_tensor * rope_factors = lgf->build_rope_factors(il);
             // norm
             cur = build_norm(inpL,
                     model.layers[il].attn_norm, NULL,
@@ -7610,7 +7610,7 @@ struct llm_build_context {

             cur = ggml_add(ctx0, cur, ffn_inp);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -7654,7 +7654,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             // norm
@@ -7723,7 +7723,7 @@ struct llm_build_context {

             cur = ggml_add(ctx0, cur, sa_out);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -7762,7 +7762,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, true, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, true, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             // norm
@@ -7847,7 +7847,7 @@ struct llm_build_context {

             cur = ggml_add(ctx0, cur, sa_out);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -7892,7 +7892,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -7973,7 +7973,7 @@ struct llm_build_context {

             cur = ggml_add(ctx0, cur, ffn_inp);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -8005,8 +8005,8 @@ struct llm_build_context {
         // {n_embd, n_tokens}
         inpL = build_inp_embd(model.tok_embd);

-        struct ggml_tensor * state_copy = lgf.build_inp_s_copy(ctx0, worst_case);
-        struct ggml_tensor * state_mask = lgf.build_inp_s_mask(ctx0, worst_case);
+        struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0, worst_case);
+        struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             // norm
@@ -8016,7 +8016,7 @@ struct llm_build_context {
             cb(cur, "attn_norm", il);

             //cur = build_mamba_layer(gf, cur, state_copy, state_mask, il);
-            cur = lgf.build_mamba_layer(ctx0, gf, cur, state_copy, state_mask, ubatch, il, worst_case);
+            cur = lgf->build_mamba_layer(ctx0, gf, cur, state_copy, state_mask, ubatch, il, worst_case);

             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
@@ -8028,7 +8028,7 @@ struct llm_build_context {
             // residual
             cur = ggml_add(ctx0, cur, inpL);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -8067,7 +8067,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
        struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {

@@ -8171,7 +8171,7 @@ struct llm_build_context {
             cur = ggml_add(ctx0, cur, inpL);
             cur = ggml_add(ctx0, cur, attn_out);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -8215,7 +8215,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, true, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, true, worst_case);

         // sliding window switch pattern
         const int32_t sliding_window_pattern = 4;
@@ -8233,7 +8233,7 @@ struct llm_build_context {
             // self-attention
             {
                 // rope freq factors for 128k context
-                struct ggml_tensor * rope_factors = lgf.build_rope_factors(il);
+                struct ggml_tensor * rope_factors = lgf->build_rope_factors(il);

                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -8302,7 +8302,7 @@ struct llm_build_context {
             cur = ggml_add(ctx0, cur, inpL);
             cur = ggml_add(ctx0, cur, attn_out);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -8349,7 +8349,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -8430,7 +8430,7 @@ struct llm_build_context {
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -8469,7 +8469,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -8550,7 +8550,7 @@ struct llm_build_context {
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -8593,7 +8593,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -8677,7 +8677,7 @@ struct llm_build_context {

             cur = ggml_add(ctx0, cur, ffn_inp);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -8714,7 +8714,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             const int64_t n_head = hparams.n_head(il);
@@ -8804,7 +8804,7 @@ struct llm_build_context {

             cur = ggml_add(ctx0, cur, ffn_inp);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             inpL = cur;
@@ -8842,7 +8842,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             cur = build_norm(inpL,
@@ -8919,7 +8919,7 @@ struct llm_build_context {

             cur = ggml_add(ctx0, cur, attn_out);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -8948,7 +8948,7 @@ struct llm_build_context {

             cur = ggml_add(ctx0, cur, ffn_inp);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -8986,7 +8986,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -9077,7 +9077,7 @@ struct llm_build_context {
             cur = ggml_add(ctx0, cur, ffn_out);
             cb(cur, "ffn_out", il);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -9116,7 +9116,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

@@ -9132,7 +9132,7 @@ struct llm_build_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                struct ggml_tensor * rope_factors = lgf.build_rope_factors(il);
+                struct ggml_tensor * rope_factors = lgf->build_rope_factors(il);

                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -9232,7 +9232,7 @@ struct llm_build_context {

             cur = ggml_add(ctx0, cur, ffn_inp);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -9279,7 +9279,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -9459,7 +9459,7 @@ struct llm_build_context {

             cur = ggml_add(ctx0, cur, ffn_inp);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -9497,7 +9497,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -9978,7 +9978,7 @@ struct llm_build_context {

         inpL = build_inp_embd(model.tok_embd);

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             cur = build_norm(inpL,
@@ -10072,7 +10072,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -10202,7 +10202,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -10284,7 +10284,7 @@ struct llm_build_context {
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -10323,7 +10323,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -10337,7 +10337,7 @@ struct llm_build_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                struct ggml_tensor * rope_factors = lgf.build_rope_factors(il);
+                struct ggml_tensor * rope_factors = lgf->build_rope_factors(il);

                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -10407,7 +10407,7 @@ struct llm_build_context {
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -10441,8 +10441,8 @@ struct llm_build_context {
         inpL = build_inp_embd(model.tok_embd);
         inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);

-        struct ggml_tensor * state_copy = lgf.build_inp_s_copy(ctx0, worst_case);
-        struct ggml_tensor * state_mask = lgf.build_inp_s_mask(ctx0, worst_case);
+        struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0, worst_case);
+        struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0, worst_case);

         const auto n_embd = hparams.n_embd;
         const auto n_seq_tokens = ubatch.n_seq_tokens;
@@ -10451,7 +10451,7 @@ struct llm_build_context {
         for (int il = 0; il < n_layer; ++il) {
             const llama_layer * layer = &model.layers[il];

-            struct ggml_tensor * token_shift = lgf.build_rwkv_token_shift_load(
+            struct ggml_tensor * token_shift = lgf->build_rwkv_token_shift_load(
                     ctx0, gf, state_copy, state_mask, ubatch, il, worst_case
             );

@@ -10468,7 +10468,7 @@ struct llm_build_context {
                     1
             );

-            cur = lgf.build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case);
+            cur = lgf->build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case);

             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
             cb(ffn_inp, "ffn_inp", il);
@@ -10491,13 +10491,13 @@ struct llm_build_context {
                     ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)),
                     1
             );
-            ggml_build_forward_expand(gf, lgf.build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case));
+            ggml_build_forward_expand(gf, lgf->build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case));

             if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) {
                 cur = ggml_scale(ctx0, cur, 0.5F);
             }

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -10533,8 +10533,8 @@ struct llm_build_context {

         inpL = build_inp_embd(model.tok_embd);

-        struct ggml_tensor * state_copy = lgf.build_inp_s_copy(ctx0, worst_case);
-        struct ggml_tensor * state_mask = lgf.build_inp_s_mask(ctx0, worst_case);
+        struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0, worst_case);
+        struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0, worst_case);

         const auto n_embd = hparams.n_embd;
         const auto n_seq_tokens = ubatch.n_seq_tokens;
@@ -10545,7 +10545,7 @@ struct llm_build_context {
         for (int il = 0; il < n_layer; ++il) {
             const llama_layer * layer = &model.layers[il];

-            struct ggml_tensor * token_shift = lgf.build_rwkv_token_shift_load(
+            struct ggml_tensor * token_shift = lgf->build_rwkv_token_shift_load(
                     ctx0, gf, state_copy, state_mask, ubatch, il, worst_case
             );

@@ -10559,10 +10559,10 @@ struct llm_build_context {
                     1
             );

-            cur = lgf.build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case);
+            cur = lgf->build_rwkv6_time_mix(ctx0, gf, att_norm, x_prev, state_copy, state_mask, ubatch, il, worst_case);

             token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
-            ggml_build_forward_expand(gf, lgf.build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case));
+            ggml_build_forward_expand(gf, lgf->build_rwkv_token_shift_store(ctx0, token_shift, ubatch, il, worst_case));

             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
             cb(ffn_inp, "ffn_inp", il);
@@ -10583,7 +10583,7 @@ struct llm_build_context {

             cur = ggml_add(ctx0, cur, ffn_inp);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -10628,7 +10628,7 @@ struct llm_build_context {
         // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = build_inp_pos();

-        lgf.build_attn_inp(ctx0, n_tokens, true, false, worst_case);
+        lgf->build_attn_inp(ctx0, n_tokens, true, false, worst_case);

         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
@@ -10744,7 +10744,7 @@ struct llm_build_context {
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);

-            cur = lgf.build_cvec(ctx0, cur, il);
+            cur = lgf->build_cvec(ctx0, cur, il);
             cb(cur, "l_out", il);

             // input for next layer
@@ -10936,7 +10936,7 @@ struct llm_build_context {

 llama_graph_result llama_model::build_graph(
         ggml_context * ctx,
-        llama_graph_i & lgf,
+        llama_graph_i * lgf,
         const llama_cparams & cparams,
         const llama_ubatch & ubatch,
         bool worst_case) const {
@@ -371,7 +371,7 @@ struct llama_model {
     // TODO: add encode/decode graphs
     llama_graph_result build_graph(
             ggml_context * ctx,
-            llama_graph_i & lgf,
+            llama_graph_i * lgf,
             const llama_cparams & cparams,
             const llama_ubatch & ubatch,
             bool worst_case) const;