kv-cache : prepare for abstraction
ggml-ci
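Summary of the change (a sketch reconstructed from the header diffs below, not the verbatim headers): llama_context::graph_init() now returns a ggml_cgraph * and the context keeps a persistent ctx_compute; graph_build() takes the graph explicitly; and the KV cache implements its own graph-build hooks, so the context no longer hard-codes the K-shift and defrag graphs.

// sketch only - reconstructed from this diff, not copied verbatim
class llama_graph_kv_cache_i {
public:
    // append the K-shift (RoPE) operations for all layers to the graph
    virtual void build_shift(
            ggml_context  * ctx0,
            ggml_cgraph   * gf,
            llama_graph_i * lgf) = 0;

    // find holes at the start of the cache and fill them by moving cells from the end
    virtual void build_defrag(
            ggml_context * ctx0,
            ggml_cgraph  * gf,
            int32_t        max_nodes,
            bool           v_trans) = 0;
};

// llama_kv_cache now implements this interface and keeps a reference to the hparams,
// so the context can delegate, as in the diff:
//   kv_self.build_shift (ctx_compute.get(), gf, this);
//   kv_self.build_defrag(ctx_compute.get(), gf, max_nodes(), !cparams.flash_attn);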
@@ -201,7 +201,7 @@ void llama_context::init() {
backend_ptrs.push_back(backend.get());
}

const size_t max_nodes = model.max_nodes();
const size_t max_nodes = this->max_nodes();

// buffer used to store the computation graph and the tensor meta data
// TODO: move to base class
@@ -255,39 +255,36 @@ void llama_context::init() {
// reserve pp graph first so that buffers are only allocated once
{
llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
auto ctx = graph_init();
auto res_pp = graph_build(ctx.get(), ubatch_pp, true);
auto & gf_pp = res_pp.gf;
if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) {
auto * gf = graph_init();
graph_build(ctx_compute.get(), gf, ubatch_pp, true);
if (!ggml_backend_sched_reserve(sched.get(), gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__);
throw std::runtime_error("failed to allocate compute buffers");
}

n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
n_nodes_pp = ggml_graph_n_nodes(gf_pp);
n_nodes_pp = ggml_graph_n_nodes(gf);
}

// reserve with tg graph to get the number of splits and nodes
{
llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
auto ctx = graph_init();
auto res_tg = graph_build(ctx.get(), ubatch_tg, true);
auto & gf_tg = res_tg.gf;
if (!ggml_backend_sched_reserve(sched.get(), gf_tg)) {
auto * gf = graph_init();
graph_build(ctx_compute.get(), gf, ubatch_tg, true);
if (!ggml_backend_sched_reserve(sched.get(), gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate compute tg buffers\n", __func__);
throw std::runtime_error("failed to allocate compute buffers");
}
n_splits_tg = ggml_backend_sched_get_n_splits(sched.get());
n_nodes_tg = ggml_graph_n_nodes(gf_tg);
n_nodes_tg = ggml_graph_n_nodes(gf);
}

// reserve again with pp graph to avoid ggml-alloc reallocations during inference
{
llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
auto ctx = graph_init();
auto res_pp = graph_build(ctx.get(), ubatch_pp, true);
auto & gf_pp = res_pp.gf;
if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) {
auto * gf = graph_init();
graph_build(ctx_compute.get(), gf, ubatch_pp, true);
if (!ggml_backend_sched_reserve(sched.get(), gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__);
throw std::runtime_error("failed to allocate compute buffers");
}
@@ -350,6 +347,10 @@ uint32_t llama_context::n_threads_batch() const {
return cparams.n_threads_batch;
}

int32_t llama_context::max_nodes() const {
return std::max<int32_t>(8192, 5*model.n_tensors());
}

enum llama_pooling_type llama_context::pooling_type() const {
return cparams.pooling_type;
}
@@ -555,7 +556,7 @@ void llama_context::synchronize() {
t_compute_start_us = 0;
}

ggml_context_ptr llama_context::graph_init() {
ggml_cgraph * llama_context::graph_init() {
inp_tokens = nullptr;
inp_embd = nullptr;
inp_pos = nullptr;
@@ -569,18 +570,21 @@ ggml_context_ptr llama_context::graph_init() {
/*.no_alloc =*/ true,
};

return ggml_context_ptr { ggml_init(params) };
ctx_compute.reset(ggml_init(params));

return ggml_new_graph_custom(ctx_compute.get(), max_nodes(), false);
}

llama_graph_result llama_context::graph_build(
ggml_context * ctx,
ggml_cgraph * gf,
const llama_ubatch & ubatch,
bool worst_case) {
return model.build_graph(ctx, this, cparams, ubatch, worst_case);
return model.build_graph(ctx, gf, this, cparams, ubatch, worst_case);
}

enum ggml_status llama_context::graph_compute(
ggml_cgraph * graph,
ggml_cgraph * gf,
bool batched) {
int n_threads = batched ? cparams.n_threads_batch : cparams.n_threads;
ggml_threadpool_t tp = batched ? threadpool_batch : threadpool;
@@ -596,7 +600,7 @@ enum ggml_status llama_context::graph_compute(
set_n_threads_fn.second(set_n_threads_fn.first, n_threads);
}

auto status = ggml_backend_sched_graph_compute_async(sched.get(), graph);
auto status = ggml_backend_sched_graph_compute_async(sched.get(), gf);
if (status != GGML_STATUS_SUCCESS) {
LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status);
}
@@ -881,7 +885,6 @@ void llama_context::output_reorder() {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void llama_context::build_cb(
|
||||
ggml_tensor * cur,
|
||||
const char * name,
|
||||
@@ -1010,6 +1013,55 @@ ggml_tensor * llama_context::build_rope_factors(int il) {
|
||||
return model.layers[il].rope_short;
|
||||
}
|
||||
|
||||
ggml_tensor * llama_context::build_rope_shift(
|
||||
ggml_context * ctx0,
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * shift,
|
||||
ggml_tensor * factors,
|
||||
ggml_backend_buffer * bbuf) {
|
||||
const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
|
||||
const auto & freq_base = cparams.rope_freq_base;
|
||||
const auto & freq_scale = cparams.rope_freq_scale;
|
||||
|
||||
const auto & yarn_ext_factor = cparams.yarn_ext_factor;
|
||||
const auto & yarn_attn_factor = cparams.yarn_attn_factor;
|
||||
const auto & yarn_beta_fast = cparams.yarn_beta_fast;
|
||||
const auto & yarn_beta_slow = cparams.yarn_beta_slow;
|
||||
|
||||
const auto & n_rot = model.hparams.n_rot;
|
||||
const auto & rope_type = model.hparams.rope_type;
|
||||
|
||||
struct ggml_tensor * tmp;
|
||||
|
||||
if (ggml_is_quantized(cur->type)) {
|
||||
// dequantize to f32 -> RoPE -> quantize back
|
||||
tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32);
|
||||
|
||||
if (bbuf) {
|
||||
for (auto & backend : backends) {
|
||||
// Figure out which backend KV cache belongs to
|
||||
if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) {
|
||||
ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get());
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tmp = ggml_rope_ext_inplace(ctx0, tmp,
|
||||
shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
|
||||
|
||||
tmp = ggml_cpy(ctx0, tmp, cur);
|
||||
} else {
|
||||
// we rotate only the first n_rot dimensions
|
||||
tmp = ggml_rope_ext_inplace(ctx0, cur,
|
||||
shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
|
||||
}
|
||||
|
||||
return tmp;
|
||||
}
|
||||
|
||||
ggml_tensor * llama_context::build_inp_embd(
|
||||
ggml_context * ctx0,
|
||||
ggml_tensor * tok_embd,
|
||||
@@ -1579,7 +1631,8 @@ void llama_context::perf_reset() {
|
||||
llama_context_kv_self::llama_context_kv_self(
|
||||
const llama_model & model,
|
||||
const llama_context_params & params) :
|
||||
llama_context(model, params) {
|
||||
llama_context(model, params),
|
||||
kv_self(model.hparams) {
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
LLAMA_LOG_DEBUG("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
|
||||
@@ -1640,13 +1693,13 @@ const llama_kv_cache * llama_context_kv_self::get_kv_self() const {
|
||||
return &kv_self;
|
||||
}
|
||||
|
||||
ggml_context_ptr llama_context_kv_self::graph_init() {
|
||||
ggml_cgraph * llama_context_kv_self::graph_init() {
|
||||
inp_KQ_mask = nullptr;
|
||||
inp_KQ_mask_cnv = nullptr;
|
||||
inp_KQ_mask_swa = nullptr;
|
||||
inp_KQ_mask_swa_cnv = nullptr;
|
||||
inp_KQ_mask_cross = nullptr;
|
||||
inp_K_shift = nullptr;
|
||||
inp_k_shift = nullptr;
|
||||
inp_s_copy = nullptr;
|
||||
inp_s_mask = nullptr;
|
||||
inp_embd_enc = nullptr;
|
||||
@@ -1719,10 +1772,8 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) {
|
||||
ggml_backend_sched_reset(sched.get());
|
||||
ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
|
||||
|
||||
auto ctx = graph_init();
|
||||
auto res = graph_build(ctx.get(), ubatch, false);
|
||||
|
||||
auto * gf = res.gf;
|
||||
auto * gf = graph_init();
|
||||
auto res = graph_build(ctx_compute.get(), gf, ubatch, false);
|
||||
|
||||
ggml_backend_sched_alloc_graph(sched.get(), gf);
|
||||
|
||||
@@ -1999,12 +2050,12 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) {
|
||||
llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
|
||||
llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
|
||||
|
||||
auto ctx = graph_init();
|
||||
auto res = graph_build(ctx.get(), ubatch, true);
|
||||
auto * gf = graph_init();
|
||||
graph_build(ctx_compute.get(), gf, ubatch, true);
|
||||
|
||||
// initialize scheduler with the worst-case graph
|
||||
ggml_backend_sched_reset(sched.get());
|
||||
if (!ggml_backend_sched_reserve(sched.get(), res.gf)) {
|
||||
if (!ggml_backend_sched_reserve(sched.get(), gf)) {
|
||||
LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
|
||||
}
|
||||
|
||||
@@ -2014,10 +2065,8 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) {
|
||||
ggml_backend_sched_reset(sched.get());
|
||||
ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
|
||||
|
||||
auto ctx = graph_init();
|
||||
auto res = graph_build(ctx.get(), ubatch, false);
|
||||
|
||||
auto * gf = res.gf;
|
||||
auto * gf = graph_init();
|
||||
auto res = graph_build(ctx_compute.get(), gf, ubatch, false);
|
||||
|
||||
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
|
||||
|
||||
@@ -2195,10 +2244,10 @@ uint32_t llama_context_kv_self::get_ctx_padding(const llama_cparams & cparams) c
|
||||
void llama_context_kv_self::input_set(const llama_ubatch & ubatch) {
|
||||
const llama_hparams & hparams = model.hparams;
|
||||
|
||||
if (inp_K_shift) {
|
||||
assert(ggml_backend_buffer_is_host(inp_K_shift->buffer));
|
||||
if (inp_k_shift) {
|
||||
assert(ggml_backend_buffer_is_host(inp_k_shift->buffer));
|
||||
|
||||
int32_t * data = (int32_t *) inp_K_shift->data;
|
||||
int32_t * data = (int32_t *) inp_k_shift->data;
|
||||
|
||||
for (uint32_t i = 0; i < kv_self.size; ++i) {
|
||||
data[i] = kv_self.cells[i].delta;
|
||||
@@ -2482,11 +2531,9 @@ void llama_context_kv_self::kv_self_update() {
|
||||
if (model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
|
||||
ggml_backend_sched_reset(sched.get());
|
||||
|
||||
auto ctx = graph_init();
|
||||
auto * gf = graph_init();
|
||||
|
||||
ggml_cgraph * gf = ggml_new_graph_custom(ctx.get(), model.max_nodes(), false);
|
||||
|
||||
build_kv_self_shift(ctx.get(), gf);
|
||||
kv_self.build_shift(ctx_compute.get(), gf, this);
|
||||
|
||||
ggml_backend_sched_alloc_graph(sched.get(), gf);
|
||||
|
||||
@@ -2510,11 +2557,9 @@ void llama_context_kv_self::kv_self_update() {
|
||||
if (kv.do_defrag) {
|
||||
ggml_backend_sched_reset(sched.get());
|
||||
|
||||
auto ctx = graph_init();
|
||||
auto * gf = graph_init();
|
||||
|
||||
ggml_cgraph * gf = ggml_new_graph_custom(ctx.get(), model.max_nodes(), false);
|
||||
|
||||
build_kv_self_defrag(ctx.get(), gf);
|
||||
kv_self.build_defrag(ctx_compute.get(), gf, max_nodes(), !cparams.flash_attn);
|
||||
|
||||
ggml_backend_sched_alloc_graph(sched.get(), gf);
|
||||
|
||||
@@ -2529,6 +2574,13 @@ void llama_context_kv_self::kv_self_update() {
|
||||
}
|
||||
}
|
||||
|
||||
ggml_tensor * llama_context_kv_self::build_inp_k_shift(ggml_context * ctx0) {
|
||||
inp_k_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx());
|
||||
ggml_set_input(inp_k_shift);
|
||||
|
||||
return inp_k_shift;
|
||||
}
|
||||
|
||||
void llama_context_kv_self::build_attn_inp(
|
||||
ggml_context * ctx0,
|
||||
int32_t n_tokens,
|
||||
@@ -2765,348 +2817,6 @@ ggml_tensor * llama_context_kv_self::build_attn_soft_max(
|
||||
return ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias);
|
||||
}
|
||||
|
||||
void llama_context_kv_self::build_kv_self_shift(
|
||||
ggml_context * ctx0,
|
||||
ggml_cgraph * graph) {
|
||||
const auto & n_ctx = cparams.n_ctx;
|
||||
const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
|
||||
const auto & freq_base = cparams.rope_freq_base;
|
||||
const auto & freq_scale = cparams.rope_freq_scale;
|
||||
|
||||
const auto & yarn_ext_factor = cparams.yarn_ext_factor;
|
||||
const auto & yarn_attn_factor = cparams.yarn_attn_factor;
|
||||
const auto & yarn_beta_fast = cparams.yarn_beta_fast;
|
||||
const auto & yarn_beta_slow = cparams.yarn_beta_slow;
|
||||
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const auto & n_rot = hparams.n_rot;
|
||||
const auto & n_layer = hparams.n_layer;
|
||||
const auto & rope_type = hparams.rope_type;
|
||||
|
||||
const auto & n_embd_head_k = hparams.n_embd_head_k;
|
||||
//const auto & n_embd_head_v = hparams.n_embd_head_v;
|
||||
|
||||
GGML_ASSERT(kv_self.size == n_ctx);
|
||||
|
||||
inp_K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
|
||||
//cb(inp_K_shift, "K_shift", -1);
|
||||
ggml_set_input(inp_K_shift);
|
||||
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
const int64_t n_head_kv = hparams.n_head_kv(il);
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
|
||||
|
||||
struct ggml_tensor * rope_factors = build_rope_factors(il);
|
||||
|
||||
struct ggml_tensor * k =
|
||||
ggml_view_3d(ctx0, kv_self.k_l[il],
|
||||
n_embd_head_k, n_head_kv, n_ctx,
|
||||
ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
|
||||
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
|
||||
0);
|
||||
|
||||
struct ggml_tensor * tmp;
|
||||
if (ggml_is_quantized(k->type)) {
|
||||
// dequantize to f32 -> RoPE -> quantize back
|
||||
tmp = ggml_cast(ctx0, k, GGML_TYPE_F32);
|
||||
//cb(tmp, "K_f32", il);
|
||||
|
||||
for (auto & backend : backends) {
|
||||
// Figure out which backend KV cache belongs to
|
||||
if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(kv_self.k_l[il]->buffer))) {
|
||||
ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get());
|
||||
break;
|
||||
}
|
||||
}
|
||||
tmp = ggml_rope_ext_inplace(ctx0, tmp,
|
||||
inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
|
||||
//cb(tmp, "K_shifted_f32", il);
|
||||
|
||||
tmp = ggml_cpy(ctx0, tmp, k);
|
||||
} else {
|
||||
// we rotate only the first n_rot dimensions
|
||||
tmp = ggml_rope_ext_inplace(ctx0, k,
|
||||
inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
|
||||
}
|
||||
//cb(tmp, "K_shifted", il);
|
||||
|
||||
ggml_build_forward_expand(graph, tmp);
|
||||
}
|
||||
}
|
||||
|
||||
void llama_context_kv_self::build_kv_self_defrag(
|
||||
ggml_context * ctx0,
|
||||
ggml_cgraph * graph) {
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
|
||||
const uint32_t n_kv = kv_self.cell_max();
|
||||
const uint32_t n_used = kv_self.used;
|
||||
|
||||
assert(n_used <= n_kv);
|
||||
|
||||
//const int64_t t_start = ggml_time_us();
|
||||
|
||||
// number of cells moved
|
||||
uint32_t n_moves = 0;
|
||||
|
||||
// each move requires 6*n_layer tensors (see build_kv_self_defrag)
|
||||
// - source view, destination view, copy operation
|
||||
// - x2 for keys and values
|
||||
//const uint32_t max_moves = model.max_nodes()/(6*n_layer);
|
||||
// TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
|
||||
const uint32_t max_moves = (model.max_nodes() - 2*n_layer)/(6*n_layer);
|
||||
|
||||
// determine which KV cells to move where
|
||||
//
|
||||
// cell i moves to ids[i]
|
||||
//
|
||||
// if ids[i] == i || ids[i] == n_kv, then cell i is not moved
|
||||
//
|
||||
std::vector<uint32_t> ids(n_kv, n_kv);
|
||||
|
||||
for (uint32_t i0 = 0; i0 < n_used; ++i0) {
|
||||
const auto & cell0 = kv_self.cells[i0];
|
||||
|
||||
if (!cell0.is_empty()) {
|
||||
ids[i0] = i0;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// found a hole - fill it with data from the end of the cache
|
||||
|
||||
uint32_t nh = 1;
|
||||
|
||||
// determine the size of the hole
|
||||
while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) {
|
||||
nh++;
|
||||
}
|
||||
|
||||
uint32_t nf = 0;
|
||||
uint32_t is = n_kv - 1;
|
||||
|
||||
// starting from the end, find nh non-empty cells
|
||||
for (; is > i0; --is) {
|
||||
const auto & cell1 = kv_self.cells[is];
|
||||
|
||||
if (cell1.is_empty() || ids[is] != n_kv) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// non-empty cell which is not yet moved
|
||||
nf++;
|
||||
|
||||
if (nf == nh) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// this can only happen if `n_used` is not accurate, which would be a bug
|
||||
GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
|
||||
|
||||
nf = 0;
|
||||
|
||||
uint32_t i1 = is;
|
||||
|
||||
// are we moving a continuous block of memory?
|
||||
bool cont = false;
|
||||
|
||||
// should we stop searching for the next move?
|
||||
bool stop = false;
|
||||
|
||||
// go back and move the nf cells to the hole
|
||||
for (; i1 < n_kv; ++i1) {
|
||||
auto & cell1 = kv_self.cells[i1];
|
||||
|
||||
if (cell1.is_empty() || ids[i1] != n_kv) {
|
||||
if (n_moves == max_moves) {
|
||||
stop = true;
|
||||
break;
|
||||
}
|
||||
|
||||
cont = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
// this cell goes to (i0 + nf)
|
||||
ids[i1] = i0 + nf;
|
||||
|
||||
// move the cell meta data
|
||||
kv_self.cells[i0 + nf] = cell1;
|
||||
|
||||
// clear the old cell and move the head there
|
||||
cell1 = llama_kv_cell();
|
||||
kv_self.head = n_used;
|
||||
|
||||
if (!cont) {
|
||||
n_moves++;
|
||||
cont = true;
|
||||
}
|
||||
|
||||
nf++;
|
||||
|
||||
if (nf == nh) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (stop || n_moves == max_moves) {
|
||||
break;
|
||||
}
|
||||
|
||||
//LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
|
||||
|
||||
i0 += nh - 1;
|
||||
}
|
||||
|
||||
if (n_moves == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
//LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
|
||||
|
||||
//LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
|
||||
|
||||
#if 0
|
||||
// CPU defrag
|
||||
//
|
||||
// TODO: optimizations are possible:
|
||||
// - multiple threads
|
||||
// - avoid copying to the host memory when already there
|
||||
//
|
||||
// likely not worth the effort, as we have ggml_graph based defrag
|
||||
//
|
||||
|
||||
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
||||
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
const uint32_t kv_size = kv_self.size;
|
||||
|
||||
std::vector<uint8_t> buf_k;
|
||||
std::vector<uint8_t> buf_v;
|
||||
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
|
||||
const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size);
|
||||
|
||||
const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
|
||||
const size_t v_size = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size);
|
||||
|
||||
buf_k.resize(k_size);
|
||||
buf_v.resize(v_size);
|
||||
|
||||
ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
|
||||
ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
|
||||
|
||||
// batch move [i, i+nm) to [id, id+nm)
|
||||
// note: cells can move only to a lower index
|
||||
for (uint32_t i = 0; i < n_kv; ++i) {
|
||||
const uint32_t id = ids[i];
|
||||
|
||||
if (i == id || id == n_kv) {
|
||||
continue;
|
||||
}
|
||||
|
||||
uint32_t nm = 1;
|
||||
|
||||
while (i + nm < n_kv && ids[i + nm] == id + nm) {
|
||||
nm++;
|
||||
}
|
||||
|
||||
// move keys
|
||||
{
|
||||
const int64_t os = i*k_size_row;
|
||||
const int64_t od = id*k_size_row;
|
||||
|
||||
memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
|
||||
}
|
||||
|
||||
// move values (note: they are transposed)
|
||||
{
|
||||
const int64_t os = i;
|
||||
const int64_t od = id;
|
||||
|
||||
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
||||
memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
|
||||
}
|
||||
}
|
||||
|
||||
i += nm - 1;
|
||||
}
|
||||
|
||||
ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
|
||||
ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
|
||||
}
|
||||
#else
|
||||
for (uint32_t i = 0; i < ids.size(); ++i) {
|
||||
const uint32_t id = ids[i];
|
||||
|
||||
if (i == id || id == ids.size()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
uint32_t nm = 1;
|
||||
|
||||
while (i + nm < ids.size() && ids[i + nm] == id + nm) {
|
||||
nm++;
|
||||
}
|
||||
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
|
||||
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
|
||||
|
||||
ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
|
||||
n_embd_k_gqa, nm,
|
||||
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
|
||||
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
|
||||
|
||||
ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
|
||||
n_embd_k_gqa, nm,
|
||||
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
|
||||
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
|
||||
|
||||
ggml_tensor * view_v_src;
|
||||
ggml_tensor * view_v_dst;
|
||||
|
||||
if (cparams.flash_attn) {
|
||||
// NOTE: the V cache is not transposed when using flash attention
|
||||
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
|
||||
n_embd_v_gqa, nm,
|
||||
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
|
||||
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
|
||||
|
||||
view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
|
||||
n_embd_v_gqa, nm,
|
||||
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
|
||||
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
|
||||
} else {
|
||||
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
|
||||
nm, n_embd_v_gqa,
|
||||
ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
|
||||
ggml_row_size(kv_self.v_l[il]->type, i));
|
||||
|
||||
view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
|
||||
nm, n_embd_v_gqa,
|
||||
ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
|
||||
ggml_row_size(kv_self.v_l[il]->type, id));
|
||||
}
|
||||
|
||||
ggml_build_forward_expand(graph, ggml_cpy(ctx0, view_k_src, view_k_dst));
|
||||
ggml_build_forward_expand(graph, ggml_cpy(ctx0, view_v_src, view_v_dst));
|
||||
}
|
||||
|
||||
i += nm - 1;
|
||||
}
|
||||
|
||||
//LLAMA_LOG_INFO("graph->n_nodes = %d\n", graph->n_nodes);
|
||||
#endif
|
||||
}
|
||||
|
||||
ggml_tensor * llama_context_kv_self::build_inp_embd_enc(
|
||||
ggml_context * ctx0,
|
||||
int32_t n_tokens,
|
||||
@@ -3162,7 +2872,7 @@ ggml_tensor * llama_context_kv_self::build_inp_s_mask(
|
||||
|
||||
ggml_tensor * llama_context_kv_self::build_copy_mask_state(
|
||||
ggml_context * ctx0,
|
||||
ggml_cgraph * graph,
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * s,
|
||||
ggml_tensor * state_copy,
|
||||
ggml_tensor * state_mask,
|
||||
@@ -3185,7 +2895,7 @@ ggml_tensor * llama_context_kv_self::build_copy_mask_state(
|
||||
states = ggml_mul(ctx0, states, state_mask);
|
||||
|
||||
// copy states which won't be changed further (between n_seqs and n_kv)
|
||||
ggml_build_forward_expand(graph,
|
||||
ggml_build_forward_expand(gf,
|
||||
ggml_cpy(ctx0,
|
||||
ggml_view_1d(ctx0, states, n_state*(n_kv - n_seqs), (n_seqs )*n_state*ggml_element_size(states)),
|
||||
ggml_view_1d(ctx0, s, n_state*(n_kv - n_seqs), (kv_head + n_seqs)*n_state*ggml_element_size(s))));
|
||||
@@ -3197,7 +2907,7 @@ ggml_tensor * llama_context_kv_self::build_copy_mask_state(
|
||||
// TODO: split
|
||||
ggml_tensor * llama_context_kv_self::build_mamba_layer(
|
||||
ggml_context * ctx0,
|
||||
ggml_cgraph * graph,
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * state_copy,
|
||||
ggml_tensor * state_mask,
|
||||
@@ -3231,11 +2941,11 @@ ggml_tensor * llama_context_kv_self::build_mamba_layer(
|
||||
|
||||
// (ab)using the KV cache to store the states
|
||||
struct ggml_tensor * conv = build_copy_mask_state(
|
||||
ctx0, graph, conv_states_all, state_copy, state_mask,
|
||||
ctx0, gf, conv_states_all, state_copy, state_mask,
|
||||
n_tokens, hparams.n_embd_k_s(), n_seqs, worst_case);
|
||||
conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
|
||||
struct ggml_tensor * ssm = build_copy_mask_state(
|
||||
ctx0, graph, ssm_states_all, state_copy, state_mask,
|
||||
ctx0, gf, ssm_states_all, state_copy, state_mask,
|
||||
n_tokens, hparams.n_embd_v_s(), n_seqs, worst_case);
|
||||
ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs);
|
||||
|
||||
@@ -3257,7 +2967,7 @@ ggml_tensor * llama_context_kv_self::build_mamba_layer(
|
||||
// copy last (d_conv - 1) columns back into the state cache
|
||||
struct ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
|
||||
|
||||
ggml_build_forward_expand(graph,
|
||||
ggml_build_forward_expand(gf,
|
||||
ggml_cpy(ctx0, last_conv,
|
||||
ggml_view_1d(ctx0, conv_states_all,
|
||||
(d_conv - 1)*(d_inner)*(n_seqs),
|
||||
@@ -3306,7 +3016,7 @@ ggml_tensor * llama_context_kv_self::build_mamba_layer(
|
||||
struct ggml_tensor * y_ssm = ggml_ssm_scan(ctx0, ssm, x, dt, model.layers[il].ssm_a, B, C);
|
||||
|
||||
// store last states
|
||||
ggml_build_forward_expand(graph,
|
||||
ggml_build_forward_expand(gf,
|
||||
ggml_cpy(ctx0,
|
||||
ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]),
|
||||
ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
|
||||
@@ -3333,7 +3043,7 @@ ggml_tensor * llama_context_kv_self::build_mamba_layer(
|
||||
|
||||
ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_load(
|
||||
ggml_context * ctx0,
|
||||
ggml_cgraph * graph,
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * state_copy,
|
||||
ggml_tensor * state_mask,
|
||||
const llama_ubatch & ubatch,
|
||||
@@ -3349,7 +3059,7 @@ ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_load(
|
||||
struct ggml_tensor * token_shift_all = kv_self.k_l[il];
|
||||
|
||||
struct ggml_tensor * token_shift = build_copy_mask_state(
|
||||
ctx0, graph, token_shift_all, state_copy, state_mask,
|
||||
ctx0, gf, token_shift_all, state_copy, state_mask,
|
||||
n_tokens, hparams.n_embd_k_s(), n_seqs, worst_case);
|
||||
|
||||
token_shift = ggml_reshape_3d(ctx0, token_shift, hparams.n_embd, token_shift_count, n_seqs);
|
||||
@@ -3384,7 +3094,7 @@ ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_store(
|
||||
|
||||
ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix(
|
||||
ggml_context * ctx0,
|
||||
ggml_cgraph * graph,
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * x_prev,
|
||||
ggml_tensor * state_copy,
|
||||
@@ -3509,7 +3219,7 @@ ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix(
|
||||
}
|
||||
|
||||
struct ggml_tensor * wkv_state = build_copy_mask_state(
|
||||
ctx0, graph, kv_self.v_l[il], state_copy, state_mask,
|
||||
ctx0, gf, kv_self.v_l[il], state_copy, state_mask,
|
||||
n_tokens, hparams.n_embd_v_s(), n_seqs, worst_case);
|
||||
|
||||
struct ggml_tensor * wkv_output;
|
||||
@@ -3522,7 +3232,7 @@ ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix(
|
||||
wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));
|
||||
|
||||
ggml_build_forward_expand(
|
||||
graph,
|
||||
gf,
|
||||
ggml_cpy(
|
||||
ctx0,
|
||||
wkv_state,
|
||||
@@ -3558,7 +3268,7 @@ ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix(
|
||||
size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) {
|
||||
llama_context::state_get_data(io);
|
||||
|
||||
kv_self.state_write(io, model.hparams);
|
||||
kv_self.state_write(io);
|
||||
|
||||
return io.n_bytes();
|
||||
}
|
||||
@@ -3566,7 +3276,7 @@ size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) {
|
||||
size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) {
|
||||
llama_context::state_set_data(io);
|
||||
|
||||
kv_self.state_read(io, model.hparams);
|
||||
kv_self.state_read(io);
|
||||
|
||||
return io.n_bytes();
|
||||
}
|
||||
@@ -3574,7 +3284,7 @@ size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) {
|
||||
size_t llama_context_kv_self::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) {
|
||||
llama_context::state_seq_get_data(io, seq_id);
|
||||
|
||||
kv_self.state_write(io, model.hparams, seq_id);
|
||||
kv_self.state_write(io, seq_id);
|
||||
|
||||
return io.n_bytes();
|
||||
}
|
||||
@@ -3582,7 +3292,7 @@ size_t llama_context_kv_self::state_seq_get_data(llama_io_write_i & io, llama_se
|
||||
size_t llama_context_kv_self::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) {
|
||||
llama_context::state_seq_set_data(io, seq_id);
|
||||
|
||||
kv_self.state_read(io, model.hparams, seq_id);
|
||||
kv_self.state_read(io, seq_id);
|
||||
|
||||
return io.n_bytes();
|
||||
}
|
||||
|
||||
@@ -43,6 +43,8 @@ struct llama_context : public llama_graph_i {
|
||||
virtual uint32_t n_threads() const;
|
||||
virtual uint32_t n_threads_batch() const;
|
||||
|
||||
virtual int32_t max_nodes() const;
|
||||
|
||||
virtual llama_kv_cache * get_kv_self() = 0;
|
||||
virtual const llama_kv_cache * get_kv_self() const = 0;
|
||||
|
||||
@@ -93,17 +95,18 @@ struct llama_context : public llama_graph_i {
|
||||
virtual void synchronize();
|
||||
|
||||
// zero-out inputs and create ggml_context
|
||||
virtual ggml_context_ptr graph_init();
|
||||
virtual ggml_cgraph * graph_init();
|
||||
|
||||
// TODO: add encode/decode graphs
|
||||
virtual llama_graph_result graph_build(
|
||||
ggml_context * ctx,
|
||||
ggml_cgraph * gf,
|
||||
const llama_ubatch & ubatch,
|
||||
bool worst_case);
|
||||
|
||||
// returns the result of ggml_backend_sched_graph_compute_async execution
|
||||
virtual enum ggml_status graph_compute(
|
||||
ggml_cgraph * graph,
|
||||
ggml_cgraph * gf,
|
||||
bool batched);
|
||||
|
||||
virtual void input_set(const llama_ubatch & ubatch);
|
||||
@@ -172,6 +175,13 @@ struct llama_context : public llama_graph_i {
|
||||
|
||||
virtual ggml_tensor * build_rope_factors(int il);
|
||||
|
||||
virtual ggml_tensor * build_rope_shift(
|
||||
ggml_context * ctx0,
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * shift,
|
||||
ggml_tensor * factors,
|
||||
ggml_backend_buffer * bbuf);
|
||||
|
||||
virtual ggml_tensor * build_inp_embd(
|
||||
ggml_context * ctx0,
|
||||
ggml_tensor * tok_embd,
|
||||
@@ -274,6 +284,8 @@ protected:
|
||||
|
||||
ggml_backend_sched_ptr sched;
|
||||
|
||||
ggml_context_ptr ctx_compute;
|
||||
|
||||
// memory buffers used to evaluate the model
|
||||
std::vector<uint8_t> buf_compute_meta;
|
||||
|
||||
@@ -332,7 +344,7 @@ public:
|
||||
|
||||
virtual void kv_self_update() override;
|
||||
|
||||
virtual ggml_context_ptr graph_init() override;
|
||||
virtual ggml_cgraph * graph_init() override;
|
||||
|
||||
virtual void input_set(const llama_ubatch & ubatch) override;
|
||||
|
||||
@@ -349,11 +361,13 @@ public:
|
||||
|
||||
llama_kv_cache kv_self;
|
||||
|
||||
struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
|
||||
struct ggml_tensor * inp_KQ_mask_cnv; // [kv_size, n_batch]
|
||||
struct ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch]
|
||||
struct ggml_tensor * inp_KQ_mask_swa_cnv; // [kv_size, n_batch]
|
||||
struct ggml_tensor * inp_K_shift; // I32 [kv_size]
|
||||
ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
|
||||
ggml_tensor * inp_KQ_mask_cnv; // [kv_size, n_batch]
|
||||
ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch]
|
||||
ggml_tensor * inp_KQ_mask_swa_cnv; // [kv_size, n_batch]
|
||||
ggml_tensor * inp_k_shift; // I32 [kv_size]
|
||||
|
||||
virtual ggml_tensor * build_inp_k_shift(ggml_context * ctx0) override;
|
||||
|
||||
virtual void build_attn_inp(
|
||||
ggml_context * ctx0,
|
||||
@@ -387,15 +401,6 @@ public:
|
||||
ggml_tensor * kq,
|
||||
float kq_scale) override;
|
||||
|
||||
virtual void build_kv_self_shift(
|
||||
ggml_context * ctx0,
|
||||
ggml_cgraph * graph) override;
|
||||
|
||||
// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
|
||||
virtual void build_kv_self_defrag(
|
||||
ggml_context * ctx0,
|
||||
ggml_cgraph * graph) override;
|
||||
|
||||
// === encoder-decoder ===
|
||||
|
||||
// whether we are computing encoder output or decoder output
|
||||
|
||||
@@ -8,11 +8,10 @@
|
||||
struct ggml_cgraph;
|
||||
struct ggml_context;
|
||||
struct ggml_tensor;
|
||||
struct ggml_backend_buffer;
|
||||
struct llama_ubatch;
|
||||
|
||||
struct llama_graph_result {
|
||||
ggml_cgraph * gf = nullptr;
|
||||
|
||||
// important graph nodes
|
||||
ggml_tensor * t_logits = nullptr;
|
||||
ggml_tensor * t_embd = nullptr;
|
||||
@@ -50,6 +49,14 @@ public:
|
||||
|
||||
virtual ggml_tensor * build_rope_factors(int il) = 0;
|
||||
|
||||
// note: optionally set the backend to be the same as the bbuf's backend
|
||||
virtual ggml_tensor * build_rope_shift(
|
||||
ggml_context * ctx0,
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * shift,
|
||||
ggml_tensor * factors,
|
||||
ggml_backend_buffer * bbuft) = 0;
|
||||
|
||||
// graph build API (context-specific)
|
||||
|
||||
virtual ggml_tensor * build_inp_embd(
|
||||
@@ -83,7 +90,7 @@ public:
|
||||
|
||||
virtual void build_attn_kv_store(
|
||||
ggml_context * ctx0,
|
||||
ggml_cgraph * graph,
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * k_cur,
|
||||
ggml_tensor * v_cur,
|
||||
int32_t n_tokens,
|
||||
@@ -92,7 +99,7 @@ public:
|
||||
|
||||
virtual ggml_tensor * build_attn_qkv(
|
||||
ggml_context * ctx0,
|
||||
ggml_cgraph * graph,
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * wo,
|
||||
ggml_tensor * wo_b,
|
||||
ggml_tensor * q_cur,
|
||||
@@ -106,14 +113,8 @@ public:
|
||||
ggml_tensor * kq,
|
||||
float kq_scale) = 0;
|
||||
|
||||
virtual void build_kv_self_shift(
|
||||
ggml_context * ctx0,
|
||||
ggml_cgraph * graph) = 0;
|
||||
|
||||
// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
|
||||
virtual void build_kv_self_defrag(
|
||||
ggml_context * ctx0,
|
||||
ggml_cgraph * graph) = 0;
|
||||
virtual ggml_tensor * build_inp_k_shift(
|
||||
ggml_context * ctx0) = 0;
|
||||
|
||||
virtual ggml_tensor * build_inp_embd_enc(
|
||||
ggml_context * ctx0,
|
||||
@@ -135,7 +136,7 @@ public:
|
||||
|
||||
virtual ggml_tensor * build_copy_mask_state(
|
||||
ggml_context * ctx0,
|
||||
ggml_cgraph * graph,
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * s,
|
||||
ggml_tensor * state_copy,
|
||||
ggml_tensor * state_mask,
|
||||
@@ -146,7 +147,7 @@ public:
|
||||
|
||||
virtual ggml_tensor * build_mamba_layer(
|
||||
ggml_context * ctx0,
|
||||
ggml_cgraph * graph,
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * state_copy,
|
||||
ggml_tensor * state_mask,
|
||||
@@ -156,7 +157,7 @@ public:
|
||||
|
||||
virtual ggml_tensor * build_rwkv_token_shift_load(
|
||||
ggml_context * ctx0,
|
||||
ggml_cgraph * graph,
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * state_copy,
|
||||
ggml_tensor * state_mask,
|
||||
const llama_ubatch & ubatch,
|
||||
@@ -172,7 +173,7 @@ public:
|
||||
|
||||
virtual ggml_tensor * build_rwkv6_time_mix(
|
||||
ggml_context * ctx0,
|
||||
ggml_cgraph * graph,
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * x_prev,
|
||||
ggml_tensor * state_copy,
|
||||
@@ -181,3 +182,18 @@ public:
int il,
bool worst_case) = 0;
};

class llama_graph_kv_cache_i {
public:
virtual void build_shift(
ggml_context * ctx0,
ggml_cgraph * gf,
llama_graph_i * lgf) = 0;

// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
virtual void build_defrag(
ggml_context * ctx0,
ggml_cgraph * gf,
int32_t max_nodes,
bool v_trans) = 0;
};

@@ -13,6 +13,9 @@

static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false};

llama_kv_cache::llama_kv_cache(const llama_hparams & hparams) : hparams(hparams) {
}

bool llama_kv_cache::init(
const llama_model & model,
const llama_cparams & cparams,
@@ -20,8 +23,6 @@ bool llama_kv_cache::init(
ggml_type type_v,
uint32_t kv_size,
bool offload) {
const struct llama_hparams & hparams = model.hparams;

const int32_t n_layer = hparams.n_layer;

has_shift = false;
@@ -698,7 +699,309 @@ size_t llama_kv_cache::size_v_bytes() const {
|
||||
return size_v_bytes;
|
||||
}
|
||||
|
||||
void llama_kv_cache::state_write(llama_io_write_i & io, const llama_hparams & hparams, llama_seq_id seq_id) const {
|
||||
void llama_kv_cache::build_shift(
|
||||
ggml_context * ctx0,
|
||||
ggml_cgraph * gf,
|
||||
llama_graph_i * lgf) {
|
||||
const auto & n_layer = hparams.n_layer;
|
||||
|
||||
const auto & n_embd_head_k = hparams.n_embd_head_k;
|
||||
//const auto & n_embd_head_v = hparams.n_embd_head_v;
|
||||
|
||||
//GGML_ASSERT(kv_self.size == n_ctx);
|
||||
|
||||
ggml_tensor * inp_k_shift = lgf->build_inp_k_shift(ctx0);
|
||||
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
const int64_t n_head_kv = hparams.n_head_kv(il);
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
|
||||
|
||||
struct ggml_tensor * rope_factors = lgf->build_rope_factors(il);
|
||||
|
||||
struct ggml_tensor * k =
|
||||
ggml_view_3d(ctx0, k_l[il],
|
||||
n_embd_head_k, n_head_kv, size,
|
||||
ggml_row_size(k_l[il]->type, n_embd_head_k),
|
||||
ggml_row_size(k_l[il]->type, n_embd_k_gqa),
|
||||
0);
|
||||
|
||||
ggml_tensor * cur = lgf->build_rope_shift(ctx0, k, inp_k_shift, rope_factors, k_l[il]->buffer);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
}
|
||||
|
||||
void llama_kv_cache::build_defrag(
|
||||
ggml_context * ctx0,
|
||||
ggml_cgraph * gf,
|
||||
int32_t max_nodes,
|
||||
bool v_trans) {
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
|
||||
const uint32_t n_kv = cell_max();
|
||||
const uint32_t n_used = used;
|
||||
|
||||
assert(n_used <= n_kv);
|
||||
|
||||
//const int64_t t_start = ggml_time_us();
|
||||
|
||||
// number of cells moved
|
||||
uint32_t n_moves = 0;
|
||||
|
||||
// each move requires 6*n_layer tensors (see build_kv_self_defrag)
|
||||
// - source view, destination view, copy operation
|
||||
// - x2 for keys and values
|
||||
//const uint32_t max_moves = max_nodes/(6*n_layer);
|
||||
// TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
|
||||
const uint32_t max_moves = (max_nodes - 2*n_layer)/(6*n_layer);
|
||||
|
||||
// determine which KV cells to move where
|
||||
//
|
||||
// cell i moves to ids[i]
|
||||
//
|
||||
// if ids[i] == i || ids[i] == n_kv, then cell i is not moved
|
||||
//
|
||||
std::vector<uint32_t> ids(n_kv, n_kv);
|
||||
|
||||
for (uint32_t i0 = 0; i0 < n_used; ++i0) {
|
||||
const auto & cell0 = cells[i0];
|
||||
|
||||
if (!cell0.is_empty()) {
|
||||
ids[i0] = i0;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// found a hole - fill it with data from the end of the cache
|
||||
|
||||
uint32_t nh = 1;
|
||||
|
||||
// determine the size of the hole
|
||||
while (i0 + nh < n_used && cells[i0 + nh].is_empty()) {
|
||||
nh++;
|
||||
}
|
||||
|
||||
uint32_t nf = 0;
|
||||
uint32_t is = n_kv - 1;
|
||||
|
||||
// starting from the end, find nh non-empty cells
|
||||
for (; is > i0; --is) {
|
||||
const auto & cell1 = cells[is];
|
||||
|
||||
if (cell1.is_empty() || ids[is] != n_kv) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// non-empty cell which is not yet moved
|
||||
nf++;
|
||||
|
||||
if (nf == nh) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// this can only happen if `n_used` is not accurate, which would be a bug
|
||||
GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
|
||||
|
||||
nf = 0;
|
||||
|
||||
uint32_t i1 = is;
|
||||
|
||||
// are we moving a continuous block of memory?
|
||||
bool cont = false;
|
||||
|
||||
// should we stop searching for the next move?
|
||||
bool stop = false;
|
||||
|
||||
// go back and move the nf cells to the hole
|
||||
for (; i1 < n_kv; ++i1) {
|
||||
auto & cell1 = cells[i1];
|
||||
|
||||
if (cell1.is_empty() || ids[i1] != n_kv) {
|
||||
if (n_moves == max_moves) {
|
||||
stop = true;
|
||||
break;
|
||||
}
|
||||
|
||||
cont = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
// this cell goes to (i0 + nf)
|
||||
ids[i1] = i0 + nf;
|
||||
|
||||
// move the cell meta data
|
||||
cells[i0 + nf] = cell1;
|
||||
|
||||
// clear the old cell and move the head there
|
||||
cell1 = llama_kv_cell();
|
||||
head = n_used;
|
||||
|
||||
if (!cont) {
|
||||
n_moves++;
|
||||
cont = true;
|
||||
}
|
||||
|
||||
nf++;
|
||||
|
||||
if (nf == nh) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (stop || n_moves == max_moves) {
|
||||
break;
|
||||
}
|
||||
|
||||
//LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
|
||||
|
||||
i0 += nh - 1;
|
||||
}
|
||||
|
||||
if (n_moves == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
//LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
|
||||
|
||||
//LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
|
||||
|
||||
#if 0
|
||||
// CPU defrag
|
||||
//
|
||||
// TODO: optimizations are possible:
|
||||
// - multiple threads
|
||||
// - avoid copying to the host memory when already there
|
||||
//
|
||||
// likely not worth the effort, as we have ggml_graph based defrag
|
||||
//
|
||||
|
||||
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
||||
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
const uint32_t kv_size = size;
|
||||
|
||||
std::vector<uint8_t> buf_k;
|
||||
std::vector<uint8_t> buf_v;
|
||||
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
|
||||
const size_t k_size = ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size);
|
||||
|
||||
const size_t v_size_el = ggml_type_size(v_l[il]->type);
|
||||
const size_t v_size = ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size);
|
||||
|
||||
buf_k.resize(k_size);
|
||||
buf_v.resize(v_size);
|
||||
|
||||
ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size());
|
||||
ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size());
|
||||
|
||||
// batch move [i, i+nm) to [id, id+nm)
|
||||
// note: cells can move only to a lower index
|
||||
for (uint32_t i = 0; i < n_kv; ++i) {
|
||||
const uint32_t id = ids[i];
|
||||
|
||||
if (i == id || id == n_kv) {
|
||||
continue;
|
||||
}
|
||||
|
||||
uint32_t nm = 1;
|
||||
|
||||
while (i + nm < n_kv && ids[i + nm] == id + nm) {
|
||||
nm++;
|
||||
}
|
||||
|
||||
// move keys
|
||||
{
|
||||
const int64_t os = i*k_size_row;
|
||||
const int64_t od = id*k_size_row;
|
||||
|
||||
memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
|
||||
}
|
||||
|
||||
// move values (note: they are transposed)
|
||||
{
|
||||
const int64_t os = i;
|
||||
const int64_t od = id;
|
||||
|
||||
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
||||
memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
|
||||
}
|
||||
}
|
||||
|
||||
i += nm - 1;
|
||||
}
|
||||
|
||||
ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size());
|
||||
ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
|
||||
}
|
||||
#else
|
||||
for (uint32_t i = 0; i < ids.size(); ++i) {
|
||||
const uint32_t id = ids[i];
|
||||
|
||||
if (i == id || id == ids.size()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
uint32_t nm = 1;
|
||||
|
||||
while (i + nm < ids.size() && ids[i + nm] == id + nm) {
|
||||
nm++;
|
||||
}
|
||||
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
|
||||
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
|
||||
|
||||
ggml_tensor * view_k_src = ggml_view_2d(ctx0, k_l[il],
|
||||
n_embd_k_gqa, nm,
|
||||
ggml_row_size(k_l[il]->type, n_embd_k_gqa),
|
||||
ggml_row_size(k_l[il]->type, n_embd_k_gqa*i));
|
||||
|
||||
ggml_tensor * view_k_dst = ggml_view_2d(ctx0, k_l[il],
|
||||
n_embd_k_gqa, nm,
|
||||
ggml_row_size(k_l[il]->type, n_embd_k_gqa),
|
||||
ggml_row_size(k_l[il]->type, n_embd_k_gqa*id));
|
||||
|
||||
ggml_tensor * view_v_src;
|
||||
ggml_tensor * view_v_dst;
|
||||
|
||||
if (!v_trans) {
|
||||
// NOTE: the V cache is not transposed when using flash attention
|
||||
view_v_src = ggml_view_2d(ctx0, v_l[il],
|
||||
n_embd_v_gqa, nm,
|
||||
ggml_row_size(v_l[il]->type, n_embd_v_gqa),
|
||||
ggml_row_size(v_l[il]->type, n_embd_v_gqa*i));
|
||||
|
||||
view_v_dst = ggml_view_2d(ctx0, v_l[il],
|
||||
n_embd_v_gqa, nm,
|
||||
ggml_row_size(v_l[il]->type, n_embd_v_gqa),
|
||||
ggml_row_size(v_l[il]->type, n_embd_v_gqa*id));
|
||||
} else {
|
||||
view_v_src = ggml_view_2d(ctx0, v_l[il],
|
||||
nm, n_embd_v_gqa,
|
||||
ggml_row_size(v_l[il]->type, size),
|
||||
ggml_row_size(v_l[il]->type, i));
|
||||
|
||||
view_v_dst = ggml_view_2d(ctx0, v_l[il],
|
||||
nm, n_embd_v_gqa,
|
||||
ggml_row_size(v_l[il]->type, size),
|
||||
ggml_row_size(v_l[il]->type, id));
|
||||
}
|
||||
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
|
||||
}
|
||||
|
||||
i += nm - 1;
|
||||
}
|
||||
|
||||
//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
|
||||
#endif
|
||||
}
|
||||
|
||||
void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
|
||||
std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
|
||||
uint32_t cell_count = 0;
|
||||
|
||||
@@ -733,16 +1036,16 @@ void llama_kv_cache::state_write(llama_io_write_i & io, const llama_hparams & hp
|
||||
io.write(&cell_count, sizeof(cell_count));
|
||||
|
||||
state_write_meta(io, cell_ranges, seq_id);
|
||||
state_write_data(io, cell_ranges, hparams);
|
||||
state_write_data(io, cell_ranges);
|
||||
}
|
||||
|
||||
void llama_kv_cache::state_read(llama_io_read_i & io, const llama_hparams & hparams, llama_seq_id seq_id) {
|
||||
void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
|
||||
uint32_t cell_count;
|
||||
io.read_to(&cell_count, sizeof(cell_count));
|
||||
|
||||
bool res = true;
|
||||
res = res && state_read_meta(io, cell_count, seq_id);
|
||||
res = res && state_read_data(io, hparams, cell_count);
|
||||
res = res && state_read_data(io, cell_count);
|
||||
|
||||
if (!res) {
|
||||
if (seq_id == -1) {
|
||||
@@ -773,7 +1076,7 @@ void llama_kv_cache::state_write_meta(llama_io_write_i & io, const std::vector<s
|
||||
}
|
||||
}
|
||||
|
||||
void llama_kv_cache::state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, const llama_hparams & hparams) const {
|
||||
void llama_kv_cache::state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const {
|
||||
const uint32_t v_trans = this->v_trans ? 1 : 0;
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
|
||||
@@ -955,7 +1258,7 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t cell_count,
|
||||
return true;
|
||||
}
|
||||
|
||||
bool llama_kv_cache::state_read_data(llama_io_read_i & io, const llama_hparams & hparams, uint32_t cell_count) {
|
||||
bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t cell_count) {
|
||||
uint32_t v_trans;
|
||||
uint32_t n_layer;
|
||||
io.read_to(&v_trans, sizeof(v_trans));
|
||||
|
||||
@@ -2,12 +2,12 @@
|
||||
|
||||
#include "llama.h"
|
||||
#include "llama-io.h"
|
||||
#include "llama-graph.h"
|
||||
|
||||
#include "ggml-cpp.h"
|
||||
|
||||
#include <set>
|
||||
#include <vector>
|
||||
#include <functional>
|
||||
|
||||
struct llama_cparams;
|
||||
struct llama_hparams;
|
||||
@@ -49,31 +49,13 @@ struct llama_kv_cache_slot_info {
|
||||
// TODO: pimpl
|
||||
// TODO: add notion of max sequences
|
||||
// TODO: add llama_hparams &
|
||||
struct llama_kv_cache {
|
||||
bool has_shift = false;
|
||||
bool do_defrag = false;
|
||||
bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
|
||||
bool v_trans = true; // the value tensor is transposed
|
||||
bool can_shift = false;
|
||||
|
||||
// Note: The value of head isn't only used to optimize searching
|
||||
// for a free KV slot. llama_decode_impl also uses it, so it
|
||||
// cannot be freely changed after a slot has been allocated.
|
||||
uint32_t head = 0;
|
||||
uint32_t size = 0;
|
||||
uint32_t used = 0; // used cells (i.e. at least one seq_id)
|
||||
|
||||
// computed before each graph build
|
||||
uint32_t n = 0;
|
||||
|
||||
std::vector<llama_kv_cell> cells;
|
||||
|
||||
std::vector<struct ggml_tensor *> k_l; // per layer
|
||||
std::vector<struct ggml_tensor *> v_l;
|
||||
struct llama_kv_cache : public llama_graph_kv_cache_i {
llama_kv_cache(const llama_hparams & hparams);
virtual ~llama_kv_cache() = default;

// TODO: become constructor
bool init(
const llama_model & model,
const llama_model & model, // TODO: do not reference the model
const llama_cparams & cparams,
ggml_type type_k,
ggml_type type_v,
@@ -115,8 +97,48 @@ struct llama_kv_cache {
size_t size_k_bytes() const;
size_t size_v_bytes() const;

void state_write(llama_io_write_i & io, const llama_hparams & hparams, llama_seq_id seq_id = -1) const;
void state_read (llama_io_read_i & io, const llama_hparams & hparams, llama_seq_id seq_id = -1);
// graph build API

virtual void build_shift(
ggml_context * ctx0,
ggml_cgraph * gf,
llama_graph_i * lgf) override;

virtual void build_defrag(
ggml_context * ctx0,
ggml_cgraph * gf,
int32_t max_nodes,
bool v_trans) override;

// state save/load

void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1);

// members

const llama_hparams & hparams;

bool has_shift = false;
|
||||
bool do_defrag = false;
|
||||
bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
|
||||
bool v_trans = true; // the value tensor is transposed
|
||||
bool can_shift = false;
|
||||
|
||||
// Note: The value of head isn't only used to optimize searching
|
||||
// for a free KV slot. llama_decode_impl also uses it, so it
|
||||
// cannot be freely changed after a slot has been allocated.
|
||||
uint32_t head = 0;
|
||||
uint32_t size = 0;
|
||||
uint32_t used = 0; // used cells (i.e. at least one seq_id)
|
||||
|
||||
// computed before each graph build
|
||||
uint32_t n = 0;
|
||||
|
||||
std::vector<llama_kv_cell> cells;
|
||||
|
||||
std::vector<struct ggml_tensor *> k_l; // per layer
|
||||
std::vector<struct ggml_tensor *> v_l;
|
||||
|
||||
private:
|
||||
ggml_type type_k = GGML_TYPE_F16;
|
||||
@@ -126,10 +148,10 @@ private:
|
||||
std::vector<ggml_backend_buffer_ptr> bufs;
|
||||
|
||||
void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
|
||||
void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, const llama_hparams & hparams) const;
|
||||
void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
|
||||
|
||||
bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
|
||||
bool state_read_data(llama_io_read_i & io, const llama_hparams & hparams, uint32_t cell_count);
|
||||
bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
|
||||
};
|
||||
|
||||
//
|
||||
|
||||
@@ -3579,8 +3579,8 @@ size_t llama_model::size() const {
|
||||
return pimpl->n_bytes;
|
||||
}
|
||||
|
||||
size_t llama_model::max_nodes() const {
|
||||
return std::max<size_t>(8192, tensors_by_name.size()*5);
|
||||
size_t llama_model::n_tensors() const {
|
||||
return tensors_by_name.size();
|
||||
}
|
||||
|
||||
size_t llama_model::n_devices() const {
|
||||
@@ -3900,6 +3900,38 @@ struct llm_build_context {
|
||||
return inpL;
|
||||
}
|
||||
|
||||
// TODO: tmp
|
||||
struct ggml_tensor * build_inp_pos() {
|
||||
ggml_tensor * cur = lgf->build_inp_pos(ctx0, n_tokens);
|
||||
cb(cur, "inp_pos", -1);
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
// TODO: tmp
|
||||
struct ggml_tensor * build_inp_out_ids() {
|
||||
ggml_tensor * cur = lgf->build_inp_out_ids(ctx0, n_tokens, worst_case);
|
||||
cb(cur, "inp_out_ids", -1);
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
// TODO: tmp
|
||||
struct ggml_tensor * build_inp_mean() {
|
||||
ggml_tensor * cur = lgf->build_inp_mean(ctx0, n_tokens);
|
||||
cb(cur, "inp_mean", -1);
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
// TODO: tmp
|
||||
struct ggml_tensor * build_inp_cls() {
|
||||
ggml_tensor * cur = lgf->build_inp_cls(ctx0, n_tokens);
|
||||
cb(cur, "inp_cls", -1);
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
// TODO: tmp
|
||||
struct ggml_tensor * build_lora_mm(
|
||||
struct ggml_tensor * w,
|
||||
@@ -3915,6 +3947,22 @@ struct llm_build_context {
|
||||
return lgf->build_lora_mm_id(ctx0, w, cur, ids);
|
||||
}
|
||||
|
||||
// TODO: tmp
|
||||
struct ggml_tensor * build_inp_embd_enc() {
|
||||
ggml_tensor * cur = lgf->build_inp_embd_enc(ctx0, n_tokens, worst_case);
|
||||
cb(cur, "embd_enc", -1);
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
// TODO: tmp
|
||||
struct ggml_tensor * build_inp_KQ_mask_cross() {
|
||||
ggml_tensor * cur = lgf->build_inp_KQ_mask_cross(ctx0, n_tokens, worst_case);
|
||||
cb(cur, "KQ_mask_cross", -1);
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
struct ggml_tensor * build_norm(
|
||||
struct ggml_tensor * cur,
|
||||
struct ggml_tensor * mw,
|
||||
@@ -4195,7 +4243,7 @@ struct llm_build_context {
|
||||
}
|
||||
|
||||
struct ggml_tensor * build_attn(
|
||||
struct ggml_cgraph * graph,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_tensor * wo,
|
||||
struct ggml_tensor * wo_b,
|
||||
struct ggml_tensor * k_cur,
|
||||
@@ -4206,17 +4254,17 @@ struct llm_build_context {
|
||||
int il) {
|
||||
// these nodes are added to the graph together so that they are not reordered
|
||||
// by doing so, the number of splits in the graph is reduced
|
||||
ggml_build_forward_expand(graph, q_cur);
|
||||
ggml_build_forward_expand(graph, k_cur);
|
||||
ggml_build_forward_expand(graph, v_cur);
|
||||
ggml_build_forward_expand(gf, q_cur);
|
||||
ggml_build_forward_expand(gf, k_cur);
|
||||
ggml_build_forward_expand(gf, v_cur);
|
||||
|
||||
//build_kv_store(graph, k_cur, v_cur, il);
|
||||
lgf->build_attn_kv_store(ctx0, graph, k_cur, v_cur, n_tokens, il, worst_case);
|
||||
//build_kv_store(gf, k_cur, v_cur, il);
|
||||
lgf->build_attn_kv_store(ctx0, gf, k_cur, v_cur, n_tokens, il, worst_case);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
|
||||
//cur = build_kqv(graph, wo, wo_b, q_cur, kq_mask, kq_scale, il);
|
||||
cur = lgf->build_attn_qkv(ctx0, graph, wo, wo_b, q_cur, n_tokens, kq_scale, il, worst_case);
|
||||
//cur = build_kqv(gf, wo, wo_b, q_cur, kq_mask, kq_scale, il);
|
||||
cur = lgf->build_attn_qkv(ctx0, gf, wo, wo_b, q_cur, n_tokens, kq_scale, il, worst_case);
|
||||
cb(cur, "kqv_out", il);
|
||||
|
||||
return cur;
|
||||
@@ -4251,34 +4299,6 @@ struct llm_build_context {
|
||||
return cur;
|
||||
}
|
||||
|
||||
struct ggml_tensor * build_inp_pos() {
|
||||
ggml_tensor * cur = lgf->build_inp_pos(ctx0, n_tokens);
|
||||
cb(cur, "inp_pos", -1);
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
struct ggml_tensor * build_inp_out_ids() {
|
||||
ggml_tensor * cur = lgf->build_inp_out_ids(ctx0, n_tokens, worst_case);
|
||||
cb(cur, "inp_out_ids", -1);
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
struct ggml_tensor * build_inp_mean() {
|
||||
ggml_tensor * cur = lgf->build_inp_mean(ctx0, n_tokens);
|
||||
cb(cur, "inp_mean", -1);
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
struct ggml_tensor * build_inp_cls() {
|
||||
ggml_tensor * cur = lgf->build_inp_cls(ctx0, n_tokens);
|
||||
cb(cur, "inp_cls", -1);
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
void append_pooling(struct ggml_cgraph * gf) {
|
||||
struct ggml_tensor * inp = res.t_embd;
|
||||
|
||||
@@ -4377,20 +4397,6 @@ struct llm_build_context {
|
||||
// return pos_bias;
|
||||
//}
|
||||
|
||||
struct ggml_tensor * build_inp_embd_enc() {
|
||||
ggml_tensor * cur = lgf->build_inp_embd_enc(ctx0, n_tokens, worst_case);
|
||||
cb(cur, "embd_enc", -1);
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
struct ggml_tensor * build_inp_KQ_mask_cross() {
|
||||
ggml_tensor * cur = lgf->build_inp_KQ_mask_cross(ctx0, n_tokens, worst_case);
|
||||
cb(cur, "KQ_mask_cross", -1);
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
void build_llama(ggml_cgraph * gf) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
|
||||
@@ -10936,16 +10942,13 @@ struct llm_build_context {
|
||||
|
||||
llama_graph_result llama_model::build_graph(
ggml_context * ctx,
ggml_cgraph * gf,
llama_graph_i * lgf,
const llama_cparams & cparams,
const llama_ubatch & ubatch,
bool worst_case) const {
struct llm_build_context llm(ctx, lgf, *this, cparams, ubatch, worst_case);

auto & gf = llm.res.gf;

gf = ggml_new_graph_custom(llm.ctx0, max_nodes(), false);

switch (arch) {
case LLM_ARCH_LLAMA:
case LLM_ARCH_MINICPM:

@@ -353,7 +353,7 @@ struct llama_model {
std::string desc() const;

size_t size() const;
size_t max_nodes() const;
size_t n_tensors() const;
size_t n_devices() const;

// total number of parameters in the model
@@ -371,6 +371,7 @@
// TODO: add encode/decode graphs
llama_graph_result build_graph(
ggml_context * ctx,
ggml_cgraph * gf,
llama_graph_i * lgf,
const llama_cparams & cparams,
const llama_ubatch & ubatch,