From 9cab53c7ddeb029c7aeb787cf9fa7ea1779ba4b4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 28 Feb 2025 18:01:25 +0200 Subject: [PATCH] cont : migrate the rest of the inputs out of llama_context ggml-ci --- src/llama-context.cpp | 920 ++++++++++++++++++++++++------------------ src/llama-context.h | 127 ++---- src/llama-graph.cpp | 86 ++-- src/llama-graph.h | 53 +-- src/llama-model.cpp | 36 +- 5 files changed, 645 insertions(+), 577 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 5ac28f9830..8587f480fd 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -71,6 +71,243 @@ void llama_graph_input_embd::set_input(const llama_ubatch * ubatch) { } } +class llama_graph_input_pos : public llama_graph_input_i { +public: + llama_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {} + virtual ~llama_graph_input_pos() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * pos = nullptr; // I32 [n_batch] + + const int64_t n_pos_per_token = 1; +}; + +void llama_graph_input_pos::set_input(const llama_ubatch * ubatch) { + if (ubatch->pos && pos) { + const int64_t n_tokens = ubatch->n_tokens; + + ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos)); + } +} + +class llama_graph_input_pos_bucket : public llama_graph_input_i { +public: + llama_graph_input_pos_bucket(const llama_hparams & hparams) : hparams(hparams) {} + virtual ~llama_graph_input_pos_bucket() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * pos_bucket; // I32 [n_batch, n_batch] + + const llama_hparams & hparams; +}; + +void llama_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) { + if (pos_bucket) { + const int64_t n_tokens = ubatch->n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(pos_bucket->buffer)); + GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing + + int32_t * data = (int32_t *) pos_bucket->data; + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + for (int i = 0; i < n_tokens; ++i) { + data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch->pos[i], ubatch->pos[j], hparams.n_rel_attn_bkts, true); + } + } + } + } +} + +class llama_graph_input_out_ids : public llama_graph_input_i { +public: + llama_graph_input_out_ids( + const llama_hparams & hparams, + const llama_cparams & cparams, + int32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {} + virtual ~llama_graph_input_out_ids() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * out_ids; // I32 [n_outputs] + + const llama_hparams & hparams; + const llama_cparams & cparams; + + const int32_t n_outputs; +}; + +void llama_graph_input_out_ids::set_input(const llama_ubatch * ubatch) { + if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { + //GGML_ASSERT(out_ids && "every model that can must skip unused outputs"); + + if (!out_ids) { + LLAMA_LOG_WARN("%s: 'out_ids' is not created\n", __func__); + } else { + const int64_t n_tokens = ubatch->n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(out_ids->buffer)); + int32_t * data = (int32_t *) out_ids->data; + + if (n_outputs == n_tokens) { + for (int i = 0; i < n_tokens; ++i) { + data[i] = i; + } + } else if (ubatch->output) { + int32_t n_outputs = 0; + for (int i = 0; i < n_tokens; ++i) { + if (ubatch->output[i]) { + data[n_outputs++] = i; + } + } + // the graph needs to 
have been passed the correct number of outputs
+                GGML_ASSERT(n_outputs == this->n_outputs);
+            } else if (n_outputs == 1) {
+                // only keep last output
+                data[0] = n_tokens - 1;
+            } else {
+                GGML_ASSERT(n_outputs == 0);
+            }
+        }
+    }
+}
+
+class llama_graph_input_mean : public llama_graph_input_i {
+public:
+    llama_graph_input_mean(const llama_cparams & cparams) : cparams(cparams) {}
+    virtual ~llama_graph_input_mean() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    ggml_tensor * mean; // F32 [n_batch, n_batch]
+
+    const llama_cparams & cparams;
+};
+
+void llama_graph_input_mean::set_input(const llama_ubatch * ubatch) {
+    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
+        const int64_t n_tokens     = ubatch->n_tokens;
+        const int64_t n_seq_tokens = ubatch->n_seq_tokens;
+        const int64_t n_seqs       = ubatch->n_seqs;
+
+        GGML_ASSERT(mean);
+        GGML_ASSERT(ggml_backend_buffer_is_host(mean->buffer));
+
+        float * data = (float *) mean->data;
+        memset(mean->data, 0, n_tokens * n_tokens * ggml_element_size(mean));
+
+        std::vector<uint64_t> sum(n_tokens, 0);
+
+        for (int s = 0; s < n_seqs; ++s) {
+            const llama_seq_id seq_id = ubatch->seq_id[s][0];
+
+            // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true
+            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
+
+            sum[seq_id] += ubatch->n_seq_tokens;
+        }
+
+        std::vector<float> div(n_tokens, 0.0f);
+        for (int i = 0; i < n_tokens; ++i) {
+            const uint64_t s = sum[i];
+            if (s > 0) {
+                div[i] = 1.0f/float(s);
+            }
+        }
+
+        for (int s = 0; s < n_seqs; ++s) {
+            const llama_seq_id seq_id = ubatch->seq_id[s][0];
+
+            for (int i = 0; i < n_seq_tokens; ++i) {
+                data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id];
+            }
+        }
+    }
+}
+
+class llama_graph_input_cls : public llama_graph_input_i {
+public:
+    llama_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {}
+    virtual ~llama_graph_input_cls() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    ggml_tensor * cls; // I32 [n_batch]
+
+    const llama_cparams & cparams;
+};
+
+void llama_graph_input_cls::set_input(const llama_ubatch * ubatch) {
+    if (cparams.embeddings && (
+                cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
+                cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) {
+        const int64_t n_tokens     = ubatch->n_tokens;
+        const int64_t n_seq_tokens = ubatch->n_seq_tokens;
+        const int64_t n_seqs       = ubatch->n_seqs;
+
+        GGML_ASSERT(cls);
+        GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer));
+
+        uint32_t * data = (uint32_t *) cls->data;
+        memset(cls->data, 0, n_tokens * ggml_element_size(cls));
+
+        for (int s = 0; s < n_seqs; ++s) {
+            const llama_seq_id seq_id = ubatch->seq_id[s][0];
+
+            // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true
+            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK");
+
+            for (int i = 0; i < n_seq_tokens; ++i) {
+                const llama_pos pos = ubatch->pos[s*n_seq_tokens + i];
+
+                if (pos == 0) {
+                    data[seq_id] = s*n_seq_tokens + i;
+                }
+            }
+        }
+    }
+
+    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
+        const int64_t n_tokens     = ubatch->n_tokens;
+        const int64_t n_seq_tokens = ubatch->n_seq_tokens;
+        const int64_t n_seqs       = ubatch->n_seqs;
+
+        GGML_ASSERT(cls);
+        GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer));
+
+        uint32_t * data = (uint32_t *) cls->data;
+        memset(cls->data, 0, n_tokens * ggml_element_size(cls));
+
+        std::vector<int> last_pos(n_tokens, -1);
+        std::vector<int> last_row(n_tokens, -1);
+
+
for (int s = 0; s < n_seqs; ++s) { + const llama_seq_id seq_id = ubatch->seq_id[s][0]; + + // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true + GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST"); + + for (int i = 0; i < n_seq_tokens; ++i) { + const llama_pos pos = ubatch->pos[s*n_seq_tokens + i]; + + if (pos >= last_pos[seq_id]) { + last_pos[seq_id] = pos; + last_row[seq_id] = s*n_seq_tokens + i; + } + } + } + + for (int i = 0; i < n_tokens; ++i) { + if (last_row[i] >= 0) { + data[i] = last_row[i]; + } + } + } +} + class llama_graph_input_attn_base : public llama_graph_input_attn_i { public: llama_graph_input_attn_base(const llama_hparams & hparams, const llama_cparams & cparams) : @@ -846,7 +1083,6 @@ int llama_context_base::encode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); res->set_inputs(&ubatch); - input_set(ubatch); // TODO: remove, tmp here, until all inputs are migrated outside the context const auto compute_status = graph_compute(gf, n_tokens > 1); switch (compute_status) { @@ -1003,7 +1239,6 @@ int llama_context_base::decode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); res->set_inputs(&ubatch); - input_set(ubatch); // TODO: remove const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); if (compute_status != GGML_STATUS_SUCCESS) { @@ -1132,178 +1367,6 @@ int64_t llama_context_base::n_pos_per_token() const { return model.arch == LLM_ARCH_QWEN2VL ? 4 : 1; } -void llama_context_base::input_set(const llama_ubatch & ubatch) { - const llama_hparams & hparams = model.hparams; - - if (ubatch.pos && inp.pos) { - const int64_t n_tokens = ubatch.n_tokens; - - ggml_backend_tensor_set(inp.pos, ubatch.pos, 0, n_tokens*n_pos_per_token()*ggml_element_size(inp.pos)); - } - - if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { - //GGML_ASSERT(inp.out_ids && "every model that can must skip unused outputs"); - - if (!inp.out_ids) { - LLAMA_LOG_WARN("%s: 'inp.out_ids' is not created\n", __func__); - } else { - const int64_t n_tokens = ubatch.n_tokens; - - GGML_ASSERT(ggml_backend_buffer_is_host(inp.out_ids->buffer)); - int32_t * data = (int32_t *) inp.out_ids->data; - - if (n_outputs == n_tokens) { - for (int i = 0; i < n_tokens; ++i) { - data[i] = i; - } - } else if (ubatch.output) { - int32_t n_outputs = 0; - for (int i = 0; i < n_tokens; ++i) { - if (ubatch.output[i]) { - data[n_outputs++] = i; - } - } - // the graph needs to have been passed the correct number of outputs - GGML_ASSERT(n_outputs == n_outputs); - } else if (n_outputs == 1) { - // only keep last output - data[0] = n_tokens - 1; - } else { - GGML_ASSERT(n_outputs == 0); - } - } - } - - if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - GGML_ASSERT(inp.mean); - GGML_ASSERT(ggml_backend_buffer_is_host(inp.mean->buffer)); - - float * data = (float *) inp.mean->data; - memset(inp.mean->data, 0, n_tokens * n_tokens * ggml_element_size(inp.mean)); - - std::vector sum(n_tokens, 0); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN"); - - sum[seq_id] += ubatch.n_seq_tokens; - } - - std::vector div(n_tokens, 0.0f); - 
for (int i = 0; i < n_tokens; ++i) { - const uint64_t s = sum[i]; - if (s > 0) { - div[i] = 1.0f/float(s); - } - } - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - for (int i = 0; i < n_seq_tokens; ++i) { - data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id]; - } - } - } - - if (cparams.embeddings && ( - cparams.pooling_type == LLAMA_POOLING_TYPE_CLS || - cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - GGML_ASSERT(inp.cls); - GGML_ASSERT(ggml_backend_buffer_is_host(inp.cls->buffer)); - - uint32_t * data = (uint32_t *) inp.cls->data; - memset(inp.cls->data, 0, n_tokens * ggml_element_size(inp.cls)); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK"); - - for (int i = 0; i < n_seq_tokens; ++i) { - const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; - - if (pos == 0) { - data[seq_id] = s*n_seq_tokens + i; - } - } - } - } - - if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) { - const int64_t n_tokens = ubatch.n_tokens; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_seqs = ubatch.n_seqs; - - GGML_ASSERT(inp.cls); - GGML_ASSERT(ggml_backend_buffer_is_host(inp.cls->buffer)); - - uint32_t * data = (uint32_t *) inp.cls->data; - memset(inp.cls->data, 0, n_tokens * ggml_element_size(inp.cls)); - - std::vector last_pos(n_tokens, -1); - std::vector last_row(n_tokens, -1); - - for (int s = 0; s < n_seqs; ++s) { - const llama_seq_id seq_id = ubatch.seq_id[s][0]; - - // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true - GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST"); - - for (int i = 0; i < n_seq_tokens; ++i) { - const llama_pos pos = ubatch.pos[s*n_seq_tokens + i]; - - if (pos >= last_pos[seq_id]) { - last_pos[seq_id] = pos; - last_row[seq_id] = s*n_seq_tokens + i; - } - } - } - - for (int i = 0; i < n_tokens; ++i) { - if (last_row[i] >= 0) { - data[i] = last_row[i]; - } - } - } - - if (inp.pos_bucket) { - const int64_t n_tokens = ubatch.n_tokens; - - GGML_ASSERT(ggml_backend_buffer_is_host(inp.pos_bucket->buffer)); - GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing - - int32_t * data = (int32_t *) inp.pos_bucket->data; - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - for (int i = 0; i < n_tokens; ++i) { - data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch.pos[i], ubatch.pos[j], hparams.n_rel_attn_bkts, true); - } - } - } - } - - GGML_ASSERT( - // (!a || b) is a logical implication (a -> b) - // !hparams.causal_attn -> !cparams.causal_attn - (hparams.causal_attn || !cparams.causal_attn) && - "causal attention is not supported by this model" - ); -} - // // output // @@ -1423,8 +1486,6 @@ int32_t llama_context_base::graph_max_nodes() const { } ggml_cgraph * llama_context_base::graph_init() { - inp = {}; - struct ggml_init_params params = { /*.mem_size =*/ buf_compute_meta.size(), /*.mem_buffer =*/ buf_compute_meta.data(), @@ -1478,7 +1539,7 @@ void llama_context_base::build_cb( ggml_tensor * cur, const char * name, const llama_ubatch & ubatch, - int il) { + int il) const { if (il >= 0) { 
ggml_format_name(cur, "%s-%d", name, il);
     } else {
@@ -1498,7 +1559,7 @@ void llama_context_base::build_cb(
     if (ubatch.n_tokens < 32 || full_offload) {
         if (il != -1 && strcmp(name, "norm") == 0) {
             const auto & dev_layer = model.dev_layer(il);
-            for (auto & backend : backends) {
+            for (const auto & backend : backends) {
                 if (ggml_backend_get_device(backend.get()) == dev_layer) {
                     if (ggml_backend_supports_op(backend.get(), cur)) {
                         ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend.get());
@@ -1512,14 +1573,14 @@ void llama_context_base::build_cb(
 ggml_tensor * llama_context_base::build_cvec(
         ggml_context * ctx0,
         ggml_tensor * cur,
-        int il) {
+        int il) const {
     return cvec.apply_to(ctx0, cur, il);
 }
 
 ggml_tensor * llama_context_base::build_lora_mm(
         ggml_context * ctx0,
         ggml_tensor * w,
-        ggml_tensor * cur) {
+        ggml_tensor * cur) const {
     struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
 
     for (const auto & lora : loras) {
@@ -1547,7 +1608,7 @@ ggml_tensor * llama_context_base::build_lora_mm_id(
         ggml_context * ctx0,
         ggml_tensor * w,
         ggml_tensor * cur,
-        ggml_tensor * ids) {
+        ggml_tensor * ids) const {
     struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids);
 
     for (const auto & lora : loras) {
         struct llama_adapter_lora_weight * lw = lora.first->get_weight(w);
@@ -1572,7 +1633,7 @@ ggml_tensor * llama_context_base::build_lora_mm_id(
     return res;
 }
 
-ggml_tensor * llama_context_base::build_rope_factors(int il) {
+ggml_tensor * llama_context_base::build_rope_factors(int il) const {
     const auto & hparams = model.hparams;
 
     // choose long/short freq factors based on the context size
@@ -1594,7 +1655,7 @@ ggml_tensor * llama_context_base::build_rope_shift(
         ggml_context * ctx0,
         ggml_tensor * cur,
         ggml_tensor * shift,
        ggml_tensor * factors,
-        ggml_backend_buffer * bbuf) {
+        ggml_backend_buffer * bbuf) const {
     const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
     const auto & freq_base  = cparams.rope_freq_base;
     const auto & freq_scale = cparams.rope_freq_scale;
@@ -1614,7 +1675,7 @@ ggml_tensor * llama_context_base::build_rope_shift(
             tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32);
 
             if (bbuf) {
-                for (auto & backend : backends) {
+                for (const auto & backend : backends) {
                     // Figure out which backend KV cache belongs to
                     if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) {
                         ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get());
@@ -1694,49 +1755,72 @@ ggml_tensor * llama_context_base::build_inp_embd(
 }
 
 ggml_tensor * llama_context_base::build_inp_pos(
-        ggml_context * ctx0,
-        int32_t n_tokens) {
-    inp.pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token());
-    ggml_set_input(inp.pos);
+        llama_graph_result * res,
+        ggml_context * ctx0,
+        int32_t n_tokens) const {
+    auto inp = std::make_shared<llama_graph_input_pos>(n_pos_per_token());
 
-    return inp.pos;
+    inp->pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token());
+    ggml_set_input(inp->pos);
+
+    res->add_input(inp);
+
+    return inp->pos;
 }
 
 ggml_tensor * llama_context_base::build_inp_pos_bucket(
-        ggml_context * ctx0,
-        int32_t n_tokens) {
-    inp.pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens);
-    ggml_set_input(inp.pos_bucket);
+        llama_graph_result * res,
+        ggml_context * ctx0,
+        int32_t n_tokens) const {
+    auto inp = std::make_shared<llama_graph_input_pos_bucket>(model.hparams);
 
-    return inp.pos_bucket;
+    inp->pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens);
+    ggml_set_input(inp->pos_bucket);
+
+    res->add_input(inp);
+
+    return inp->pos_bucket;
 }
 
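// [editor's note - not part of the patch] all build_inp_* helpers above and below
// follow the same shape: construct the matching llama_graph_input_* object, create
// its tensor in ctx0 and mark it with ggml_set_input(), register the object on the
// llama_graph_result via add_input(), and return the raw tensor to the model graph.
// The object is later fed by set_input(ubatch) once the scheduler has allocated the
// graph; this is what replaces the old centralized input_set() path.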
 ggml_tensor * llama_context_base::build_inp_out_ids(
-        ggml_context * ctx0) {
-    const int32_t n_out_ids = n_outputs;
+        llama_graph_result * res,
+        ggml_context * ctx0) const {
+    auto inp = std::make_shared<llama_graph_input_out_ids>(model.hparams, cparams, n_outputs);
 
-    inp.out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_out_ids);
-    ggml_set_input(inp.out_ids);
+    inp->out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
+    ggml_set_input(inp->out_ids);
 
-    return inp.out_ids;
+    res->add_input(inp);
+
+    return inp->out_ids;
 }
 
 ggml_tensor * llama_context_base::build_inp_mean(
-        ggml_context * ctx0,
-        int32_t n_tokens) {
-    inp.mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
-    ggml_set_input(inp.mean);
+        llama_graph_result * res,
+        ggml_context * ctx0,
+        int32_t n_tokens) const {
+    auto inp = std::make_shared<llama_graph_input_mean>(cparams);
 
-    return inp.mean;
+    inp->mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
+    ggml_set_input(inp->mean);
+
+    res->add_input(inp);
+
+    return inp->mean;
 }
 
 ggml_tensor * llama_context_base::build_inp_cls(
-        ggml_context * ctx0,
-        int32_t n_tokens) {
-    inp.cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-    ggml_set_input(inp.cls);
+        llama_graph_result * res,
+        ggml_context * ctx0,
+        int32_t n_tokens) const {
+    auto inp = std::make_shared<llama_graph_input_cls>(cparams);
 
-    return inp.cls;
+    inp->cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+    ggml_set_input(inp->cls);
+
+    res->add_input(inp);
+
+    return inp->cls;
 }
 
 llama_graph_input_attn_ptr llama_context_base::build_attn_inp(
@@ -1887,33 +1971,6 @@ ggml_tensor * llama_context_base::build_attn_mha(
     return cur;
 }
 
-ggml_tensor * llama_context_base::build_inp_self_k_shift(
-        ggml_context * ctx0) {
-    GGML_UNUSED(ctx0);
-
-    LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
-    return nullptr;
-}
-
-void llama_context_base::build_kv_self_shift(
-        ggml_context * ctx0,
-        ggml_cgraph * gf) {
-    GGML_UNUSED(ctx0);
-    GGML_UNUSED(gf);
-
-    LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
-}
-
-void llama_context_base::build_kv_self_defrag(
-        ggml_context * ctx0,
-        ggml_cgraph * gf) {
-    GGML_UNUSED(ctx0);
-    GGML_UNUSED(gf);
-
-    LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
-}
-
-
 //
 // perf
 //
@@ -2428,6 +2485,68 @@ size_t llama_context_base::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id) {
 // llama_context_kv_self
 //
 
+class llama_graph_input_pos_bucket_kv : public llama_graph_input_i {
+public:
+    llama_graph_input_pos_bucket_kv(
+            const llama_hparams & hparams,
+            const llama_kv_cache_unified * kv_self) : hparams(hparams), kv_self(kv_self) {}
+    virtual ~llama_graph_input_pos_bucket_kv() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    ggml_tensor * pos_bucket; // I32 [n_kv, n_batch]
+
+    const llama_hparams & hparams;
+    const llama_kv_cache_unified * kv_self;
+};
+
+void llama_graph_input_pos_bucket_kv::set_input(const llama_ubatch * ubatch) {
+    if (pos_bucket) {
+        const int64_t n_tokens = ubatch->n_tokens;
+
+        GGML_ASSERT(ggml_backend_buffer_is_host(pos_bucket->buffer));
+        GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing
+
+        int32_t * data = (int32_t *) pos_bucket->data;
+
+        const int64_t n_kv = kv_self->n;
+
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                for (int i = 0; i < n_kv; ++i) {
+                    data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(kv_self->cells[i].pos, ubatch->pos[j], hparams.n_rel_attn_bkts, false);
+                }
+            }
+        }
+    }
+}
+
+class llama_graph_input_k_shift : public llama_graph_input_i {
+public:
+    llama_graph_input_k_shift(const llama_kv_cache_unified * 
kv_self) : kv_self(kv_self) {} + virtual ~llama_graph_input_k_shift() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * k_shift; // I32 [kv_size] + + const llama_kv_cache_unified * kv_self; +}; + +void llama_graph_input_k_shift::set_input(const llama_ubatch * ubatch) { + GGML_UNUSED(ubatch); + + if (k_shift) { + assert(ggml_backend_buffer_is_host(k_shift->buffer)); + + int32_t * data = (int32_t *) k_shift->data; + + for (uint32_t i = 0; i < kv_self->size; ++i) { + data[i] = kv_self->cells[i].delta; + } + } +} + class llama_graph_input_attn_kv_self : public llama_graph_input_attn_i { public: llama_graph_input_attn_kv_self( @@ -2661,11 +2780,11 @@ void llama_context_kv_self::kv_self_update() { auto * gf = graph_init(); - build_kv_self_shift(ctx_compute.get(), gf); + auto res = graph_build_kv_self_shift(ctx_compute.get(), gf); ggml_backend_sched_alloc_graph(sched.get(), gf); - input_set({}); + res->set_inputs(nullptr); graph_compute(gf, false); @@ -2689,7 +2808,7 @@ void llama_context_kv_self::kv_self_update() { auto * gf = graph_init(); - build_kv_self_defrag(ctx_compute.get(), gf); + graph_build_kv_self_defrag(ctx_compute.get(), gf); ggml_backend_sched_alloc_graph(sched.get(), gf); @@ -2792,7 +2911,6 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); res->set_inputs(&ubatch); - input_set(ubatch); // TODO: remove const auto compute_status = graph_compute(gf, n_tokens > 1); switch (compute_status) { @@ -3031,7 +3149,6 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); res->set_inputs(&ubatch); - input_set(ubatch); // TODO: remove const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); if (compute_status != GGML_STATUS_SUCCESS) { @@ -3190,66 +3307,24 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { return 0; } -void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { - const llama_hparams & hparams = model.hparams; - - if (inp.self_k_shift) { - assert(ggml_backend_buffer_is_host(inp.self_k_shift->buffer)); - - int32_t * data = (int32_t *) inp.self_k_shift->data; - - for (uint32_t i = 0; i < kv_self->size; ++i) { - data[i] = kv_self->cells[i].delta; - } - - // the K-shift graph requires just this input - return; - } - - // call base functionality - llama_context_base::input_set(ubatch); - - if (inp.self_pos_bucket) { - const int64_t n_tokens = ubatch.n_tokens; - - GGML_ASSERT(ggml_backend_buffer_is_host(inp.self_pos_bucket->buffer)); - GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing - - int32_t * data = (int32_t *) inp.self_pos_bucket->data; - - const int64_t n_kv = kv_self->n; - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - for (int i = 0; i < n_kv; ++i) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(kv_self->cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, false); - } - } - } - } -} - ggml_cgraph * llama_context_kv_self::graph_init() { - inp = {}; - return llama_context_base::graph_init(); } -ggml_tensor * llama_context_kv_self::build_inp_self_k_shift(ggml_context * ctx0) { - inp.self_k_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx()); - ggml_set_input(inp.self_k_shift); - - return inp.self_k_shift; -} - ggml_tensor * llama_context_kv_self::build_inp_pos_bucket( - ggml_context * ctx0, - int32_t n_tokens) { + llama_graph_result * res, + ggml_context * ctx0, + int32_t n_tokens) const { + auto inp = 
std::make_shared<llama_graph_input_pos_bucket_kv>(model.hparams, kv_self.get());
+
     const auto n_kv = kv_self->n;
 
-    inp.self_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens);
-    ggml_set_input(inp.self_pos_bucket);
+    inp->pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens);
+    ggml_set_input(inp->pos_bucket);
 
-    return inp.self_pos_bucket;
+    res->add_input(inp);
+
+    return inp->pos_bucket;
 }
 
 llama_graph_input_attn_ptr llama_context_kv_self::build_attn_inp(
@@ -3404,9 +3479,11 @@ ggml_tensor * llama_context_kv_self::build_attn(
     return cur;
 }
 
-void llama_context_kv_self::build_kv_self_shift(
+llama_graph_result_ptr llama_context_kv_self::graph_build_kv_self_shift(
         ggml_context * ctx0,
-        ggml_cgraph * gf) {
+        ggml_cgraph * gf) const {
+    auto res = std::make_unique<llama_graph_result>();
+
     const auto & hparams = model.hparams;
 
     const auto & n_layer = hparams.n_layer;
@@ -3416,7 +3493,12 @@
 
     //GGML_ASSERT(kv_self->size == n_ctx);
 
-    ggml_tensor * inp_self_k_shift = build_inp_self_k_shift(ctx0);
+    auto inp = std::make_shared<llama_graph_input_k_shift>(kv_self.get());
+
+    inp->k_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx());
+    ggml_set_input(inp->k_shift);
+
+    res->add_input(inp);
 
     for (uint32_t il = 0; il < n_layer; ++il) {
         const int64_t n_head_kv = hparams.n_head_kv(il);
@@ -3431,15 +3513,17 @@
                 ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
                 0);
 
-        ggml_tensor * cur = build_rope_shift(ctx0, k, inp_self_k_shift, rope_factors, kv_self->k_l[il]->buffer);
+        ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, kv_self->k_l[il]->buffer);
 
         ggml_build_forward_expand(gf, cur);
     }
+
+    return res;
 }
 
-void llama_context_kv_self::build_kv_self_defrag(
+llama_graph_result_ptr llama_context_kv_self::graph_build_kv_self_defrag(
         ggml_context * ctx0,
-        ggml_cgraph * gf) {
+        ggml_cgraph * gf) const {
     const auto & hparams = model.hparams;
 
     const uint32_t n_layer = hparams.n_layer;
@@ -3454,7 +3538,7 @@
     // number of cells moved
     uint32_t n_moves = 0;
 
-    // each move requires 6*n_layer tensors (see build_kv_self_defrag)
+    // each move requires 6*n_layer tensors (see graph_build_kv_self_defrag)
     //  - source view, destination view, copy operation
     //  - x2 for keys and values
     //const uint32_t max_moves = max_nodes()/(6*n_layer);
@@ -3565,7 +3649,7 @@
     }
 
     if (n_moves == 0) {
-        return;
+        return nullptr;
     }
 
     //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
@@ -3705,6 +3789,8 @@
 
     //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
 #endif
+
+    return nullptr;
 }
 
 // state save/load
@@ -3747,6 +3833,89 @@ size_t llama_context_kv_self::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id) {
 // llama_context_recurrent
 //
 
+class llama_graph_input_s_copy : public llama_graph_input_i {
+public:
+    llama_graph_input_s_copy(llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
+    virtual ~llama_graph_input_s_copy() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    ggml_tensor * s_copy; // I32 [kv_size]
+
+    llama_kv_cache_recurrent * kv_self;
+};
+
+void llama_graph_input_s_copy::set_input(const llama_ubatch * ubatch) {
+    GGML_UNUSED(ubatch);
+
+    const int64_t n_kv = kv_self->n;
+
+    if (s_copy) {
+        GGML_ASSERT(ggml_backend_buffer_is_host(s_copy->buffer));
+        int32_t * data = (int32_t *) s_copy->data;
+
+        // assuming copy destinations ALWAYS happen ONLY on the cells between 
head and head+n + for (uint32_t i = 0; i < n_kv; ++i) { + const uint32_t cell_id = i + kv_self->head; + llama_kv_cell & kv_cell = kv_self->cells[cell_id]; + + ////////////////////////////////////////////// + // TODO: this should not mutate the KV cache ! + + // prevent out-of-bound sources + if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self->size) { + kv_cell.src = cell_id; + } + + data[i] = kv_cell.src; + + // TODO: do not mutate the KV cache + // ensure copy only happens once + if (kv_cell.src != (int32_t) cell_id) { + kv_cell.src = cell_id; + } + } + } +} + +class llama_graph_input_s_mask : public llama_graph_input_i { +public: + llama_graph_input_s_mask(llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {} + virtual ~llama_graph_input_s_mask() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * s_mask; // F32 [1, n_kv] + + llama_kv_cache_recurrent * kv_self; +}; + +void llama_graph_input_s_mask::set_input(const llama_ubatch * ubatch) { + GGML_UNUSED(ubatch); + + const int64_t n_kv = kv_self->n; + + if (s_mask) { + GGML_ASSERT(ggml_backend_buffer_is_host(s_mask->buffer)); + float * data = (float *) s_mask->data; + + // clear unused states + for (int i = 0; i < n_kv; ++i) { + const uint32_t cell_id = i + kv_self->head; + llama_kv_cell & kv_cell = kv_self->cells[cell_id]; + + data[i] = (float) (kv_cell.src >= 0); + + ////////////////////////////////////////////// + // TODO: this should not mutate the KV cache ! + // only clear once + if (kv_cell.src < 0) { + kv_cell.src = cell_id; + } + } + } +} + llama_context_recurrent::llama_context_recurrent( const llama_model & model, llama_context_params params, @@ -3985,7 +4154,6 @@ int llama_context_recurrent::decode(llama_batch & inp_batch) { ggml_backend_sched_alloc_graph(sched.get(), gf); res->set_inputs(&ubatch); - input_set(ubatch); // TODO: remove const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1); if (compute_status != GGML_STATUS_SUCCESS) { @@ -4130,85 +4298,40 @@ int llama_context_recurrent::decode(llama_batch & inp_batch) { return 0; } -void llama_context_recurrent::input_set(const llama_ubatch & ubatch) { - // call base functionality - llama_context_base::input_set(ubatch); - - GGML_ASSERT(kv_self->recurrent); - - const int64_t n_kv = kv_self->n; - - if (inp.s_mask) { - GGML_ASSERT(ggml_backend_buffer_is_host(inp.s_mask->buffer)); - float * data = (float *) inp.s_mask->data; - - // clear unused states - for (int i = 0; i < n_kv; ++i) { - const uint32_t cell_id = i + kv_self->head; - llama_kv_cell & kv_cell = kv_self->cells[cell_id]; - - data[i] = (float) (kv_cell.src >= 0); - - // TODO: do not mutate the KV cache - // only clear once - if (kv_cell.src < 0) { - kv_cell.src = cell_id; - } - } - } - - if (inp.s_copy) { - GGML_ASSERT(ggml_backend_buffer_is_host(inp.s_copy->buffer)); - int32_t * data = (int32_t *) inp.s_copy->data; - - // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n - for (uint32_t i = 0; i < n_kv; ++i) { - const uint32_t cell_id = i + kv_self->head; - llama_kv_cell & kv_cell = kv_self->cells[cell_id]; - - // prevent out-of-bound sources - if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self->size) { - kv_cell.src = cell_id; - } - - data[i] = kv_cell.src; - - // TODO: do not mutate the KV cache - // ensure copy only happens once - if (kv_cell.src != (int32_t) cell_id) { - kv_cell.src = cell_id; - } - } - } -} - ggml_cgraph * llama_context_recurrent::graph_init() { - inp.s_copy = nullptr; - inp.s_mask = nullptr; - return 
llama_context_base::graph_init();
 }
 
 ggml_tensor * llama_context_recurrent::build_inp_s_copy(
-        ggml_context * ctx0) {
+        llama_graph_result * res,
+        ggml_context * ctx0) const {
+    auto inp = std::make_shared<llama_graph_input_s_copy>(kv_self.get());
+
     const auto n_kv = kv_self->n;
 
-    inp.s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv);
+    inp->s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv);
     //cb(inp.s_copy, "inp_s_copy", -1);
-    ggml_set_input(inp.s_copy);
+    ggml_set_input(inp->s_copy);
 
-    return inp.s_copy;
+    res->add_input(inp);
+
+    return inp->s_copy;
 }
 
 ggml_tensor * llama_context_recurrent::build_inp_s_mask(
-        ggml_context * ctx0) {
+        llama_graph_result * res,
+        ggml_context * ctx0) const {
+    auto inp = std::make_shared<llama_graph_input_s_mask>(kv_self.get());
+
     const auto n_kv = kv_self->n;
 
-    inp.s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv);
-    //cb(inp.s_mask, "inp_s_mask", -1);
-    ggml_set_input(inp.s_mask);
+    inp->s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv);
+    //cb(inp->s_mask, "inp_s_mask", -1);
+    ggml_set_input(inp->s_mask);
 
-    return inp.s_mask;
+    res->add_input(inp);
+
+    return inp->s_mask;
 }
 
 ggml_tensor * llama_context_recurrent::build_copy_mask_state(
@@ -4218,7 +4341,7 @@
         ggml_tensor * state_copy,
         ggml_tensor * state_mask,
         int32_t n_state,
-        int32_t n_seqs) {
+        int32_t n_seqs) const {
     const auto n_kv = kv_self->n;
 
     const auto kv_head = kv_self->head;
@@ -4251,7 +4374,7 @@ ggml_tensor * llama_context_recurrent::build_mamba_layer(
         ggml_tensor * state_copy,
         ggml_tensor * state_mask,
         const llama_ubatch & ubatch,
-        int il) {
+        int il) const {
     const auto & hparams = model.hparams;
 
     const auto kv_head = kv_self->head;
@@ -4383,7 +4506,7 @@ ggml_tensor * llama_context_recurrent::build_rwkv_token_shift_load(
         ggml_tensor * state_copy,
         ggml_tensor * state_mask,
         const llama_ubatch & ubatch,
-        int il) {
+        int il) const {
     const auto & hparams = model.hparams;
 
     const auto token_shift_count = hparams.token_shift_count;
@@ -4405,7 +4528,7 @@ ggml_tensor * llama_context_recurrent::build_rwkv_token_shift_store(
         ggml_context * ctx0,
         ggml_tensor * token_shift,
         const llama_ubatch & ubatch,
-        int il) {
+        int il) const {
     const auto & hparams = model.hparams;
 
     const auto token_shift_count = hparams.token_shift_count;
@@ -4430,7 +4553,7 @@ ggml_tensor * llama_context_recurrent::build_rwkv6_time_mix(
         ggml_tensor * state_copy,
        ggml_tensor * state_mask,
         const llama_ubatch & ubatch,
-        int il) {
+        int il) const {
     const auto & hparams = model.hparams;
 
     const auto n_tokens = ubatch.n_tokens;
@@ -4693,7 +4816,6 @@ int llama_context_enc::encode(llama_batch & inp_batch) {
     ggml_backend_sched_alloc_graph(sched.get(), gf);
 
     res->set_inputs(&ubatch);
-    input_set(ubatch); // TODO: remove
 
     const auto compute_status = graph_compute(gf, n_tokens > 1);
     switch (compute_status) {
@@ -4782,6 +4904,29 @@ int llama_context_enc::encode(llama_batch & inp_batch) {
 // llama_context_dec
 //
 
+class llama_graph_input_cross_embd : public llama_graph_input_i {
+public:
+    llama_graph_input_cross_embd(
+            const llama_cross * cross) : cross(cross) {}
+    virtual ~llama_graph_input_cross_embd() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    ggml_tensor * cross_embd; // F32 [n_embd, n_outputs_enc]
+
+    const llama_cross * cross;
+};
+
+void llama_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
+    GGML_UNUSED(ubatch);
+
+    if (cross_embd && cross->t_embd) {
+        assert(cross_embd->type == GGML_TYPE_F32);
+
+        ggml_backend_tensor_set(cross_embd, cross->v_embd, 0, ggml_nbytes(cross_embd));
+    }
+}
+
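// [editor's note - not part of the patch] unlike the batch-driven inputs above,
// llama_graph_input_cross_embd::set_input() ignores the ubatch: its data comes from
// the llama_cross state produced by the encoder (cross->v_embd holds the F32
// [n_embd, n_outputs_enc] output embeddings), so it is copied wholesale into the
// graph tensor once the graph is allocated.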
 class llama_graph_input_attn_dec : public llama_graph_input_attn_i {
 public:
     llama_graph_input_attn_dec(
@@ -4841,32 +4986,21 @@ void llama_context_dec::reserve() {
     llama_context_kv_self::reserve();
 }
 
-void llama_context_dec::input_set(const llama_ubatch & ubatch) {
-    // call base functionality
-    llama_context_kv_self::input_set(ubatch);
-
-    if (inp.cross_embd && cross->t_embd) {
-        assert(inp.cross_embd->type == GGML_TYPE_F32);
-
-        ggml_backend_tensor_set(inp.cross_embd, cross->v_embd, 0, ggml_nbytes(inp.cross_embd));
-    }
-
-}
-
 ggml_cgraph * llama_context_dec::graph_init() {
-    inp = {};
-
     return llama_context_kv_self::graph_init();
 }
 
 ggml_tensor * llama_context_dec::build_inp_cross_embd(
-        ggml_context * ctx0) {
+        llama_graph_result * res,
+        ggml_context * ctx0) const {
+    auto inp = std::make_shared<llama_graph_input_cross_embd>(cross);
+
     // if we have the output embeddings from the encoder, use them directly
     // TODO: needs more work to be correct, for now just use the tensor shape
     //if (cross->t_embd) {
-    //    inp.cross_embd = ggml_view_tensor(ctx0, cross->t_embd);
+    //    inp->cross_embd = ggml_view_tensor(ctx0, cross->t_embd);
 
-    //    return inp.cross_embd;
+    //    return inp->cross_embd;
     //}
 
     const auto & hparams = model.hparams;
@@ -4874,10 +5008,12 @@ ggml_tensor * llama_context_dec::build_inp_cross_embd(
     const auto n_embd = cross->t_embd ? cross->t_embd->ne[0] : hparams.n_embd;
     const auto n_enc  = cross->t_embd ? cross->t_embd->ne[1] : hparams.n_ctx_train;
 
-    inp.cross_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc);
-    ggml_set_input(inp.cross_embd);
+    inp->cross_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc);
+    ggml_set_input(inp->cross_embd);
 
-    return inp.cross_embd;
+    res->add_input(inp);
+
+    return inp->cross_embd;
 }
 
 llama_graph_input_attn_ptr llama_context_dec::build_attn_inp(
diff --git a/src/llama-context.h b/src/llama-context.h
index 0f248537ed..21015e8796 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -248,24 +248,6 @@ protected:
     virtual int64_t n_pos_per_token() const; // vision
 
-    // when the compute graph is built, it creates the input tensors that it needs
-    // the contents of the input tensors are set by the input_set() function
-
-    // TODO: remove, replace by llama_graph_input_i->set_input()
-    virtual void input_set(const llama_ubatch & ubatch);
-
-private:
-    // TODO: remove, implement as llama_graph_input_xxx
-    struct {
-        // base input tensors
-        ggml_tensor * pos;        // I32 [n_batch]
-        ggml_tensor * pos_bucket; // I32 [n_batch, n_batch]
-        ggml_tensor * out_ids;    // I32 [n_outputs]
-        ggml_tensor * mean;       // F32 [n_batch, n_batch]
-        ggml_tensor * cls;        // I32 [n_batch]
-    } inp;
-
-protected:
     //
     // output
     //
@@ -309,35 +291,35 @@ public:
             ggml_tensor * cur,
             const char * name,
             const llama_ubatch & ubatch,
-            int il) override;
+            int il) const override;
 
     // apply control vector for layer il
     ggml_tensor * build_cvec(
             ggml_context * ctx0,
             ggml_tensor * cur,
-            int il) override;
+            int il) const override;
 
     // do mat_mul, while optionally apply lora
     ggml_tensor * build_lora_mm(
             ggml_context * ctx0,
             ggml_tensor * w,
-            ggml_tensor * cur) override;
+            ggml_tensor * cur) const override;
 
     // do mat_mul_id, while optionally apply lora
     ggml_tensor * build_lora_mm_id(
             ggml_context * ctx0,
             ggml_tensor * w,   // struct ggml_tensor * as
             ggml_tensor * cur, // struct ggml_tensor * b
-            ggml_tensor * ids) override;
+            ggml_tensor * ids) const override;
 
-    ggml_tensor * build_rope_factors(int il) override;
+    ggml_tensor * build_rope_factors(int il) const override;
 
     ggml_tensor 
* build_rope_shift( ggml_context * ctx0, ggml_tensor * cur, ggml_tensor * shift, ggml_tensor * factors, - ggml_backend_buffer * bbuf) override; + ggml_backend_buffer * bbuf) const override; ggml_tensor * build_inp_embd( llama_graph_result * res, @@ -346,23 +328,28 @@ public: const llama_ubatch & ubatch) const override; ggml_tensor * build_inp_pos( + llama_graph_result * res, ggml_context * ctx0, - int32_t n_tokens) override; + int32_t n_tokens) const override; ggml_tensor * build_inp_pos_bucket( + llama_graph_result * res, ggml_context * ctx0, - int32_t n_tokens) override; + int32_t n_tokens) const override; ggml_tensor * build_inp_out_ids( - ggml_context * ctx0) override; + llama_graph_result * res, + ggml_context * ctx0) const override; ggml_tensor * build_inp_mean( + llama_graph_result * res, ggml_context * ctx0, - int32_t n_tokens) override; + int32_t n_tokens) const override; ggml_tensor * build_inp_cls( + llama_graph_result * res, ggml_context * ctx0, - int32_t n_tokens) override; + int32_t n_tokens) const override; llama_graph_input_attn_ptr build_attn_inp( llama_graph_result * res, @@ -394,18 +381,6 @@ protected: bool v_trans, float kq_scale) const; - virtual ggml_tensor * build_inp_self_k_shift( - ggml_context * ctx0); - - virtual void build_kv_self_shift( - ggml_context * ctx0, - ggml_cgraph * gf); - - // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache - virtual void build_kv_self_defrag( - ggml_context * ctx0, - ggml_cgraph * gf); - public: // // perf @@ -552,19 +527,6 @@ public: int encode(llama_batch & inp_batch) override; int decode(llama_batch & inp_batch) override; -protected: - // - // input - // - - void input_set(const llama_ubatch & ubatch) override; - -private: - struct { - ggml_tensor * self_pos_bucket; // I32 [n_kv, n_batch] - ggml_tensor * self_k_shift; // I32 [kv_size] - } inp; - protected: // // graph @@ -578,8 +540,9 @@ public: // ggml_tensor * build_inp_pos_bucket( + llama_graph_result * res, ggml_context * ctx0, - int32_t n_tokens) override; + int32_t n_tokens) const override; llama_graph_input_attn_ptr build_attn_inp( llama_graph_result * res, @@ -600,16 +563,14 @@ public: int il) const override; protected: - ggml_tensor * build_inp_self_k_shift(ggml_context * ctx0) override; - - void build_kv_self_shift( + llama_graph_result_ptr graph_build_kv_self_shift( ggml_context * ctx0, - ggml_cgraph * gf) override; + ggml_cgraph * gf) const; // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache - void build_kv_self_defrag( + llama_graph_result_ptr graph_build_kv_self_defrag( ggml_context * ctx0, - ggml_cgraph * gf) override; + ggml_cgraph * gf) const; // // state save/load @@ -651,19 +612,6 @@ public: int encode(llama_batch & inp_batch) override; int decode(llama_batch & inp_batch) override; -protected: - // - // input - // - - void input_set(const llama_ubatch & ubatch) override; - -private: - struct { - ggml_tensor * s_copy; // I32 [kv_size] - ggml_tensor * s_mask; // F32 [1, n_kv] - } inp; - protected: // // graph @@ -677,10 +625,12 @@ public: // ggml_tensor * build_inp_s_copy( - ggml_context * ctx0) override; + llama_graph_result * res, + ggml_context * ctx0) const override; ggml_tensor * build_inp_s_mask( - ggml_context * ctx0) override; + llama_graph_result * res, + ggml_context * ctx0) const override; ggml_tensor * build_copy_mask_state( ggml_context * ctx0, @@ -689,7 +639,7 @@ public: ggml_tensor * state_copy, ggml_tensor * state_mask, int32_t n_state, - 
int32_t n_seqs) override; + int32_t n_seqs) const override; ggml_tensor * build_mamba_layer( ggml_context * ctx0, @@ -698,7 +648,7 @@ public: ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il) override; + int il) const override; ggml_tensor * build_rwkv_token_shift_load( ggml_context * ctx0, @@ -706,13 +656,13 @@ public: ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il) override; + int il) const override; ggml_tensor * build_rwkv_token_shift_store( ggml_context * ctx0, ggml_tensor * token_shift, const llama_ubatch & ubatch, - int il) override; + int il) const override; ggml_tensor * build_rwkv6_time_mix( ggml_context * ctx0, @@ -722,7 +672,7 @@ public: ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il) override; + int il) const override; protected: // @@ -774,18 +724,6 @@ public: protected: void reserve() override; - // - // input - // - - void input_set(const llama_ubatch & ubatch) override; - -private: - struct { - ggml_tensor * cross_embd; // F32 [n_embd, n_outputs_enc] - } inp; - -protected: // // graph // @@ -793,7 +731,8 @@ protected: ggml_cgraph * graph_init() override; ggml_tensor * build_inp_cross_embd( - ggml_context * ctx0) override; + llama_graph_result * res, + ggml_context * ctx0) const override; llama_graph_input_attn_ptr build_attn_inp( llama_graph_result * res, diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 549a42c53b..79b26d1734 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -68,25 +68,19 @@ ggml_tensor * llama_graph_i::build_attn_cross( } ggml_tensor * llama_graph_i::build_inp_cross_embd( - ggml_context * ctx0) { + llama_graph_result * res, + ggml_context * ctx0) const { + GGML_UNUSED(res); GGML_UNUSED(ctx0); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); return nullptr; } -ggml_tensor * llama_graph_i::build_inp_cross_kq_mask( - ggml_context * ctx0, - int32_t n_tokens) { - GGML_UNUSED(ctx0); - GGML_UNUSED(n_tokens); - - LLAMA_LOG_ERROR("%s: not implemented\n", __func__); - return nullptr; -} - ggml_tensor * llama_graph_i::build_inp_s_copy ( - ggml_context * ctx0) { + llama_graph_result * res, + ggml_context * ctx0) const { + GGML_UNUSED(res); GGML_UNUSED(ctx0); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); @@ -95,7 +89,9 @@ ggml_tensor * llama_graph_i::build_inp_s_copy ( } ggml_tensor * llama_graph_i::build_inp_s_mask( - ggml_context * ctx0) { + llama_graph_result * res, + ggml_context * ctx0) const { + GGML_UNUSED(res); GGML_UNUSED(ctx0); LLAMA_LOG_ERROR("%s: not implemented\n", __func__); @@ -104,13 +100,13 @@ ggml_tensor * llama_graph_i::build_inp_s_mask( } ggml_tensor * llama_graph_i::build_copy_mask_state( - ggml_context * ctx0, - ggml_cgraph * gf, - ggml_tensor * s, - ggml_tensor * state_copy, - ggml_tensor * state_mask, - int32_t n_state, - int32_t n_seqs) { + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * s, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + int32_t n_state, + int32_t n_seqs) const { GGML_UNUSED(ctx0); GGML_UNUSED(gf); GGML_UNUSED(s); @@ -125,13 +121,13 @@ ggml_tensor * llama_graph_i::build_copy_mask_state( } ggml_tensor * llama_graph_i::build_mamba_layer( - ggml_context * ctx0, - ggml_cgraph * gf, - ggml_tensor * cur, - ggml_tensor * state_copy, - ggml_tensor * state_mask, - const llama_ubatch & ubatch, - int il) { + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * cur, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int 
il) const { GGML_UNUSED(ctx0); GGML_UNUSED(gf); GGML_UNUSED(cur); @@ -146,12 +142,12 @@ ggml_tensor * llama_graph_i::build_mamba_layer( } ggml_tensor * llama_graph_i::build_rwkv_token_shift_load( - ggml_context * ctx0, - ggml_cgraph * gf, - ggml_tensor * state_copy, - ggml_tensor * state_mask, - const llama_ubatch & ubatch, - int il) { + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il) const { GGML_UNUSED(ctx0); GGML_UNUSED(gf); GGML_UNUSED(state_copy); @@ -165,10 +161,10 @@ ggml_tensor * llama_graph_i::build_rwkv_token_shift_load( } ggml_tensor * llama_graph_i::build_rwkv_token_shift_store( - ggml_context * ctx0, - ggml_tensor * token_shift, - const llama_ubatch & ubatch, - int il) { + ggml_context * ctx0, + ggml_tensor * token_shift, + const llama_ubatch & ubatch, + int il) const { GGML_UNUSED(ctx0); GGML_UNUSED(token_shift); GGML_UNUSED(ubatch); @@ -180,14 +176,14 @@ ggml_tensor * llama_graph_i::build_rwkv_token_shift_store( } ggml_tensor * llama_graph_i::build_rwkv6_time_mix( - ggml_context * ctx0, - ggml_cgraph * gf, - ggml_tensor * cur, - ggml_tensor * x_prev, - ggml_tensor * state_copy, - ggml_tensor * state_mask, - const llama_ubatch & ubatch, - int il) { + ggml_context * ctx0, + ggml_cgraph * gf, + ggml_tensor * cur, + ggml_tensor * x_prev, + ggml_tensor * state_copy, + ggml_tensor * state_mask, + const llama_ubatch & ubatch, + int il) const { GGML_UNUSED(ctx0); GGML_UNUSED(gf); GGML_UNUSED(cur); diff --git a/src/llama-graph.h b/src/llama-graph.h index a6a9ef00ca..7ae99becc7 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -93,6 +93,7 @@ public: // // TODO: can become more granular in the future +// TODO: move all methods that do not require things from llama_context to llm_build_context class llama_graph_i { public: llama_graph_i(llama_graph_type type); @@ -109,28 +110,28 @@ public: ggml_tensor * cur, const char * name, const llama_ubatch & ubatch, - int il) = 0; + int il) const = 0; // apply control vector for layer il virtual ggml_tensor * build_cvec( ggml_context * ctx0, ggml_tensor * cur, - int il) = 0; + int il) const = 0; // do mat_mul, while optionally apply lora virtual ggml_tensor * build_lora_mm( ggml_context * ctx0, ggml_tensor * w, - ggml_tensor * cur) = 0; + ggml_tensor * cur) const = 0; // do mat_mul_id, while optionally apply lora virtual ggml_tensor * build_lora_mm_id( ggml_context * ctx0, ggml_tensor * w, // struct ggml_tensor * as ggml_tensor * cur, // struct ggml_tensor * b - ggml_tensor * ids) = 0; + ggml_tensor * ids) const = 0; - virtual ggml_tensor * build_rope_factors(int il) = 0; + virtual ggml_tensor * build_rope_factors(int il) const = 0; // note: optionally set the backend to be the same as the bbuf's backend virtual ggml_tensor * build_rope_shift( @@ -138,7 +139,7 @@ public: ggml_tensor * cur, ggml_tensor * shift, ggml_tensor * factors, - ggml_backend_buffer * bbuf) = 0; + ggml_backend_buffer * bbuf) const = 0; // graph build API (context-specific) @@ -146,26 +147,31 @@ public: llama_graph_result * res, ggml_context * ctx0, ggml_tensor * tok_embd, - const llama_ubatch & ubatch) const = 0; // note these methods will become const, i.e. 
they don't mutate the llama_context that implements them + const llama_ubatch & ubatch) const = 0; virtual ggml_tensor * build_inp_pos( + llama_graph_result * res, ggml_context * ctx0, - int32_t n_tokens) = 0; + int32_t n_tokens) const = 0; virtual ggml_tensor * build_inp_pos_bucket( + llama_graph_result * res, ggml_context * ctx0, - int32_t n_tokens) = 0; + int32_t n_tokens) const = 0; virtual ggml_tensor * build_inp_out_ids( - ggml_context * ctx0) = 0; + llama_graph_result * res, + ggml_context * ctx0) const = 0; virtual ggml_tensor * build_inp_mean( + llama_graph_result * res, ggml_context * ctx0, - int32_t n_tokens) = 0; + int32_t n_tokens) const = 0; virtual ggml_tensor * build_inp_cls( + llama_graph_result * res, ggml_context * ctx0, - int32_t n_tokens) = 0; + int32_t n_tokens) const = 0; virtual llama_graph_input_attn_ptr build_attn_inp( llama_graph_result * res, @@ -197,17 +203,16 @@ public: int il) const; virtual ggml_tensor * build_inp_cross_embd( - ggml_context * ctx0); - - virtual ggml_tensor * build_inp_cross_kq_mask( - ggml_context * ctx0, - int32_t n_tokens); + llama_graph_result * res, + ggml_context * ctx0) const; virtual ggml_tensor * build_inp_s_copy( - ggml_context * ctx0); + llama_graph_result * res, + ggml_context * ctx0) const; virtual ggml_tensor * build_inp_s_mask( - ggml_context * ctx0); + llama_graph_result * res, + ggml_context * ctx0) const; virtual ggml_tensor * build_copy_mask_state( ggml_context * ctx0, @@ -216,7 +221,7 @@ public: ggml_tensor * state_copy, ggml_tensor * state_mask, int32_t n_state, - int32_t n_seqs); + int32_t n_seqs) const; virtual ggml_tensor * build_mamba_layer( ggml_context * ctx0, @@ -225,7 +230,7 @@ public: ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il); + int il) const; virtual ggml_tensor * build_rwkv_token_shift_load( ggml_context * ctx0, @@ -233,13 +238,13 @@ public: ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il); + int il) const; virtual ggml_tensor * build_rwkv_token_shift_store( ggml_context * ctx0, ggml_tensor * token_shift, const llama_ubatch & ubatch, - int il); + int il) const; virtual ggml_tensor * build_rwkv6_time_mix( ggml_context * ctx0, @@ -249,5 +254,5 @@ public: ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, - int il); + int il) const; }; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index b6adbb1a1b..7fae82c6ec 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -3910,7 +3910,7 @@ struct llm_build_context { // TODO: tmp struct ggml_tensor * build_inp_pos() { - ggml_tensor * cur = lgf->build_inp_pos(ctx0, n_tokens); + ggml_tensor * cur = lgf->build_inp_pos(res.get(), ctx0, n_tokens); cb(cur, "inp_pos", -1); return cur; @@ -3918,7 +3918,7 @@ struct llm_build_context { // TODO: tmp struct ggml_tensor * build_inp_out_ids() { - ggml_tensor * cur = lgf->build_inp_out_ids(ctx0); + ggml_tensor * cur = lgf->build_inp_out_ids(res.get(), ctx0); cb(cur, "inp_out_ids", -1); return cur; @@ -3926,7 +3926,7 @@ struct llm_build_context { // TODO: tmp struct ggml_tensor * build_inp_mean() { - ggml_tensor * cur = lgf->build_inp_mean(ctx0, n_tokens); + ggml_tensor * cur = lgf->build_inp_mean(res.get(), ctx0, n_tokens); cb(cur, "inp_mean", -1); return cur; @@ -3934,7 +3934,7 @@ struct llm_build_context { // TODO: tmp struct ggml_tensor * build_inp_cls() { - ggml_tensor * cur = lgf->build_inp_cls(ctx0, n_tokens); + ggml_tensor * cur = lgf->build_inp_cls(res.get(), ctx0, n_tokens); cb(cur, "inp_cls", 
-1); return cur; @@ -3957,7 +3957,7 @@ struct llm_build_context { // TODO: tmp struct ggml_tensor * build_pos_bucket() { - ggml_tensor * cur = lgf->build_inp_pos_bucket(ctx0, n_tokens); + ggml_tensor * cur = lgf->build_inp_pos_bucket(res.get(), ctx0, n_tokens); cb(cur, "pos_bucket", -1); return cur; @@ -3965,20 +3965,12 @@ struct llm_build_context { // TODO: tmp struct ggml_tensor * build_inp_cross_embd() { - ggml_tensor * cur = lgf->build_inp_cross_embd(ctx0); + ggml_tensor * cur = lgf->build_inp_cross_embd(res.get(), ctx0); cb(cur, "embd_enc", -1); return cur; } - // TODO: tmp - struct ggml_tensor * build_inp_cross_kq_mask() { - ggml_tensor * cur = lgf->build_inp_cross_kq_mask(ctx0, n_tokens); - cb(cur, "KQ_mask_cross", -1); - - return cur; - } - struct ggml_tensor * build_norm( struct ggml_tensor * cur, struct ggml_tensor * mw, @@ -3986,8 +3978,8 @@ struct llm_build_context { llm_norm_type type, int il) { switch (type) { - case LLM_NORM: cur = ggml_norm (ctx0, cur, hparams.f_norm_eps); break; - case LLM_NORM_RMS: cur = ggml_rms_norm (ctx0, cur, hparams.f_norm_rms_eps); break; + case LLM_NORM: cur = ggml_norm (ctx0, cur, hparams.f_norm_eps); break; + case LLM_NORM_RMS: cur = ggml_rms_norm(ctx0, cur, hparams.f_norm_rms_eps); break; case LLM_NORM_GROUP: { cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], 1, cur->ne[1]); @@ -8070,8 +8062,8 @@ struct llm_build_context { // {n_embd, n_tokens} inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0); - struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0); + struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0); + struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0); for (int il = 0; il < n_layer; ++il) { // norm @@ -10443,8 +10435,8 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1); - struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0); - struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0); + struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0); + struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0); const auto n_embd = hparams.n_embd; const auto n_seq_tokens = ubatch.n_seq_tokens; @@ -10535,8 +10527,8 @@ struct llm_build_context { inpL = build_inp_embd(model.tok_embd); - struct ggml_tensor * state_copy = lgf->build_inp_s_copy(ctx0); - struct ggml_tensor * state_mask = lgf->build_inp_s_mask(ctx0); + struct ggml_tensor * state_copy = lgf->build_inp_s_copy(res.get(), ctx0); + struct ggml_tensor * state_mask = lgf->build_inp_s_mask(res.get(), ctx0); const auto n_embd = hparams.n_embd; const auto n_seq_tokens = ubatch.n_seq_tokens;
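Editor's note (not part of the patch): the pattern that replaces the old llama_context_base::input_set() is compact enough to sketch in full. The snippet below is illustrative only; the class layout is an assumption reconstructed from the add_input()/set_inputs() call sites visible in the diff, not a copy of the real llama-graph.h.

    #include <memory>
    #include <vector>

    struct llama_ubatch; // defined elsewhere in llama.cpp

    // interface implemented by every llama_graph_input_* class in the patch
    class llama_graph_input_i {
    public:
        virtual ~llama_graph_input_i() = default;
        virtual void set_input(const llama_ubatch * ubatch) = 0;
    };

    // sketch of the aggregation side: the graph result owns the inputs that
    // the build_inp_* helpers register while the graph is being built
    class llama_graph_result {
    public:
        void add_input(std::shared_ptr<llama_graph_input_i> input) {
            inputs.push_back(std::move(input));
        }

        // called once after ggml_backend_sched_alloc_graph(): every registered
        // input copies its data into its backend tensor; ubatch may be nullptr
        // for graphs whose inputs read from the KV cache instead (e.g. the
        // K-shift graph, whose set_input ignores the ubatch)
        void set_inputs(const llama_ubatch * ubatch) {
            for (auto & input : inputs) {
                input->set_input(ubatch);
            }
        }

        std::vector<std::shared_ptr<llama_graph_input_i>> inputs;
    };

With this in place the per-context input_set() overrides disappear: decode()/encode() call res->set_inputs(&ubatch) after allocating the graph, and kv_self_update() calls res->set_inputs(nullptr) for the K-shift graph, exactly as the hunks above show.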