kv-cache : drop the "unified" prefix (#15467)

* kv-cache : drop the "unified" prefix

ggml-ci

* cont : fix comment [no ci]
This commit is contained in:
Georgi Gerganov
2025-08-21 17:00:33 +03:00
committed by GitHub
parent ad294df03f
commit 715a6db02c
15 changed files with 346 additions and 360 deletions

View File

@@ -6,8 +6,8 @@
#include "llama-cparams.h"
#include "llama-model-loader.h"
#include "llama-kv-cache-unified.h"
#include "llama-kv-cache-unified-iswa.h"
#include "llama-kv-cache.h"
#include "llama-kv-cache-iswa.h"
#include "llama-memory-hybrid.h"
#include "llama-memory-recurrent.h"
@@ -5986,7 +5986,7 @@ struct llm_build_llama : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
@@ -6146,7 +6146,7 @@ struct llm_build_llama_iswa : public llm_graph_context {
ggml_tensor * inp_attn_scale = nullptr;
inp_attn_scale = build_inp_attn_scale();
auto * inp_attn = build_attn_inp_kv_unified_iswa();
auto * inp_attn = build_attn_inp_kv_iswa();
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
@@ -6325,7 +6325,7 @@ struct llm_build_deci : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
@@ -6481,7 +6481,7 @@ struct llm_build_baichuan : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr;
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -6603,7 +6603,7 @@ struct llm_build_xverse : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -6717,7 +6717,7 @@ struct llm_build_falcon : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -6841,7 +6841,7 @@ struct llm_build_grok : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -7001,7 +7001,7 @@ struct llm_build_dbrx : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -7125,7 +7125,7 @@ struct llm_build_starcoder : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
cb(pos, "pos_embd", -1);
@@ -7230,7 +7230,7 @@ struct llm_build_refact : public llm_graph_context {
inpL = build_inp_embd(model.tok_embd);
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -7632,7 +7632,7 @@ struct llm_build_bloom : public llm_graph_context {
inpL = build_inp_embd(model.tok_embd);
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
inpL = build_norm(inpL,
model.tok_norm,
@@ -7739,7 +7739,7 @@ struct llm_build_mpt : public llm_graph_context {
inpL = build_inp_embd(model.tok_embd);
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
if (model.pos_embd) {
// inp_pos - contains the positions
@@ -7889,7 +7889,7 @@ struct llm_build_stablelm : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -8041,7 +8041,7 @@ struct llm_build_qwen : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -8156,7 +8156,7 @@ struct llm_build_qwen2 : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -8481,7 +8481,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
@@ -8602,7 +8602,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -8761,7 +8761,7 @@ struct llm_build_qwen3 : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -8882,7 +8882,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -9012,7 +9012,7 @@ struct llm_build_phi2 : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -9141,13 +9141,13 @@ struct llm_build_phi3 : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
inp_attn_type * inp_attn = nullptr;
if constexpr (iswa) {
inp_attn = build_attn_inp_kv_unified_iswa();
inp_attn = build_attn_inp_kv_iswa();
} else {
inp_attn = build_attn_inp_kv_unified();
inp_attn = build_attn_inp_kv();
}
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -9299,7 +9299,7 @@ struct llm_build_plamo : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -9415,7 +9415,7 @@ struct llm_build_gpt2 : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
cb(pos, "pos_embd", -1);
@@ -9525,7 +9525,7 @@ struct llm_build_codeshell : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -9638,7 +9638,7 @@ struct llm_build_orion : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -9765,7 +9765,7 @@ struct llm_build_internlm2 : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -9901,7 +9901,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -10096,7 +10096,7 @@ struct llm_build_gemma : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -10212,7 +10212,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified_iswa();
auto * inp_attn = build_attn_inp_kv_iswa();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -10346,7 +10346,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
ggml_tensor * inp_pos = build_inp_pos();
// TODO: is causal == true correct? might need some changes
auto * inp_attn = build_attn_inp_kv_unified_iswa();
auto * inp_attn = build_attn_inp_kv_iswa();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -10497,7 +10497,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
ggml_tensor * inp_pos = build_inp_pos();
// TODO: is causal == true correct? might need some changes
auto * inp_attn = build_attn_inp_kv_unified_iswa();
auto * inp_attn = build_attn_inp_kv_iswa();
// inp_per_layer shape: [n_embd_altup, n_tokens, n_layer]
ggml_tensor * inp_per_layer = project_per_layer_inputs(inpL, get_per_layer_inputs());
@@ -10904,7 +10904,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -11473,7 +11473,7 @@ struct llm_build_command_r : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -11620,7 +11620,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified_iswa();
auto * inp_attn = build_attn_inp_kv_iswa();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -11755,7 +11755,7 @@ struct llm_build_olmo : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -11883,7 +11883,7 @@ struct llm_build_olmo2 : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -12012,7 +12012,7 @@ struct llm_build_olmoe : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -12138,7 +12138,7 @@ struct llm_build_openelm : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -12269,7 +12269,7 @@ struct llm_build_gptneox : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -12415,7 +12415,7 @@ struct llm_build_arctic : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -12553,7 +12553,7 @@ struct llm_build_deepseek : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
@@ -12730,7 +12730,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -12977,7 +12977,7 @@ struct llm_build_bitnet : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -13241,7 +13241,7 @@ struct llm_build_t5_dec : public llm_graph_context {
const int64_t n_outputs_enc = embd_enc->ne[1];
auto * inp_attn_self = build_attn_inp_kv_unified();
auto * inp_attn_self = build_attn_inp_kv();
auto * inp_attn_cross = build_attn_inp_cross();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -13406,7 +13406,7 @@ struct llm_build_jais : public llm_graph_context {
inpL = build_inp_embd(model.tok_embd);
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -13504,7 +13504,7 @@ struct llm_build_chatglm : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -13637,7 +13637,7 @@ struct llm_build_glm4 : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -13787,7 +13787,7 @@ struct llm_build_glm4_moe : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -13947,7 +13947,7 @@ struct llm_build_nemotron : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -14076,7 +14076,7 @@ struct llm_build_exaone : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -14208,13 +14208,13 @@ struct llm_build_exaone4 : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
inp_attn_type * inp_attn = nullptr;
if constexpr (iswa) {
inp_attn = build_attn_inp_kv_unified_iswa();
inp_attn = build_attn_inp_kv_iswa();
} else {
inp_attn = build_attn_inp_kv_unified();
inp_attn = build_attn_inp_kv();
}
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -15097,7 +15097,7 @@ struct llm_build_granite : public llm_graph_context {
inp_pos = build_inp_pos();
}
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -15148,12 +15148,12 @@ struct llm_build_granite : public llm_graph_context {
}
ggml_tensor * build_attention_layer(
ggml_tensor * cur,
ggml_tensor * inp_pos,
llm_graph_input_attn_kv_unified * inp_attn,
const llama_model & model,
const int64_t n_embd_head,
const int il) {
ggml_tensor * cur,
ggml_tensor * inp_pos,
llm_graph_input_attn_kv * inp_attn,
const llama_model & model,
const int64_t n_embd_head,
const int il) {
// compute Q and K and (optionally) RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -15367,12 +15367,12 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
}
ggml_tensor * build_attention_layer(
ggml_tensor * cur,
ggml_tensor * inp_pos,
llm_graph_input_attn_kv_unified * inp_attn,
const llama_model & model,
const int64_t n_embd_head,
const int il) {
ggml_tensor * cur,
ggml_tensor * inp_pos,
llm_graph_input_attn_kv * inp_attn,
const llama_model & model,
const int64_t n_embd_head,
const int il) {
// compute Q and K and (optionally) RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -15529,7 +15529,7 @@ struct llm_build_chameleon : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -15860,7 +15860,7 @@ struct llm_build_plm : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -16025,7 +16025,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -16174,7 +16174,7 @@ struct llm_build_dots1 : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -16324,7 +16324,7 @@ struct llm_build_ernie4_5 : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
@@ -16454,7 +16454,7 @@ struct llm_build_ernie4_5_moe : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -16828,7 +16828,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
private:
ggml_tensor * build_plamo2_attn_layer(
llm_graph_input_attn_kv_unified * inp,
llm_graph_input_attn_kv * inp,
ggml_tensor * inp_pos,
ggml_tensor * cur,
const llama_model & model,
@@ -17061,7 +17061,7 @@ struct llm_build_arcee : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
@@ -17196,7 +17196,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
@@ -17357,7 +17357,7 @@ struct llm_build_hunyuan_dense : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
@@ -17495,7 +17495,7 @@ struct llm_build_smollm3 : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
auto * inp_attn = build_attn_inp_kv();
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
@@ -17627,7 +17627,7 @@ struct llm_build_openai_moe_iswa : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified_iswa();
auto * inp_attn = build_attn_inp_kv_iswa();
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
@@ -17809,10 +17809,10 @@ struct llm_build_lfm2 : public llm_graph_context {
return cur;
}
ggml_tensor * build_attn_block(ggml_tensor * cur,
ggml_tensor * inp_pos,
llm_graph_input_attn_kv_unified * inp_attn,
int il) const {
ggml_tensor * build_attn_block(ggml_tensor * cur,
ggml_tensor * inp_pos,
llm_graph_input_attn_kv * inp_attn,
int il) const {
GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
auto const n_embd_head = hparams.n_embd_head_v;
auto const n_head_kv = hparams.n_head_kv(il);
@@ -17940,13 +17940,13 @@ struct llm_build_smallthinker : public llm_graph_context{
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
inp_attn_type * inp_attn = nullptr;
if constexpr (iswa) {
inp_attn = build_attn_inp_kv_unified_iswa();
inp_attn = build_attn_inp_kv_iswa();
} else {
inp_attn = build_attn_inp_kv_unified();
inp_attn = build_attn_inp_kv();
}
ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -18076,7 +18076,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
std::max((uint32_t) 1, cparams.n_seq_max),
cparams.n_seq_max);
} else if (llm_arch_is_hybrid(arch)) {
const auto padding = llama_kv_cache_unified::get_padding(cparams);
const auto padding = llama_kv_cache::get_padding(cparams);
cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
@@ -18098,7 +18098,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
/* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
/* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
} else {
const auto padding = llama_kv_cache_unified::get_padding(cparams);
const auto padding = llama_kv_cache::get_padding(cparams);
uint32_t n_ctx_per_stream = cparams.n_ctx;
@@ -18118,7 +18118,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
GGML_ASSERT(hparams.is_swa_any());
res = new llama_kv_cache_unified_iswa(
res = new llama_kv_cache_iswa(
*this,
params.type_k,
params.type_v,
@@ -18133,7 +18133,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
} else {
GGML_ASSERT(!hparams.is_swa_any());
res = new llama_kv_cache_unified(
res = new llama_kv_cache(
*this,
nullptr,
params.type_k,