Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-30 08:42:00 +00:00)
	kv-cache : drop the "unified" prefix (#15467)
* kv-cache : drop the "unified" prefix

ggml-ci

* cont : fix comment

[no ci]
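For reference, the shape of the rename as it appears in the hunks below. This is a minimal sketch excerpted from the templated builders (phi3, exaone4, smallthinker), where both the regular and the sliding-window (iswa) variants occur side by side; only the identifier names change, the graph-building logic is untouched:

```cpp
// Headers in llama-model.cpp:
#include "llama-kv-cache.h"       // was: "llama-kv-cache-unified.h"
#include "llama-kv-cache-iswa.h"  // was: "llama-kv-cache-unified-iswa.h"

// Attention-input selection inside a templated llm_build_* graph builder:
using inp_attn_type = std::conditional_t<iswa,
        llm_graph_input_attn_kv_iswa,   // was: llm_graph_input_attn_kv_unified_iswa
        llm_graph_input_attn_kv>;       // was: llm_graph_input_attn_kv_unified
inp_attn_type * inp_attn = nullptr;

if constexpr (iswa) {
    inp_attn = build_attn_inp_kv_iswa();  // was: build_attn_inp_kv_unified_iswa()
} else {
    inp_attn = build_attn_inp_kv();       // was: build_attn_inp_kv_unified()
}
```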
		| @@ -6,8 +6,8 @@ | ||||
| #include "llama-cparams.h" | ||||
| #include "llama-model-loader.h" | ||||
|  | ||||
| #include "llama-kv-cache-unified.h" | ||||
| #include "llama-kv-cache-unified-iswa.h" | ||||
| #include "llama-kv-cache.h" | ||||
| #include "llama-kv-cache-iswa.h" | ||||
| #include "llama-memory-hybrid.h" | ||||
| #include "llama-memory-recurrent.h" | ||||
|  | ||||
| @@ -5986,7 +5986,7 @@ struct llm_build_llama : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; | ||||
|  | ||||
| @@ -6146,7 +6146,7 @@ struct llm_build_llama_iswa : public llm_graph_context { | ||||
|         ggml_tensor * inp_attn_scale = nullptr; | ||||
|         inp_attn_scale = build_inp_attn_scale(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified_iswa(); | ||||
|         auto * inp_attn = build_attn_inp_kv_iswa(); | ||||
|  | ||||
|         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; | ||||
|  | ||||
| @@ -6325,7 +6325,7 @@ struct llm_build_deci : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; | ||||
|  | ||||
| @@ -6481,7 +6481,7 @@ struct llm_build_baichuan : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr; | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -6603,7 +6603,7 @@ struct llm_build_xverse : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -6717,7 +6717,7 @@ struct llm_build_falcon : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -6841,7 +6841,7 @@ struct llm_build_grok : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -7001,7 +7001,7 @@ struct llm_build_dbrx : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -7125,7 +7125,7 @@ struct llm_build_starcoder : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); | ||||
|         cb(pos, "pos_embd", -1); | ||||
| @@ -7230,7 +7230,7 @@ struct llm_build_refact : public llm_graph_context { | ||||
|  | ||||
|         inpL = build_inp_embd(model.tok_embd); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -7632,7 +7632,7 @@ struct llm_build_bloom : public llm_graph_context { | ||||
|  | ||||
|         inpL = build_inp_embd(model.tok_embd); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         inpL = build_norm(inpL, | ||||
|                 model.tok_norm, | ||||
| @@ -7739,7 +7739,7 @@ struct llm_build_mpt : public llm_graph_context { | ||||
|  | ||||
|         inpL = build_inp_embd(model.tok_embd); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         if (model.pos_embd) { | ||||
|             // inp_pos - contains the positions | ||||
| @@ -7889,7 +7889,7 @@ struct llm_build_stablelm : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -8041,7 +8041,7 @@ struct llm_build_qwen : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -8156,7 +8156,7 @@ struct llm_build_qwen2 : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -8481,7 +8481,7 @@ struct llm_build_qwen2vl : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         int sections[4]; | ||||
|         std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); | ||||
| @@ -8602,7 +8602,7 @@ struct llm_build_qwen2moe : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -8761,7 +8761,7 @@ struct llm_build_qwen3 : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -8882,7 +8882,7 @@ struct llm_build_qwen3moe : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -9012,7 +9012,7 @@ struct llm_build_phi2 : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -9141,13 +9141,13 @@ struct llm_build_phi3 : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>; | ||||
|         using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>; | ||||
|         inp_attn_type * inp_attn = nullptr; | ||||
|  | ||||
|         if constexpr (iswa) { | ||||
|             inp_attn = build_attn_inp_kv_unified_iswa(); | ||||
|             inp_attn = build_attn_inp_kv_iswa(); | ||||
|         } else { | ||||
|             inp_attn = build_attn_inp_kv_unified(); | ||||
|             inp_attn = build_attn_inp_kv(); | ||||
|         } | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
| @@ -9299,7 +9299,7 @@ struct llm_build_plamo : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -9415,7 +9415,7 @@ struct llm_build_gpt2 : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); | ||||
|         cb(pos, "pos_embd", -1); | ||||
| @@ -9525,7 +9525,7 @@ struct llm_build_codeshell : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -9638,7 +9638,7 @@ struct llm_build_orion : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -9765,7 +9765,7 @@ struct llm_build_internlm2 : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -9901,7 +9901,7 @@ struct llm_build_minicpm3 : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -10096,7 +10096,7 @@ struct llm_build_gemma : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -10212,7 +10212,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified_iswa(); | ||||
|         auto * inp_attn = build_attn_inp_kv_iswa(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -10346,7 +10346,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context { | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         // TODO: is causal == true correct? might need some changes | ||||
|         auto * inp_attn = build_attn_inp_kv_unified_iswa(); | ||||
|         auto * inp_attn = build_attn_inp_kv_iswa(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -10497,7 +10497,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context { | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         // TODO: is causal == true correct? might need some changes | ||||
|         auto * inp_attn = build_attn_inp_kv_unified_iswa(); | ||||
|         auto * inp_attn = build_attn_inp_kv_iswa(); | ||||
|  | ||||
|         // inp_per_layer shape: [n_embd_altup, n_tokens, n_layer] | ||||
|         ggml_tensor * inp_per_layer = project_per_layer_inputs(inpL, get_per_layer_inputs()); | ||||
| @@ -10904,7 +10904,7 @@ struct llm_build_starcoder2 : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -11473,7 +11473,7 @@ struct llm_build_command_r : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -11620,7 +11620,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified_iswa(); | ||||
|         auto * inp_attn = build_attn_inp_kv_iswa(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -11755,7 +11755,7 @@ struct llm_build_olmo : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -11883,7 +11883,7 @@ struct llm_build_olmo2 : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -12012,7 +12012,7 @@ struct llm_build_olmoe : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -12138,7 +12138,7 @@ struct llm_build_openelm : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -12269,7 +12269,7 @@ struct llm_build_gptneox : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -12415,7 +12415,7 @@ struct llm_build_arctic : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -12553,7 +12553,7 @@ struct llm_build_deepseek : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; | ||||
|  | ||||
| @@ -12730,7 +12730,7 @@ struct llm_build_deepseek2 : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -12977,7 +12977,7 @@ struct llm_build_bitnet : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -13241,7 +13241,7 @@ struct llm_build_t5_dec : public llm_graph_context { | ||||
|  | ||||
|         const int64_t n_outputs_enc = embd_enc->ne[1]; | ||||
|  | ||||
|         auto * inp_attn_self  = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn_self  = build_attn_inp_kv(); | ||||
|         auto * inp_attn_cross = build_attn_inp_cross(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
| @@ -13406,7 +13406,7 @@ struct llm_build_jais : public llm_graph_context { | ||||
|  | ||||
|         inpL = build_inp_embd(model.tok_embd); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -13504,7 +13504,7 @@ struct llm_build_chatglm : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -13637,7 +13637,7 @@ struct llm_build_glm4 : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -13787,7 +13787,7 @@ struct llm_build_glm4_moe : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -13947,7 +13947,7 @@ struct llm_build_nemotron : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -14076,7 +14076,7 @@ struct llm_build_exaone : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -14208,13 +14208,13 @@ struct llm_build_exaone4 : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>; | ||||
|         using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>; | ||||
|         inp_attn_type * inp_attn = nullptr; | ||||
|  | ||||
|         if constexpr (iswa) { | ||||
|             inp_attn = build_attn_inp_kv_unified_iswa(); | ||||
|             inp_attn = build_attn_inp_kv_iswa(); | ||||
|         } else { | ||||
|             inp_attn = build_attn_inp_kv_unified(); | ||||
|             inp_attn = build_attn_inp_kv(); | ||||
|         } | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
| @@ -15097,7 +15097,7 @@ struct llm_build_granite : public llm_graph_context { | ||||
|             inp_pos = build_inp_pos(); | ||||
|         } | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -15148,12 +15148,12 @@ struct llm_build_granite : public llm_graph_context { | ||||
|     } | ||||
|  | ||||
|     ggml_tensor * build_attention_layer( | ||||
|               ggml_tensor                     * cur, | ||||
|               ggml_tensor                     * inp_pos, | ||||
|               llm_graph_input_attn_kv_unified * inp_attn, | ||||
|         const llama_model                     & model, | ||||
|         const int64_t                           n_embd_head, | ||||
|         const int                               il) { | ||||
|               ggml_tensor             * cur, | ||||
|               ggml_tensor             * inp_pos, | ||||
|               llm_graph_input_attn_kv * inp_attn, | ||||
|         const llama_model             & model, | ||||
|         const int64_t                 n_embd_head, | ||||
|         const int                     il) { | ||||
|  | ||||
|         // compute Q and K and (optionally) RoPE them | ||||
|         ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); | ||||
| @@ -15367,12 +15367,12 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba { | ||||
|     } | ||||
|  | ||||
|     ggml_tensor * build_attention_layer( | ||||
|               ggml_tensor                     * cur, | ||||
|               ggml_tensor                     * inp_pos, | ||||
|               llm_graph_input_attn_kv_unified * inp_attn, | ||||
|         const llama_model                     & model, | ||||
|         const int64_t                           n_embd_head, | ||||
|         const int                               il) { | ||||
|               ggml_tensor             * cur, | ||||
|               ggml_tensor             * inp_pos, | ||||
|               llm_graph_input_attn_kv * inp_attn, | ||||
|         const llama_model             & model, | ||||
|         const int64_t                 n_embd_head, | ||||
|         const int                     il) { | ||||
|  | ||||
|         // compute Q and K and (optionally) RoPE them | ||||
|         ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); | ||||
| @@ -15529,7 +15529,7 @@ struct llm_build_chameleon : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -15860,7 +15860,7 @@ struct llm_build_plm : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -16025,7 +16025,7 @@ struct llm_build_bailingmoe : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -16174,7 +16174,7 @@ struct llm_build_dots1 : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -16324,7 +16324,7 @@ struct llm_build_ernie4_5 : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         for (int il = 0; il < n_layer; ++il) { | ||||
|             ggml_tensor * inpSA = inpL; | ||||
| @@ -16454,7 +16454,7 @@ struct llm_build_ernie4_5_moe : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
|  | ||||
| @@ -16828,7 +16828,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba { | ||||
|  | ||||
| private: | ||||
|     ggml_tensor * build_plamo2_attn_layer( | ||||
|             llm_graph_input_attn_kv_unified * inp, | ||||
|             llm_graph_input_attn_kv * inp, | ||||
|             ggml_tensor * inp_pos, | ||||
|             ggml_tensor * cur, | ||||
|             const llama_model & model, | ||||
| @@ -17061,7 +17061,7 @@ struct llm_build_arcee : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; | ||||
|  | ||||
| @@ -17196,7 +17196,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         const float kq_scale = 1.0f / sqrtf(float(n_embd_head)); | ||||
|  | ||||
| @@ -17357,7 +17357,7 @@ struct llm_build_hunyuan_dense : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         const float kq_scale = 1.0f / sqrtf(float(n_embd_head)); | ||||
|  | ||||
| @@ -17495,7 +17495,7 @@ struct llm_build_smollm3 : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified(); | ||||
|         auto * inp_attn = build_attn_inp_kv(); | ||||
|  | ||||
|         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; | ||||
|  | ||||
| @@ -17627,7 +17627,7 @@ struct llm_build_openai_moe_iswa : public llm_graph_context { | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         auto * inp_attn = build_attn_inp_kv_unified_iswa(); | ||||
|         auto * inp_attn = build_attn_inp_kv_iswa(); | ||||
|  | ||||
|         for (int il = 0; il < n_layer; ++il) { | ||||
|             ggml_tensor * inpSA = inpL; | ||||
| @@ -17809,10 +17809,10 @@ struct llm_build_lfm2 : public llm_graph_context { | ||||
|         return cur; | ||||
|     } | ||||
|  | ||||
|     ggml_tensor * build_attn_block(ggml_tensor                     * cur, | ||||
|                                    ggml_tensor                     * inp_pos, | ||||
|                                    llm_graph_input_attn_kv_unified * inp_attn, | ||||
|                                    int                               il) const { | ||||
|     ggml_tensor * build_attn_block(ggml_tensor             * cur, | ||||
|                                    ggml_tensor             * inp_pos, | ||||
|                                    llm_graph_input_attn_kv * inp_attn, | ||||
|                                    int                     il) const { | ||||
|         GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il)); | ||||
|         auto const n_embd_head = hparams.n_embd_head_v; | ||||
|         auto const n_head_kv = hparams.n_head_kv(il); | ||||
| @@ -17940,13 +17940,13 @@ struct llm_build_smallthinker : public llm_graph_context{ | ||||
|         // inp_pos - contains the positions | ||||
|         ggml_tensor * inp_pos = build_inp_pos(); | ||||
|  | ||||
|         using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>; | ||||
|         using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>; | ||||
|         inp_attn_type * inp_attn = nullptr; | ||||
|  | ||||
|         if constexpr (iswa) { | ||||
|             inp_attn = build_attn_inp_kv_unified_iswa(); | ||||
|             inp_attn = build_attn_inp_kv_iswa(); | ||||
|         } else { | ||||
|             inp_attn = build_attn_inp_kv_unified(); | ||||
|             inp_attn = build_attn_inp_kv(); | ||||
|         } | ||||
|  | ||||
|         ggml_tensor * inp_out_ids = build_inp_out_ids(); | ||||
| @@ -18076,7 +18076,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, | ||||
|                             std::max((uint32_t) 1, cparams.n_seq_max), | ||||
|                             cparams.n_seq_max); | ||||
|                 } else if (llm_arch_is_hybrid(arch)) { | ||||
|                     const auto padding = llama_kv_cache_unified::get_padding(cparams); | ||||
|                     const auto padding = llama_kv_cache::get_padding(cparams); | ||||
|  | ||||
|                     cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding); | ||||
|  | ||||
| @@ -18098,7 +18098,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, | ||||
|                         /* filter_attn       */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr, | ||||
|                         /* filter_recr       */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr); | ||||
|                 } else { | ||||
|                     const auto padding = llama_kv_cache_unified::get_padding(cparams); | ||||
|                     const auto padding = llama_kv_cache::get_padding(cparams); | ||||
|  | ||||
|                     uint32_t n_ctx_per_stream = cparams.n_ctx; | ||||
|  | ||||
| @@ -18118,7 +18118,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, | ||||
|                     if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { | ||||
|                         GGML_ASSERT(hparams.is_swa_any()); | ||||
|  | ||||
|                         res = new llama_kv_cache_unified_iswa( | ||||
|                         res = new llama_kv_cache_iswa( | ||||
|                                 *this, | ||||
|                                 params.type_k, | ||||
|                                 params.type_v, | ||||
| @@ -18133,7 +18133,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, | ||||
|                     } else { | ||||
|                         GGML_ASSERT(!hparams.is_swa_any()); | ||||
|  | ||||
|                         res = new llama_kv_cache_unified( | ||||
|                         res = new llama_kv_cache( | ||||
|                                 *this, | ||||
|                                 nullptr, | ||||
|                                 params.type_k, | ||||
|   | ||||
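The same rename applies on the cache side in llama_model::create_memory. A condensed sketch of that path as it reads after this patch; constructor arguments beyond those visible in the hunks above are elided:

```cpp
const auto padding = llama_kv_cache::get_padding(cparams);  // was: llama_kv_cache_unified::get_padding

cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);

if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
    GGML_ASSERT(hparams.is_swa_any());

    res = new llama_kv_cache_iswa(   // was: llama_kv_cache_unified_iswa
            *this,
            params.type_k,
            params.type_v,
            /* ... */);
} else {
    GGML_ASSERT(!hparams.is_swa_any());

    res = new llama_kv_cache(        // was: llama_kv_cache_unified
            *this,
            nullptr,
            params.type_k,
            /* ... */);
}
```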