Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-30 08:42:00 +00:00)
	llama : DeepSeek V2/V3 MLA implementation (#12801)
* Merged using squash to remove all noise commit messages
* Force flash attention off for `LLM_ARCH_DEEPSEEK2` - embedding too large
* Removed 3 conts (2x RoPE and 1x RMS-norm)
* Changed to use `<cmath>` instead of `<math.h>`
* Reverted removal of the 3 conts
* Used `reshape` in `llm_graph_context::build_attn_mha()`
* Use `k_pe = ggml_reshape`
* Removed the 3 conts again
* Removed the 3D views of `wk_b` and `wv_b`, and just save them as 3D in GGUF (see the sketch below)
* Removed MQA optimisation from `build_attn_mha()` as no gains now
* Simplified `is_mla` branch in `llm_build_deepseek2()`
* Removed `build_attn_mla` and added `nullptr` to all `build_attn` calls
* Fixed call to `build_attn` in `llm_build_t5_enc`
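The `wk_b`/`wv_b` point above is the core of the MLA plumbing: the combined per-head KV up-projection is stored pre-split as two 3D tensors (the `attn_k_b` and `attn_v_b` names added in the diff below), so the graph code no longer needs to build 3D views at runtime. Below is a minimal, dependency-free sketch of that split; the dimension names (`kv_lora_rank`, `qk_nope`, `v`) and the row-major per-head layout are illustrative assumptions, not details taken from this commit.

```cpp
// Minimal sketch, NOT the llama.cpp implementation.
// Assumed layout: for each head, the combined kv_b weight stores qk_nope rows
// of the K up-projection followed by v rows of the V up-projection, each row
// of width kv_lora_rank.
#include <cstdio>
#include <vector>

struct mla_split {
    std::vector<float> k_b; // n_head x qk_nope x kv_lora_rank
    std::vector<float> v_b; // n_head x v       x kv_lora_rank
};

static mla_split split_kv_b(const std::vector<float> & kv_b,
                            int n_head, int qk_nope, int v, int kv_lora_rank) {
    mla_split out;
    const int rows_per_head = qk_nope + v;
    for (int h = 0; h < n_head; ++h) {
        const float * head = kv_b.data() + (size_t) h * rows_per_head * kv_lora_rank;
        // first qk_nope rows of this head -> per-head k_b slice
        out.k_b.insert(out.k_b.end(), head, head + (size_t) qk_nope * kv_lora_rank);
        // remaining v rows of this head -> per-head v_b slice
        out.v_b.insert(out.v_b.end(), head + (size_t) qk_nope * kv_lora_rank,
                                      head + (size_t) rows_per_head * kv_lora_rank);
    }
    return out;
}

int main() {
    const int n_head = 2, qk_nope = 4, v = 3, kv_lora_rank = 5;
    std::vector<float> kv_b((size_t) n_head * (qk_nope + v) * kv_lora_rank, 1.0f);
    mla_split s = split_kv_b(kv_b, n_head, qk_nope, v, kv_lora_rank);
    printf("k_b elements: %zu, v_b elements: %zu\n", s.k_b.size(), s.v_b.size());
    return 0;
}
```

Doing the split once when the GGUF is written keeps the per-token graph free of the extra view operations, which matches the commit's stated goal of removing the runtime 3D views of `wk_b` and `wv_b`.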
@@ -140,6 +140,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,       "%s.attention.relative_buckets_count"       },
     { LLM_KV_ATTENTION_SLIDING_WINDOW,               "%s.attention.sliding_window"               },
     { LLM_KV_ATTENTION_SCALE,                        "%s.attention.scale"                        },
+    { LLM_KV_ATTENTION_KEY_LENGTH_MLA,               "%s.attention.key_length_mla"               },
+    { LLM_KV_ATTENTION_VALUE_LENGTH_MLA,             "%s.attention.value_length_mla"             },
 
     { LLM_KV_ROPE_DIMENSION_COUNT,      "%s.rope.dimension_count"                 },
     { LLM_KV_ROPE_DIMENSION_SECTIONS,   "%s.rope.dimension_sections"              },
@@ -1103,6 +1105,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_ATTN_Q_B,           "blk.%d.attn_q_b" },
             { LLM_TENSOR_ATTN_KV_A_MQA,      "blk.%d.attn_kv_a_mqa" },
             { LLM_TENSOR_ATTN_KV_B,          "blk.%d.attn_kv_b" },
+            { LLM_TENSOR_ATTN_K_B,           "blk.%d.attn_k_b" },
+            { LLM_TENSOR_ATTN_V_B,           "blk.%d.attn_v_b" },
             { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
             { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
             { LLM_TENSOR_FFN_GATE,           "blk.%d.ffn_gate" },
@@ -1563,23 +1567,8 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_ATTN_Q_B,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_KV_A_MQA,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_KV_B,                  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_DEC_ATTN_Q,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_DEC_ATTN_K,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_Q,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_K,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_V,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_QKV,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_OUT,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_GATE,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_DOWN,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_UP,                     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_DOWN_SHEXP,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_GATE_SHEXP,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_UP_SHEXP,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_Q_A,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_Q_B,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_KV_A_MQA,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_KV_B,                  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_K_B,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_V_B,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_Q,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_K,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_V,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
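For reference, the new metadata keys and tensor names above are printf-style format strings: `%s` is filled with the architecture name and `%d` with the layer (block) index. The short sketch below shows the expansion; the `"deepseek2"` architecture string and layer index 3 are illustrative assumptions, not values taken from this diff.

```cpp
// Sketch of how the format strings from the diff expand into concrete names.
#include <cstdio>

int main() {
    char buf[256];

    // per-architecture metadata keys ("%s" = architecture name)
    snprintf(buf, sizeof(buf), "%s.attention.key_length_mla", "deepseek2");
    printf("%s\n", buf); // deepseek2.attention.key_length_mla

    snprintf(buf, sizeof(buf), "%s.attention.value_length_mla", "deepseek2");
    printf("%s\n", buf); // deepseek2.attention.value_length_mla

    // per-layer tensor names ("%d" = block/layer index)
    snprintf(buf, sizeof(buf), "blk.%d.attn_k_b", 3);
    printf("%s\n", buf); // blk.3.attn_k_b

    snprintf(buf, sizeof(buf), "blk.%d.attn_v_b", 3);
    printf("%s\n", buf); // blk.3.attn_v_b

    return 0;
}
```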
Juk Armstrong