	llama : DeepSeek V2/V3 MLA implementation (#12801)
* Merged using squash to remove all noise commit messages
* Force flash attention off for `LLM_ARCH_DEEPSEEK2` - embedding too large
* Removed 3 conts (2x RoPE and 1x RMS-norm)
* Changed to use `<cmath>` instead of `<math.h>`
* Reverted removal of the 3 conts
* Used `reshape` in `llm_graph_context::build_attn_mha()`
* Use `k_pe = ggml_reshape`
* Removed the 3 conts again
* Removed the 3D views of `wk_b` and `wv_b`, and just save as 3D in GGUF
* Removed MQA optimisation from `build_attn_mha()` as no gains now
* Simplified `is_mla` branch in `llm_build_deepseek2()`
* Removed `build_attn_mla` and added `nullptr` to all `build_attn` calls
* Fixed call to `build_attn` in `llm_build_t5_enc`
@@ -43,6 +43,10 @@ struct llama_hparams {
     uint32_t n_expert_used = 0;
     uint32_t n_rel_attn_bkts = 0;
 
+    // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
+    uint32_t n_embd_head_k_mla = 0;
+    uint32_t n_embd_head_v_mla = 0;
+
     // for WavTokenizer
     struct llama_hparams_posnet   posnet;
     struct llama_hparams_convnext convnext;
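
To illustrate how the two new fields are meant to be used (a minimal sketch, not the code from this commit): per the note in the diff, the graph builder can treat non-zero MLA head sizes as the signal to switch the per-head K/V dimensions. Only `n_embd_head_k_mla` / `n_embd_head_v_mla` come from the diff above; the surrounding struct and helper names are assumptions for illustration.

```cpp
#include <cstdint>

// Illustrative subset of the hyperparameters touched by this diff.
// The *_mla field names match the diff; the struct and helper are hypothetical.
struct hparams_sketch {
    uint32_t n_embd_head_k     = 128; // regular per-head K size
    uint32_t n_embd_head_v     = 128; // regular per-head V size
    uint32_t n_embd_head_k_mla = 0;   // MLA per-head K size (0 => MLA metadata absent)
    uint32_t n_embd_head_v_mla = 0;   // MLA per-head V size (0 => MLA metadata absent)
};

// Pick the effective head sizes: use the MLA dimensions only when both are set,
// otherwise fall back to the regular MHA head sizes.
static void pick_head_dims(const hparams_sketch & hp, uint32_t & n_k, uint32_t & n_v) {
    const bool is_mla = hp.n_embd_head_k_mla != 0 && hp.n_embd_head_v_mla != 0;
    n_k = is_mla ? hp.n_embd_head_k_mla : hp.n_embd_head_k;
    n_v = is_mla ? hp.n_embd_head_v_mla : hp.n_embd_head_v;
}
```

Defaulting both fields to 0 and requiring both to be non-zero keeps models whose GGUF metadata lacks the MLA head sizes on the regular attention path.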