	metal : improve FA + improve MoE (#12612)
* ggml : FA with different K, V head sizes (CPU) ggml-ci
* metal : add FA with HS=192
* metal : extend FA to support different K and V head sizes ggml-ci
* metal : add FA vector kernels for heads K 192 and V 128 ggml-ci
* ggml : restrict op on other backends to equal head sizes ggml-ci
* metal : optimize FA-vec kernel ggml-ci
* metal : FA remove mq registers
* metal : improve MoE mul_mat_id condition ggml-ci
* metal : fix comments + remove unnecessary addition ggml-ci
* metal : avoid too much shared memory usage with mul_mat_id ggml-ci
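For context on the head-size items above: the flash-attention op now allows the key and value head sizes to differ, with the output inheriting the value head size. In standard per-head attention notation (a sketch for orientation, not text from the commit):

    O = \mathrm{softmax}\!\left( \frac{Q K^{\top}}{\sqrt{d_k}} + M \right) V,
    \qquad Q \in \mathbb{R}^{n \times d_k},\;
    K \in \mathbb{R}^{m \times d_k},\;
    V \in \mathbb{R}^{m \times d_v},\;
    O \in \mathbb{R}^{n \times d_v}

The new vector kernels for K head size 192 and V head size 128 correspond to the case d_k = 192, d_v = 128.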
@@ -2316,11 +2316,6 @@ llama_context * llama_init_from_model(
         params.flash_attn = false;
     }
 
-    if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) {
-        LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__);
-        params.flash_attn = false;
-    }
-
     if (ggml_is_quantized(params.type_v) && !params.flash_attn) {
         LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
         return nullptr;
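The deleted hunk removes the guard that forced flash attention off whenever the K and V head sizes differed. A minimal sketch of what this unlocks at the API level, assuming the llama.h C API of this period (the model path and error handling are placeholders, not from the commit):

// Sketch: enabling flash attention for a model whose K and V attention
// head sizes differ, e.g. n_embd_head_k = 192 and n_embd_head_v = 128.
// Before this change, llama_init_from_model() would log a warning and
// force params.flash_attn back to false for such models.
#include "llama.h"
#include <cstdio>

int main() {
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams); // placeholder path
    if (model == nullptr) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.flash_attn = true; // no longer forced off when head sizes differ

    llama_context * ctx = llama_init_from_model(model, cparams);
    if (ctx == nullptr) {
        fprintf(stderr, "failed to create context\n");
        return 1;
    }

    // ... run inference ...

    llama_free(ctx);
    llama_model_free(model);
    return 0;
}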
Georgi Gerganov