	CPU/CUDA: Gemma 2 FlashAttention support (#8542)
* CPU/CUDA: Gemma 2 FlashAttention support
* apply logit_softcap to scale in kernel
* disable logit softcapping tests on Metal
* remove metal check
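The core of the change is the new logit_softcap parameter: when it is non-zero, each scaled attention score is squashed with a tanh before the softmax, as Gemma 2 requires. Below is a minimal sketch of the math only, not the actual kernel code; the real CPU/CUDA kernels fold the division into the existing scale factor, as the commit message notes.

#include <math.h>

// Sketch of logit softcapping applied to a single scaled attention score s.
// A logit_softcap of 0.0f is treated as "disabled" and leaves the score unchanged.
static inline float softcap_score(float s, float logit_softcap) {
    if (logit_softcap == 0.0f) {
        return s;
    }
    // Bound the score to the open interval (-logit_softcap, +logit_softcap).
    return logit_softcap * tanhf(s / logit_softcap);
}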
@@ -1760,7 +1760,8 @@ extern "C" {
             struct ggml_tensor  * v,
             struct ggml_tensor  * mask,
             float                 scale,
-            float                 max_bias);
+            float                 max_bias,
+            float                 logit_softcap);
 
     GGML_API void ggml_flash_attn_ext_set_prec(
             struct ggml_tensor * a,
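For callers, the API change amounts to one extra trailing float on ggml_flash_attn_ext. The sketch below is a hypothetical usage example, not code from this commit: tensor setup is omitted, and the scale, max_bias, and softcap values are illustrative assumptions (50.0f being the attention softcap Gemma 2 is commonly configured with).

#include <math.h>
#include "ggml.h"

// Hypothetical helper: build a flash-attention node with logit softcapping.
// q, k, v and mask are assumed to already have the shapes ggml_flash_attn_ext expects.
static struct ggml_tensor * build_softcapped_attn(
        struct ggml_context * ctx,
        struct ggml_tensor  * q,
        struct ggml_tensor  * k,
        struct ggml_tensor  * v,
        struct ggml_tensor  * mask,
        int                   head_dim) {
    const float scale         = 1.0f / sqrtf((float) head_dim);
    const float max_bias      = 0.0f;   // no ALiBi
    const float logit_softcap = 50.0f;  // illustrative; pass 0.0f to disable capping

    struct ggml_tensor * cur = ggml_flash_attn_ext(ctx, q, k, v, mask,
                                                   scale, max_bias, logit_softcap);

    // Optionally request F32 accumulation precision for the op.
    ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);

    return cur;
}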