mirror of https://github.com/ggml-org/llama.cpp.git
	CUDA: fix FlashAttention on Turing (#13415)
@@ -546,7 +546,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
         const int i0_stop = i0_start + 2*c::nbatch_V2 < DV ? i0_start + 2*c::nbatch_V2 : DV;
         const int i0_diff = i0_stop - i0_start;
 
-        if (nstages == 1) {
+        if (nstages <= 1) {
             constexpr bool use_cp_async = nstages == 1;
             flash_attn_ext_f16_load_tile<stride_tile_V, nwarps, c::nbatch_fa, use_cp_async>
                 (V_h2 + k_VKQ_0*stride_V + i0_start/2, tile_V, i0_diff/2, stride_V);
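Asynchronous shared-memory copies (cp.async) require Ampere or newer, so on Turing the kernel presumably runs without a software pipeline (nstages == 0). With the old `nstages == 1` check that path skipped loading the V tile entirely; `nstages <= 1` keeps the load, and `use_cp_async` still falls back to synchronous copies. A minimal sketch of that branch logic, with illustrative names rather than the real kernel:

```cpp
#include <cuda_fp16.h>

// Hypothetical sketch, not the actual llama.cpp kernel: shows why the
// condition has to cover nstages == 0 as well as nstages == 1.
template <int nstages>
__device__ void load_tile_sketch(const half2 * __restrict__ src, half2 * dst, const int n) {
    if (nstages <= 1) {
        // nstages == 1: single-stage pipeline, loads would go through cp.async.
        // nstages == 0: no cp.async available (e.g. Turing), but the tile
        //               still has to be loaded, just synchronously. With the
        //               old `nstages == 1` check this case loaded nothing.
        constexpr bool use_cp_async = nstages == 1;
        if constexpr (use_cp_async) {
            // On Ampere and newer this would issue cp.async copies; a plain
            // copy stands in for that here.
            for (int i = threadIdx.x; i < n; i += blockDim.x) {
                dst[i] = src[i];
            }
        } else {
            // Synchronous fallback that the fix enables on Turing.
            for (int i = threadIdx.x; i < n; i += blockDim.x) {
                dst[i] = src[i];
            }
        }
    }
    // nstages >= 2: the tile was already prefetched in an earlier iteration,
    // so nothing needs to be loaded at this point.
}
```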
Johannes Gäßler