mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-11-04 09:32:00 +00:00)
CUDA: add option to compile without FlashAttention (#12025)
@@ -204,9 +204,9 @@ typedef float2 dfloat2;
 #define CP_ASYNC_AVAILABLE
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 
-#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
+#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
 #define FLASH_ATTN_AVAILABLE
-#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
+#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
 
 static bool fp16_available(const int cc) {
     return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL;
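The hunk adds GGML_CUDA_NO_FA as an opt-out guard: when that macro is defined at build time, FLASH_ATTN_AVAILABLE is never defined, so FlashAttention code paths are excluded from the build. The sketch below is an assumption-based illustration of this preprocessor pattern, not part of the commit; it omits the MUSA-specific condition and uses a plain host program so it can be compiled and run standalone.

// Minimal sketch (assumption, not from this commit): shows how defining
// GGML_CUDA_NO_FA at compile time suppresses FLASH_ATTN_AVAILABLE and
// thereby compiles out any code guarded by it.
#include <cstdio>

// Mirrors the guard added in the diff, minus the MUSA arch check.
#if !defined(GGML_CUDA_NO_FA)
#define FLASH_ATTN_AVAILABLE
#endif // !defined(GGML_CUDA_NO_FA)

int main() {
#ifdef FLASH_ATTN_AVAILABLE
    std::printf("FlashAttention paths compiled in\n");
#else
    std::printf("FlashAttention paths compiled out (GGML_CUDA_NO_FA defined)\n");
#endif
    return 0;
}

Building with -DGGML_CUDA_NO_FA on the compiler command line would take the second branch; how the real build system exposes this define (for example through a CMake option) is not shown in this hunk.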