CUDA: add option to compile without FlashAttention (#12025)

This commit is contained in:
Johannes Gäßler
2025-02-22 20:44:34 +01:00
committed by GitHub
parent 36c258ee92
commit a28e0d5eb1
13 changed files with 46 additions and 31 deletions

View File

@@ -41,12 +41,7 @@ static __global__ void flash_attn_vec_ext_f16(
const int ne1,
const int ne2,
const int ne3) {
#ifdef FP16_AVAILABLE
#ifndef FLASH_ATTN_AVAILABLE
NO_DEVICE_CODE;
return;
#endif // FLASH_ATTN_AVAILABLE
#if defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
// Skip unused kernel variants for faster compilation:
if (use_logit_softcap && !(D == 128 || D == 256)) {
@@ -300,7 +295,7 @@ static __global__ void flash_attn_vec_ext_f16(
}
#else
NO_DEVICE_CODE;
#endif // FP16_AVAILABLE
#endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
}
template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>