CUDA: optimize FA for GQA + large batches (#12014)

This commit is contained in:
Johannes Gäßler
2025-02-22 12:20:17 +01:00
committed by GitHub
parent 335eb04a91
commit 5fa07c2f93
32 changed files with 940 additions and 411 deletions

View File

@@ -24,7 +24,7 @@ static __device__ __forceinline__ void cp_async_cg_16(const unsigned int dst, co
} else
#endif // CUDART_VERSION >= 11040
{
asm volatile("cp.async.cg.shared.global.L2 [%0], [%1], 16;"
asm volatile("cp.async.cg.shared.global [%0], [%1], 16;"
: : "r"(dst), "l"(src));
}
#else