mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-10-28 08:31:25 +00:00
CUDA: better error for FA kernel with 0 occupancy (#16643)
This commit is contained in:
@@ -895,6 +895,7 @@ void launch_fattn(
     const dim3 block_dim(warp_size, nwarps, 1);
     int max_blocks_per_sm = 1; // Max. number of active blocks limited by occupancy.
     CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, fattn_kernel, block_dim.x * block_dim.y * block_dim.z, nbytes_shared));
+    GGML_ASSERT(max_blocks_per_sm > 0);
     int parallel_blocks = max_blocks_per_sm;

     dim3 blocks_num;
Reference in New Issue
Block a user