CUDA: mul_mat_id for mmf for bs <= 64 for f16 and bs <= 32 for f32 (#16277)

* CUDA: mul_mat_id for mmf for bs <= 64 for f16 and bs <= 32 for f32

This commit adds mul_mat_id support for ncols_dst >= 16. It does this by
packing ncols_dst tiles into the blockDim.y.

My tests on a RTX 3090 show that this is faster than the cuBLAS fallback
for f16 till bs=64, and for f32 till bs=32

* Review: refactor if statement
This commit is contained in:
Aman Gupta
2025-09-28 00:49:32 +08:00
committed by GitHub
parent 75a3a6c2cd
commit c0bfc57af4
4 changed files with 92 additions and 47 deletions

View File

@@ -6329,7 +6329,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
for (int n_mats : {4, 8}) {
for (int n_used : {1, 2, 4}) {
for (bool b : {false, true}) {
for (int n : {1, 4, 5, 32, 129}) {
for (int n : {1, 4, 5, 17, 32, 129}) {
int m = 512;
int k = 256;
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k));
@@ -6733,7 +6733,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
}
// qwen3-30b-a3b
for (int bs : {1, 4, 8, 512}) {
for (int bs : {1, 4, 8, 32, 64, 128, 512}) {
for (ggml_type type_a : {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K, GGML_TYPE_IQ2_XS}) {
for (ggml_type type_b : {GGML_TYPE_F32}) {
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, 128, 8, false, 768, bs, 2048, 1));