CUDA: add fp kernel for larger batch size MoE (#16512)

* CUDA: kernel for larger batch sizes for MoE

* WIP

* WIP

* WIP

* WIP

* WIP

* WIP

* fixup

* tests

* Move mmq_ids_helper to mmid

* cleanup

* Remove redundant checks
This commit is contained in:
Aman Gupta
2025-10-14 19:15:15 +08:00
committed by GitHub
parent 5b6913c47b
commit 48e2fa9fb7
6 changed files with 535 additions and 204 deletions

View File

@@ -6911,7 +6911,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
}
// qwen3-30b-a3b
-    for (int bs : {1, 4, 8, 32, 64, 128, 512}) {
+    for (int bs : {1, 4, 8, 32, 64, 128, 256, 512}) {
for (ggml_type type_a : {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K, GGML_TYPE_IQ2_XS}) {
for (ggml_type type_b : {GGML_TYPE_F32}) {
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, 128, 8, false, 768, bs, 2048, 1));
@@ -6919,6 +6919,15 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
}
}
for (int bs : {1, 4, 8, 32, 64, 128, 256, 512}) {
for (ggml_type type_a : {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K, GGML_TYPE_IQ2_XS}) {
for (ggml_type type_b : {GGML_TYPE_F32}) {
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, 32, 4, false, 1792, bs, 2048, 1));
}
}
}
// gpt-oss-20b
for (int bs : {1, 4, 8, 512}) {
for (ggml_type type_a : {GGML_TYPE_MXFP4}) {