CUDA: Add mul_mat_id support for the mmf kernel (#15767)
* CUDA: Add mul_mat_id support to the mmf kernel

  Add support for mul_mat_id for bs < 16

* Review: use warp_size, fix should_use_mmf condition
* Launch one block per expert, stride along n_expert_used (sketched below)
* Templatize mul_mat_id
* Pad shmem to 16 bytes, add helper function mul_mat_f_switch_ids
* Reduce compile times by dividing mmf into f16, bf16 and f32 variants
* Divide mmf by ncols_dst
* Add missing files
* Fix MUSA/HIP builds
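The "one block per expert, stride along n_expert_used" scheduling can be pictured with a short standalone sketch. This is not the llama.cpp mmf kernel: the name `gemv_expert_sketch`, the GEMV simplification (one weight row per expert), and all shapes are assumptions for illustration; the real kernel tiles over output columns and ships f16, bf16 and f32 variants.

```cuda
// Minimal sketch (NOT the llama.cpp kernel) of: one block per expert,
// striding over the n_expert_used slots of every token to find the
// rows routed to this block's expert.
#include <cuda_runtime.h>
#include <cstdio>

// x:   [n_experts * k]       one weight row per expert (assumed GEMV case)
// y:   [n_tokens  * k]       activations
// ids: [n_tokens  * n_used]  expert index chosen for each (token, slot)
// dst: [n_tokens  * n_used]  one dot product per (token, slot)
__global__ void gemv_expert_sketch(const float * x, const float * y,
                                   const int * ids, float * dst,
                                   int k, int n_tokens, int n_used) {
    const int expert = blockIdx.x; // one block per expert

    // stride along n_expert_used: visit every (token, slot) pair and
    // process only the ones routed to this block's expert
    for (int t = 0; t < n_tokens; ++t) {
        for (int s = 0; s < n_used; ++s) {
            if (ids[t*n_used + s] != expert) {
                continue;
            }
            // block-wide dot product of y[t] with this expert's weight row
            float sum = 0.0f;
            for (int i = threadIdx.x; i < k; i += blockDim.x) {
                sum += x[expert*k + i] * y[t*k + i];
            }
            // warp reduction; the launch below uses blockDim.x == 32
            for (int off = 16; off > 0; off >>= 1) {
                sum += __shfl_down_sync(0xffffffff, sum, off);
            }
            if (threadIdx.x == 0) {
                dst[t*n_used + s] = sum;
            }
        }
    }
}

int main() {
    const int n_experts = 4, n_used = 2, n_tokens = 3, k = 32;
    float *x, *y, *dst;
    int *ids;
    cudaMallocManaged(&x,   n_experts*k*sizeof(float));
    cudaMallocManaged(&y,   n_tokens *k*sizeof(float));
    cudaMallocManaged(&dst, n_tokens *n_used*sizeof(float));
    cudaMallocManaged(&ids, n_tokens *n_used*sizeof(int));

    for (int i = 0; i < n_experts*k; ++i) x[i] = 1.0f;
    for (int i = 0; i < n_tokens *k; ++i) y[i] = 1.0f;
    const int picks[] = {0, 1,  2, 3,  1, 2}; // 2 experts per token
    for (int i = 0; i < n_tokens*n_used; ++i) ids[i] = picks[i];

    gemv_expert_sketch<<<n_experts, 32>>>(x, y, ids, dst, k, n_tokens, n_used);
    cudaDeviceSynchronize();

    printf("dst[0][0] = %.1f (expected %d)\n", dst[0], k); // all-ones dot product
    return 0;
}
```

Each (token, slot) pair is owned by exactly one expert, so no two blocks ever write the same `dst` element and no atomics are needed in this sketch.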
```diff
@@ -6261,7 +6261,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     for (int n_mats : {4, 8}) {
         for (int n_used : {1, 2, 4}) {
             for (bool b : {false, true}) {
-                for (int n : {1, 32, 129}) {
+                for (int n : {1, 4, 5, 32, 129}) {
                     int m = 512;
                     int k = 256;
                     test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k));
```
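The new batch sizes n = 4 and n = 5 fall below the bs < 16 bound from the commit message, so they exercise the new mmf mul_mat_id path, while 32 and 129 continue to cover the larger-batch kernels. A hypothetical sketch of that kind of gating, assuming a plain batch-size check (the actual should_use_mmf condition also depends on tensor types and the device):

```cuda
#include <cstdint>

// Hypothetical gating sketch. The name echoes should_use_mmf from the
// commit message, but this body is an assumption: the real predicate in
// llama.cpp also inspects tensor types, shapes and device capabilities.
static bool should_use_mmf_sketch(int64_t ncols_dst, bool is_mul_mat_id) {
    if (is_mul_mat_id) {
        return ncols_dst < 16; // commit message: mul_mat_id for bs < 16
    }
    return ncols_dst <= 16;   // assumed threshold for the non-id path
}
```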