metal : optimize MoE for large batches (#13388)

ggml-ci
2025-11-01 09:01:57 +00:00 · 2025-05-09 15:14:56 +03:00
parent 0cf6725e9f
commit 611aa914ef
4 changed files with 458 additions and 293 deletions
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -2732,11 +2732,11 @@ void ggml_mul_mat_set_prec(
    c = ggml_mul_mat_id(ctx, as, b, ids);

    as  -> [cols, rows, n_expert]
-    ids -> [n_experts_used, n_tokens] (i32)
    b   -> [cols, n_expert_used, n_tokens]
+    ids -> [n_expert_used, n_tokens] (i32)
    c   -> [rows, n_expert_used, n_tokens]

-    in b, n_experts_used can be broadcasted to match the n_expert_used of ids
+    in b, n_expert_used can be broadcast to match the n_expert_used of ids

    c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids
 */