metal : optimize MoE for large batches (llama/13388)

2025-08-18 07:10:09 +02:00 · 2025-05-13 13:09:20 +03:00
parent 029c8837f8
commit 41ed62bdbc
4 changed files with 822 additions and 331 deletions
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -2732,11 +2732,11 @@ void ggml_mul_mat_set_prec(
    c = ggml_mul_mat_id(ctx, as, b, ids);

    as  -> [cols, rows, n_expert]
-    ids -> [n_experts_used, n_tokens] (i32)
    b   -> [cols, n_expert_used, n_tokens]
+    ids -> [n_expert_used, n_tokens] (i32)
    c   -> [rows, n_expert_used, n_tokens]

-    in b, n_experts_used can be broadcasted to match the n_expert_used of ids
+    in b, n_expert_used can be broadcast to match the n_expert_used of ids

    c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids
 */