From 21db0b598af4c4005ee89e9e95d802e57a92c767 Mon Sep 17 00:00:00 2001
From: Gabe Goodhart <ghart@us.ibm.com>
Date: Fri, 18 Jul 2025 11:05:31 -0600
Subject: [PATCH] fix: Correctly size the shared memory buffer and assert
 expected size relationships

Branch: GraniteFourPerf

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
---
 ggml/src/ggml-metal/ggml-metal.m | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index de7d33046f..d515ec0a32 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -3017,7 +3017,15 @@ static bool ggml_metal_encode_node(
 
                 if (ne30 == 1) {
                     // Mamba-2
-                    [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; // SIMD size
+
+                    // One shared memory bucket for each simd group in the threadgroup
+                    const int64_t shmem_size = d_state / 32;
+                    GGML_ASSERT(shmem_size * 32 == d_state);
+
+                    // One thread per element in d_state
+                    GGML_ASSERT(d_state <= (int64_t)pipeline.maxTotalThreadsPerThreadgroup);
+
+                    [encoder setThreadgroupMemoryLength:(shmem_size)*sizeof(float) atIndex:0];
                     [encoder dispatchThreadgroups:MTLSizeMake(d_inner, n_head, n_seqs) threadsPerThreadgroup:MTLSizeMake(d_state, 1, 1)];
                 } else {
                     GGML_ASSERT(d_inner == 1);