vulkan: vec dot matrix multiplication fix (#16151)

* vulkan: fix matrix multiplication index calculation for odd m/n and odd k in combination with batching * add odd m/n + odd k test with batching
2025-11-20 12:07:33 +00:00 · 2025-09-22 07:22:43 +02:00
parent 51f5a45fbe
commit 9073a73d82
4 changed files with 31 additions and 18 deletions
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp
@@ -14,8 +14,8 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
            FLOAT_TYPE_VEC4 aa = FLOAT_TYPE_VEC4(data_a[idx]);
            buf_a[buf_idx    ] = aa.xy;
            buf_a[buf_idx + 1] = aa.zw;
-#else // LOAD_VEC_A == 2
-            const uint idx = pos_a * 2 + col * p.stride_a + row * 2;
+#else // LOAD_VEC_BATCH_A == 2
+            const uint idx = pos_a + col * p.stride_a + row * 2;
            const uint buf_idx = col * SHMEM_STRIDE + row;
            if (idx_m < p.M && block + row * 2 + 1 < end_k) {
                buf_a[buf_idx] = FLOAT_TYPE_VEC2(data_a[idx],
@@ -33,8 +33,8 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
            FLOAT_TYPE_VEC4 aa = FLOAT_TYPE_VEC4(TO_FLOAT_TYPE(data_a[idx]));
            buf_a[buf_idx    ] = aa.xy;
            buf_a[buf_idx + 1] = aa.zw;
-#else // LOAD_VEC_A == 2
-            const uint idx = pos_a * 2 + col * p.stride_a + row * 2;
+#else // LOAD_VEC_BATCH_A == 2
+            const uint idx = pos_a + col * p.stride_a + row * 2;
            const uint buf_idx = col * SHMEM_STRIDE + row;
            if (idx_m < p.M && block + row * 2 + 1 < end_k) {
                buf_a[buf_idx] = FLOAT_TYPE_VEC2(TO_FLOAT_TYPE(data_a[idx]),
@@ -500,8 +500,8 @@ void load_b_to_shmem(const uint pos_b, const uint row, const uint col, const uin
 #endif
            buf_b[buf_idx + 0] = bb.xy;
            buf_b[buf_idx + 1] = bb.zw;
-#else // LOAD_VEC_B == 2
-            const uint idx = pos_b * 2 + col * p.stride_b + row * 2;
+#else // LOAD_VEC_BATCH_B == 2
+            const uint idx = pos_b + col * p.stride_b + row * 2;
            const uint buf_idx = col * SHMEM_STRIDE + row;
            if (idx_n < p.N && block + row * 2 + 1 < end_k) {
                buf_b[buf_idx] = FLOAT_TYPE_VEC2(TO_FLOAT_TYPE(data_b[idx]),
@@ -536,17 +536,17 @@ void load_b_to_shmem(const uint pos_b, const uint row, const uint col, const uin
 #endif
            buf_b[buf_idx + 0] = bb.xy;
            buf_b[buf_idx + 1] = bb.zw;
-#else // LOAD_VEC_B == 2
+#else // LOAD_VEC_BATCH_B == 2
            const uint row_i = ic * BN + col;
            const uint buf_idx = col * SHMEM_STRIDE + row;
            if (row_i < _ne1 && block + row * 2 + 1 < end_k) {
                const u16vec2 row_idx = row_ids[col];
-                const uint idx = pos_b * 2 + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + row * 2;
+                const uint idx = pos_b + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + row * 2;
                buf_b[buf_idx] = FLOAT_TYPE_VEC2(TO_FLOAT_TYPE(data_b[idx]),
                                                 TO_FLOAT_TYPE(data_b[idx + 1]));
            } else if (row_i < _ne1 && block + row * 2 < end_k) {
                const u16vec2 row_idx = row_ids[col];
-                const uint idx = pos_b * 2 + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + row * 2;
+                const uint idx = pos_b + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + row * 2;
                buf_b[buf_idx] = FLOAT_TYPE_VEC2(TO_FLOAT_TYPE(data_b[idx]), 0.0f);
            } else {
                buf_b[buf_idx] = FLOAT_TYPE_VEC2(0.0f);