From 9984cbb61d12491d604484fb38b678fa15064061 Mon Sep 17 00:00:00 2001 From: lhez Date: Thu, 30 Oct 2025 16:00:20 -0700 Subject: [PATCH] opencl: fix boundary handling for mul_mm (#16875) --- ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl | 6 +++--- ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl | 4 ++-- ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl b/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl index 1a1bfe144f..6982f8f514 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl @@ -79,8 +79,8 @@ kernel void kernel_mul_mm_f16_f32_l4_lm( for (int block = 0; block < ne00; block += BK) { for (int l = 0; l < BM; l += loadstride_a) { - if (loadc_a + l < ne01) { - const int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a; + if (ir*BM + loadc_a + l < ne01) { + const int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a; buf_a[(loadr_a * LOAD_VEC_A + 0) * BM + loadc_a + l] = src0[idx].s0; buf_a[(loadr_a * LOAD_VEC_A + 1) * BM + loadc_a + l] = src0[idx].s1; buf_a[(loadr_a * LOAD_VEC_A + 2) * BM + loadc_a + l] = src0[idx].s2; @@ -94,7 +94,7 @@ kernel void kernel_mul_mm_f16_f32_l4_lm( } for (int l = 0; l < BN; l += loadstride_b) { - if (loadc_b + l < ne11) { + if (ic*BN + loadc_b + l < ne11) { const int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b; buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0; buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1; diff --git a/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl b/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl index 39a5d4868f..d7d5ba647e 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl @@ -79,7 +79,7 @@ kernel void kernel_mul_mm_f32_f32_l4_lm( for (int block = 0; block < ne00; block += BK) { for (int l = 0; l < BM; l += loadstride_a) { - if (loadc_a + l < ne01) { + if (ir*BM + loadc_a + l < ne01) { const int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a; buf_a[(loadr_a * LOAD_VEC_A + 0) * BM + loadc_a + l] = src0[idx].s0; buf_a[(loadr_a * LOAD_VEC_A + 1) * BM + loadc_a + l] = src0[idx].s1; @@ -94,7 +94,7 @@ kernel void kernel_mul_mm_f32_f32_l4_lm( } for (int l = 0; l < BN; l += loadstride_b) { - if (loadc_b + l < ne11) { + if (ic*BN + loadc_b + l < ne11) { const int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b; buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0; buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1; diff --git a/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl b/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl index fd47e8a89d..147b66f669 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl @@ -78,7 +78,7 @@ kernel void kernel_mul_mm_q8_0_f32_l4_lm( for (int block = 0; block < ne00; block += BK) { for (int l = 0; l < BM; l += loadstride_a) { - if (loadc_a + l < ne01) { + if (ir*BM + loadc_a + l < ne01) { int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a; int ib = idx / 8; int iqs = idx % 8; @@ -101,7 +101,7 @@ kernel void kernel_mul_mm_q8_0_f32_l4_lm( } for (int l = 0; l < BN; l += loadstride_b) { - if (loadc_b + l < ne11) { + if (ic*BN + loadc_b + l < ne11) { int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b; buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0; buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1;