ggml: fix CUDA grid launch condition for large block_nums.y in binbcast (#16742)

* Fix CUDA grid launch condition for large block_nums.y

* add backend ops test

* reduce test repetitions
leejet
2025-10-25 03:39:37 +08:00
committed by GitHub
parent 0bcb40b48c
commit 55945d2ef5
2 changed files with 2 additions and 1 deletion


@@ -272,7 +272,7 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
     const uint3 ne12 = init_fastdiv_values((uint32_t) cne1[2]);
     const uint3 ne13 = init_fastdiv_values((uint32_t) cne1[3]);
-    if (block_nums.z > 65535) {
+    if (block_nums.z > 65535 || block_nums.y > 65535) {
         int block_num = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size;
         const uint3 prod_012 = init_fastdiv_values((uint32_t) (ne0 * ne1 * ne2));
         const uint3 prod_01 = init_fastdiv_values((uint32_t) (ne0 * ne1));
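
For context: CUDA caps gridDim.y and gridDim.z at 65535 (only gridDim.x may be larger), so a 3D launch whose y dimension exceeds that limit is invalid. The hunk above extends the existing fallback check so that an oversized y dimension also takes the flat 1D launch path. The following is a minimal sketch of that pattern; the kernel name bcast_flat and helper launch_bcast are illustrative only, and the real ggml code uses its fastdiv helpers instead of plain / and %.

// Sketch of a 1D fallback launch when the 3D grid would exceed CUDA's
// 65535 limit on gridDim.y / gridDim.z. Names are hypothetical.
#include <cstdint>

__global__ void bcast_flat(float * dst, const float * src,
                           int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
    const int64_t i     = (int64_t) blockIdx.x * blockDim.x + threadIdx.x;
    const int64_t total = ne0 * ne1 * ne2 * ne3;
    if (i >= total) {
        return;
    }
    // recover the 4D coordinates from the flat index
    const int64_t i3 =  i / (ne0 * ne1 * ne2);
    const int64_t i2 = (i / (ne0 * ne1)) % ne2;
    const int64_t i1 = (i /  ne0)        % ne1;
    const int64_t i0 =  i                % ne0;
    (void) i0; (void) i1; (void) i2; (void) i3; // placeholder: real kernel applies the broadcast op here
    dst[i] = src[i];
}

static void launch_bcast(float * dst, const float * src,
                         int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3,
                         cudaStream_t stream) {
    const int block_size = 256;
    const dim3 block_nums((unsigned) ((ne0 * ne1 + block_size - 1) / block_size),
                          (unsigned) ne2,
                          (unsigned) ne3);
    // gridDim.y and gridDim.z are limited to 65535, so fall back to a flat
    // 1D launch when either dimension would overflow that limit.
    if (block_nums.z > 65535 || block_nums.y > 65535) {
        const int64_t total     = ne0 * ne1 * ne2 * ne3;
        const int64_t block_num = (total + block_size - 1) / block_size;
        bcast_flat<<<(unsigned) block_num, block_size, 0, stream>>>(dst, src, ne0, ne1, ne2, ne3);
    } else {
        // regular 3D launch path (omitted)
    }
}

Before this commit the condition only tested block_nums.z, so a tensor shape producing a y grid dimension above 65535 could request an invalid launch; adding the block_nums.y check routes those shapes to the flat kernel as well.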