CUDA: Add fastdiv to k_bin_bcast*, giving 1-3% E2E performance (#15872)

* Add fastdiv and fastmodulo to k_bin_bcast kernel
* Address review comments
* `prod_` instead of `prod` suffix
* Add test case for `k_bin_bcast_unravel` in CUDA backend
2025-10-27 08:21:30 +00:00 · 2025-09-10 22:04:03 +02:00
parent 4f658855fa
commit 00681dfc16
2 changed files with 111 additions and 74 deletions
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -6050,6 +6050,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
        add_test_bin_bcast(type, {10, 5, 4, 3}, {1, 2, 2, 2});
        add_test_bin_bcast(type, {10, 5, 4, 3}, {2, 2, 2, 2});

+        // test case for k_bin_bcast_unravel in CUDA backend
+        add_test_bin_bcast(type, {1, 1, 65536, 1}, {256, 1, 1, 1});
+
        // stable diffusion
        add_test_bin_bcast(type, {1280, 1, 1, 1}, {1, 1, 1, 1});
        add_test_bin_bcast(type, {1280, 1, 1, 1}, {1, 16, 16, 1});