Mirror of https://github.com/ggml-org/llama.cpp.git
ggml webgpu: actually add softmax, fix rms_norm offset (#16400)
* implement soft_max
* Fix soft_max data race
* Temporary fix, wait on each submit
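The third bullet is the data-race fix: command buffers are batched before submission (see the first hunk below), and the commit message suggests that without a wait, a later batch could start while an earlier one is still using shared state. Below is a minimal, hedged sketch of that batch-then-wait pattern; the names here (stub_context, submit_queue, wait_on_submission, SUBMIT_BATCH_SIZE) are hypothetical stand-ins for the real webgpu_context machinery, not the actual API.

// Hedged sketch of the batching pattern the commit message describes.
// All types and helpers are hypothetical stand-ins: command "buffers" are
// staged until a batch fills up, then submitted, and -- the temporary fix --
// the CPU blocks until that submission completes, so two in-flight batches
// can never race on shared state.
#include <cstdio>
#include <vector>

constexpr size_t SUBMIT_BATCH_SIZE = 8; // stand-in for WEBGPU_COMMAND_SUBMIT_BATCH_SIZE

struct stub_context {
    std::vector<int> staged_command_bufs; // placeholder for staged GPU command buffers
};

static void submit_queue(stub_context & ctx) {
    // The real code hands the staged buffers to the GPU queue; here we just drain them.
    std::printf("submitting %zu command buffers\n", ctx.staged_command_bufs.size());
    ctx.staged_command_bufs.clear();
}

static void wait_on_submission(stub_context & /*ctx*/) {
    // The real code blocks until the queue signals that submitted work is done.
    std::printf("waiting for GPU to finish\n");
}

static void build_and_enqueue(stub_context & ctx, int commands) {
    ctx.staged_command_bufs.push_back(commands);
    if (ctx.staged_command_bufs.size() == SUBMIT_BATCH_SIZE) {
        submit_queue(ctx);
        wait_on_submission(ctx); // the temporary fix: serialize batches
    }
}

int main() {
    stub_context ctx;
    for (int i = 0; i < 20; ++i) {
        build_and_enqueue(ctx, i);
    }
    if (!ctx.staged_command_bufs.empty()) {
        submit_queue(ctx); // flush the partial tail batch
        wait_on_submission(ctx);
    }
}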
@@ -424,6 +424,7 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context &
         ctx->staged_param_bufs.push_back(params_bufs);
         if (ctx->staged_command_bufs.size() == WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) {
             ggml_backend_webgpu_submit_queue(ctx);
+            ggml_backend_webgpu_wait_on_submission(ctx);
         }
     }
 }
@@ -1060,6 +1061,9 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) {
         case GGML_OP_SCALE:
             ggml_webgpu_scale(ctx, src0, node);
             break;
+        case GGML_OP_SOFT_MAX:
+            ggml_webgpu_soft_max(ctx, src0, src1, src2, node);
+            break;
         default:
             return false;
     }
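For context, ggml_webgpu_encode_node routes each graph node to a kernel launcher through this switch, and returns false for anything it cannot handle. A minimal sketch of that dispatch shape follows, with hypothetical stand-ins (stub_op, launch_*) rather than the real ggml types and helpers.

// Hedged sketch of the dispatch pattern in ggml_webgpu_encode_node, using
// hypothetical stand-ins for the ggml op enum and kernel helpers: each
// supported op routes to its launcher, and unknown ops fall through to
// `return false` to signal that the backend cannot run this node.
#include <cstdio>

enum stub_op { OP_SCALE, OP_SOFT_MAX, OP_UNKNOWN };

static void launch_scale()    { std::printf("dispatch scale kernel\n"); }
static void launch_soft_max() { std::printf("dispatch soft_max kernel\n"); }

static bool encode_node(stub_op op) {
    switch (op) {
        case OP_SCALE:
            launch_scale();
            break;
        case OP_SOFT_MAX:
            launch_soft_max();
            break;
        default:
            return false; // unsupported op
    }
    return true;
}

int main() {
    std::printf("soft_max handled: %d\n", encode_node(OP_SOFT_MAX));
    std::printf("unknown handled:  %d\n", encode_node(OP_UNKNOWN));
}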
@@ -1806,6 +1810,9 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
         case GGML_OP_SCALE:
             supports_op = op->type == GGML_TYPE_F32;
             break;
+        case GGML_OP_SOFT_MAX:
+            supports_op = op->type == GGML_TYPE_F32;
+            break;
         default:
             break;
     }
@@ -1949,6 +1956,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
     ggml_webgpu_init_rope_pipeline(ctx);
     ggml_webgpu_init_glu_pipeline(ctx);
     ggml_webgpu_init_scale_pipeline(ctx);
+    ggml_webgpu_init_soft_max_pipeline(ctx);
 
 #ifdef GGML_WEBGPU_DEBUG
     // Initialize debug buffers
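This last hunk registers the soft_max compute pipeline during device setup, next to the existing rope/glu/scale pipelines; without it, the dispatch added above would run against a pipeline that was never created. A hedged sketch of why the init call matters, again with hypothetical stand-ins (stub_pipelines, init_*_pipeline) rather than the real webgpu_context:

// Hedged sketch with hypothetical stand-ins: pipeline handles start null and
// each init_*_pipeline call fills one in at device creation. Dispatching an
// op whose init call is missing would hit a null handle, which is what the
// added ggml_webgpu_init_soft_max_pipeline(ctx) line prevents.
#include <cassert>
#include <cstdio>

struct stub_pipelines {
    void * scale    = nullptr; // placeholder for a compiled compute pipeline
    void * soft_max = nullptr;
};

static void init_scale_pipeline(stub_pipelines & p)    { p.scale    = (void *) 0x1; }
static void init_soft_max_pipeline(stub_pipelines & p) { p.soft_max = (void *) 0x2; }

int main() {
    stub_pipelines p;
    init_scale_pipeline(p);
    init_soft_max_pipeline(p); // analogous to the line this hunk adds
    assert(p.scale && p.soft_max); // a null pipeline here would fail at dispatch time
    std::printf("all pipelines initialized\n");
}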