From 03ea04175da3cdd7fd8cc1835f33490e3483928a Mon Sep 17 00:00:00 2001
From: Reese Levine
Date: Wed, 5 Nov 2025 01:27:42 -0800
Subject: [PATCH] ggml webgpu: minor set rows optimization (#16810)

* Add buffer label and enable dawn-specific toggles to turn off some checks

* Minor set_rows optimization (#4)

* updated optimization, fixed errors

* non vectorized version now dispatches one thread per element

* Simplify

* Change logic for set_rows pipelines

---------

Co-authored-by: Neha Abbas
Co-authored-by: Neha Abbas
Co-authored-by: Reese Levine

* Comment on dawn toggles

* Remove some comments

* Implement overlap binary operators

* Revert "Implement overlap binary operators"

This reverts commit ed710b36f51ab3f53fa13db15c1685dc8678a32a.

* Disable support for non-contiguous binary_op tensors and leave note for future support

---------

Co-authored-by: neha-ha <137219201+neha-ha@users.noreply.github.com>
Co-authored-by: Neha Abbas
Co-authored-by: Neha Abbas
---
 ggml/src/ggml-webgpu/ggml-webgpu.cpp          | 88 ++++++++++++++-----
 .../{set_rows.wgsl => set_rows.tmpl.wgsl}     | 45 ++++++++--
 2 files changed, 103 insertions(+), 30 deletions(-)
 rename ggml/src/ggml-webgpu/wgsl-shaders/{set_rows.wgsl => set_rows.tmpl.wgsl} (69%)

diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index 05e16cd432..1a15756731 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -248,7 +248,7 @@ struct webgpu_context_struct {
     webgpu_pipeline memset_pipeline;
     webgpu_pipeline mul_mat_pipeline[30][2];
-    webgpu_pipeline set_rows_pipeline;
+    webgpu_pipeline set_rows_pipeline[1][2];  // dst->type, vectorized
     webgpu_pipeline get_rows_pipeline[30];
     webgpu_pipeline get_rows_f32_no_vec_pipeline;
     webgpu_pipeline cpy_pipeline[2][2];  // src type, dst type
@@ -309,10 +309,12 @@ struct ggml_backend_webgpu_context {
 struct ggml_backend_webgpu_buffer_context {
     webgpu_context webgpu_ctx;
     wgpu::Buffer   buffer;
+    std::string    label;
 
-    ggml_backend_webgpu_buffer_context(webgpu_context ctx, wgpu::Buffer buf) :
+    ggml_backend_webgpu_buffer_context(webgpu_context ctx, wgpu::Buffer buf, std::string lbl) :
         webgpu_ctx(std::move(ctx)),
-        buffer(std::move(buf)) {}
+        buffer(std::move(buf)),
+        label(std::move(lbl)) {}
 };
 
 /* End struct definitions */
@@ -764,10 +766,20 @@ static std::optional<webgpu_command> ggml_webgpu_set_rows(webgpu_context & ctx,
         { .binding = 3, .buffer = error_bufs.dev_buf, .offset = 0, .size = error_bufs.dev_buf.GetSize() }
     };
 
-    size_t   max_wg_size = ctx->max_wg_size_x;
-    uint32_t wg_x        = (src->ne[1] * src->ne[2] * src->ne[3] + max_wg_size - 1) / max_wg_size;
+    size_t max_wg_size = ctx->max_wg_size_x;
 
-    return ggml_backend_webgpu_build(ctx, ctx->set_rows_pipeline, params, entries, wg_x, error_bufs);
+    int             vectorized = src->ne[0] % 4 == 0;
+    webgpu_pipeline pipeline   = ctx->set_rows_pipeline[0][vectorized];
+    uint32_t        threads;
+    if (vectorized) {
+        threads = (src->ne[1] * src->ne[2] * src->ne[3]) * (src->ne[0] / 4);
+    } else {
+        threads = src->ne[0] * src->ne[1] * src->ne[2] * src->ne[3];
+    }
+
+    uint32_t wg_x = (threads + max_wg_size - 1) / max_wg_size;
+
+    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, error_bufs);
 }
 
 static webgpu_command ggml_webgpu_get_rows(webgpu_context & ctx,
@@ -1336,11 +1348,11 @@ static void ggml_backend_webgpu_buffer_memset_tensor(ggml_backend_buffer_t buffe
 
     WEBGPU_CPU_PROFILE_TOTAL_START(memset_tensor);
 
-    WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor(" << buffer << ", " << tensor << ", " << value << ", "
-                     << offset << ", " << size << ")");
", " << size << ")"); - ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor(" << buf_ctx->label << ", " << tensor << ", " << value + << ", " << offset << ", " << size << ")"); + size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset; // This is a trick to set all bytes of a u32 to the same 1 byte value. @@ -1354,12 +1366,13 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, const void * data, size_t offset, size_t size) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " - << offset << ", " << size << ")"); WEBGPU_CPU_PROFILE_TOTAL_START(set_tensor); ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; webgpu_context webgpu_ctx = buf_ctx->webgpu_ctx; + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_set_tensor(" << buf_ctx->label << ", " << tensor << ", " << data + << ", " << offset << ", " << size << ")"); + size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset; webgpu_ctx->queue.WriteBuffer(buf_ctx->buffer, total_offset, data, (size / 4) * 4); @@ -1397,12 +1410,12 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, void * data, size_t offset, size_t size) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " - << offset << ", " << size << ")"); WEBGPU_CPU_PROFILE_TOTAL_START(get_tensor); - ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; - webgpu_context webgpu_ctx = buf_ctx->webgpu_ctx; - wgpu::Device device = webgpu_ctx->device; + ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_get_tensor(" << buf_ctx->label << ", " << tensor << ", " << data + << ", " << offset << ", " << size << ")"); + webgpu_context webgpu_ctx = buf_ctx->webgpu_ctx; + wgpu::Device device = webgpu_ctx->device; size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset; @@ -1473,16 +1486,20 @@ static const char * ggml_backend_webgpu_buffer_type_get_name(ggml_backend_buffer static ggml_backend_buffer_t ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_type_alloc_buffer(" << size << ")"); + static std::atomic buffer_count; + int buffer_id = buffer_count++; + std::string buf_name = "tensor_buf" + std::to_string(buffer_id); + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_type_alloc_buffer_" << buffer_id << ": " << size << " bytes"); ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); wgpu::Buffer buf; ggml_webgpu_create_buffer(ctx->webgpu_ctx->device, buf, (size + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1), wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst, - "allocated_buffer"); + buf_name.c_str()); - ggml_backend_webgpu_buffer_context * buf_ctx = new ggml_backend_webgpu_buffer_context(ctx->webgpu_ctx, buf); + ggml_backend_webgpu_buffer_context * buf_ctx = + new ggml_backend_webgpu_buffer_context(ctx->webgpu_ctx, buf, buf_name); return ggml_backend_buffer_init(buft, ggml_backend_webgpu_buffer_interface, buf_ctx, size); } @@ -1613,8 +1630,10 @@ static void 
@@ -1613,8 +1630,10 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
 }
 
 static void ggml_webgpu_init_set_rows_pipeline(webgpu_context & webgpu_ctx) {
-    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->set_rows_pipeline, wgsl_set_rows, "set_rows",
-                                ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x));
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->set_rows_pipeline[0][0], wgsl_set_rows_f16,
+                                "set_rows_f16", ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x));
+    ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->set_rows_pipeline[0][1], wgsl_set_rows_f16_vec,
+                                "set_rows_f16_vec", ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x));
 }
 
 static void ggml_webgpu_init_get_rows_pipeline(webgpu_context & webgpu_ctx) {
@@ -1950,8 +1969,10 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
         case GGML_OP_SUB:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
+            // TODO: support non-contiguous tensors, e.g. for MOE_EXPERT_REDUCE
+            // see https://github.com/ggml-org/llama.cpp/pull/16857
             supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (src0->type == op->type) &&
-                          (src1->type == op->type);
+                          (src1->type == op->type) && ggml_is_contiguous(src0) && ggml_is_contiguous(src1);
             break;
         case GGML_OP_CPY:
         case GGML_OP_CONT:
@@ -2129,6 +2150,19 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
         required_features.push_back(wgpu::FeatureName::TimestampQuery);
 #endif
 
+    // Enable Dawn-specific toggles to increase native performance
+    // TODO: Don't enable for WASM builds, they won't have an effect anyways
+    // TODO: Maybe WebGPU needs a "fast" mode where you can request compilers skip adding checks like these,
+    // only for native performance?
+    const char * const deviceEnabledToggles[]  = { "skip_validation", "disable_robustness", "disable_workgroup_init",
+                                                   "disable_polyfills_on_integer_div_and_mod" };
+    const char * const deviceDisabledToggles[] = { "timestamp_quantization" };
+    wgpu::DawnTogglesDescriptor deviceTogglesDesc;
+    deviceTogglesDesc.enabledToggles      = deviceEnabledToggles;
+    deviceTogglesDesc.enabledToggleCount  = 4;
+    deviceTogglesDesc.disabledToggles     = deviceDisabledToggles;
+    deviceTogglesDesc.disabledToggleCount = 1;
+
     wgpu::DeviceDescriptor dev_desc;
     dev_desc.requiredLimits   = &ctx->limits;
     dev_desc.requiredFeatures = required_features.data();
@@ -2146,6 +2180,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
             GGML_ABORT("ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast<int>(reason),
                       std::string(message).c_str());
         });
+    dev_desc.nextInChain = &deviceTogglesDesc;
     ctx->instance.WaitAny(ctx->adapter.RequestDevice(
                               &dev_desc, wgpu::CallbackMode::AllowSpontaneous,
                               [ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) {
@@ -2243,11 +2278,18 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() {
     ctx.name         = GGML_WEBGPU_NAME;
     ctx.device_count = 1;
 
+    const char * const instanceEnabledToggles[] = { "allow_unsafe_apis" };
+
+    wgpu::DawnTogglesDescriptor instanceTogglesDesc;
+    instanceTogglesDesc.enabledToggles     = instanceEnabledToggles;
+    instanceTogglesDesc.enabledToggleCount = 1;
     wgpu::InstanceDescriptor instance_descriptor{};
     std::vector<wgpu::InstanceFeatureName> instance_features = { wgpu::InstanceFeatureName::TimedWaitAny };
     instance_descriptor.requiredFeatures     = instance_features.data();
     instance_descriptor.requiredFeatureCount = instance_features.size();
-    webgpu_ctx->instance = wgpu::CreateInstance(&instance_descriptor);
+    instance_descriptor.nextInChain          = &instanceTogglesDesc;
+
+    webgpu_ctx->instance = wgpu::CreateInstance(&instance_descriptor);
 
     GGML_ASSERT(webgpu_ctx->instance != nullptr);
 
     static ggml_backend_reg reg = {
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl
similarity index 69%
rename from ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl
rename to ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl
index 3567713dc2..fca3be6bc2 100644
--- a/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl
@@ -1,13 +1,38 @@
+#define(VARIANTS)
+
+[
+  {
+    "SHADER_SUFFIX": "f16_vec",
+    "REPLS": {
+      "TYPE" : "vec4<f32>",
+      "DST_TYPE": "vec4<f16>",
+      "VEC_SIZE": 4
+    }
+  },
+  {
+    "SHADER_SUFFIX": "f16",
+    "REPLS": {
+      "TYPE" : "f32",
+      "DST_TYPE": "f16",
+      "VEC_SIZE": 1
+    }
+  }
+]
+
+#end(VARIANTS)
+
+#define(SHADER)
+
 enable f16;
 
 @group(0) @binding(0)
-var<storage, read_write> src: array<f32>;
+var<storage, read_write> src: array<{{TYPE}}>;
 
 @group(0) @binding(1)
 var<storage, read_write> idx: array<u32>;
 
 @group(0) @binding(2)
-var<storage, read_write> dst: array<f16>;
+var<storage, read_write> dst: array<{{DST_TYPE}}>;
 
 @group(0) @binding(3)
 var<storage, read_write> error: atomic<u32>;
@@ -47,10 +72,14 @@ var<uniform> params: Params;
 override wg_size: u32;
 @compute @workgroup_size(wg_size)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
-    if (gid.x >= params.n_rows * params.ne2 * params.ne3) {
+    if (gid.x >= (params.ne3 * params.ne2 * params.n_rows * params.ne0) / {{VEC_SIZE}}) {
         return;
     }
-    var i = gid.x;
+
+    // getting the row from gid
+    let elems_per_row = params.ne0 / {{VEC_SIZE}};
+    var i = gid.x / elems_per_row;
+
     let i_src3 = i / (params.ne2 * params.n_rows);
     i = i % (params.ne2 * params.n_rows);
@@ -75,7 +104,9 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
     let i_dst_row = params.offset_dst + idx_high_val * params.stride_dst1 + i_src2 * params.stride_dst2 + i_src3 * params.stride_dst3;
     let i_src_row = params.offset_src + i_src1 * params.stride_src1 + i_src2 * params.stride_src2 + i_src3 * params.stride_src3;
 
-    for (var i: u32 = 0; i < params.ne0; i++) {
-        dst[i_dst_row + i] = f16(src[i_src_row + i]);
-    }
+    let col_idx = (gid.x % elems_per_row);
+    dst[i_dst_row/{{VEC_SIZE}} + col_idx] = {{DST_TYPE}}(src[i_src_row/{{VEC_SIZE}} + col_idx]);
 }
+
+#end(SHADER)
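
Note on the new dispatch math: the scalar set_rows shader now launches one thread per
destination element, and the vectorized variant one thread per vec4, so each thread
recovers its row and column from the flat gid.x. The sketch below is illustrative only
(plain C++, not part of the patch; the helper names are invented for this example) and
mirrors the index split the WGSL above performs, assuming ne0 is the row length:

    #include <cassert>
    #include <cstdint>

    // Mirrors the shader: elems_per_row = ne0 / VEC_SIZE, row = gid.x / elems_per_row,
    // col = gid.x % elems_per_row (col is in units of VEC_SIZE elements).
    struct set_rows_index {
        uint32_t row;  // flattened (i1, i2, i3) row index
        uint32_t col;  // column within the row, in VEC_SIZE units
    };

    static set_rows_index split_gid(uint32_t gid_x, uint32_t ne0, uint32_t vec_size) {
        assert(ne0 % vec_size == 0);  // vectorized path requires ne0 % 4 == 0
        uint32_t elems_per_row = ne0 / vec_size;
        return { gid_x / elems_per_row, gid_x % elems_per_row };
    }

    // Host-side thread count, matching the logic added in ggml_webgpu_set_rows:
    // vectorized: (ne1 * ne2 * ne3) * (ne0 / 4) threads; scalar: ne0 * ne1 * ne2 * ne3.
    static uint32_t set_rows_thread_count(const uint32_t ne[4], bool vectorized) {
        uint32_t rows = ne[1] * ne[2] * ne[3];
        return vectorized ? rows * (ne[0] / 4) : rows * ne[0];
    }

The workgroup count is then (threads + max_wg_size - 1) / max_wg_size, a ceiling
division by the workgroup size, which is exactly the wg_x computed in
ggml_webgpu_set_rows above.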