Files
llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl
Reese Levine d304f459d8 GGML WebGPU: Support for ADD, MUL, RMS_NORM, GET_ROWS operators (#16018)
* Add paramater buffer pool, batching of submissions, refactor command building/submission

* Add header for linux builds

* Free staged parameter buffers at once

* Format with clang-format

* Fix thread-safe implementation

* Use device implicit synchronization

* Update workflow to use custom release

* Remove testing branch workflow

* some f32 tests passing

* Disable set_rows until it's implemented

* f32 add all tests passing

* Begin work on set_rows

* Work on set rows

* Add error buffers for reporting unsupported SET_ROWS indices

* Remove extra comments

* Add templated addition, clean up code

* Get addition and multiplication working

* Implement rms_norm

* Add get_rows implementation

* Add new get_rows files

* Refactor use of wg size entry

* Fix compilation

* Try manually unrolled q4_0 quant

* Revert "Try manually unrolled q4_0 quant"

This reverts commit 77f8b96515.

* Move to constant max wg size

* Check for tensor size in supports_op

* Vectorize f32 and change default workgroup size

* Move f32 get_rows from < 4 to % 4 != 0

* fix linter errors

* Add in-place tests

---------

Co-authored-by: Neha Abbas <nehaabbas@ReeseLevines-MacBook-Pro.local>
2025-09-17 13:09:40 -07:00

46 lines
1.1 KiB
Cheetah

struct Params {
ne: u32,
// offsets in elements
offset_src0: u32,
offset_src1: u32,
offset_dst: u32,
stride_src1_0: u32,
stride_src1_1: u32,
stride_src1_2: u32,
stride_src1_3: u32,
a_ne0: u32,
a_ne1: u32,
a_ne2: u32,
b_ne0: u32,
b_ne1: u32,
b_ne2: u32,
b_ne3: u32,
};
fn src1_index(_i: u32) -> u32 {
var i = _i;
let a_i3 = i / (params.a_ne2 * params.a_ne1 * params.a_ne0);
i = i % (params.a_ne2 * params.a_ne1 * params.a_ne0);
let a_i2 = i / (params.a_ne1 * params.a_ne0);
i = i % (params.a_ne1 * params.a_ne0);
let a_i1 = i / params.a_ne0;
let a_i0 = i % params.a_ne0;
// handle repetition of b
// index loops back to the beginning and repeats after elements are exhausted = modulo
let b_i0 = a_i0 % params.b_ne0;
let b_i1 = a_i1 % params.b_ne1;
let b_i2 = a_i2 % params.b_ne2;
let b_i3 = a_i3 % params.b_ne3;
// compute index for position in b's flat array
return b_i0 * params.stride_src1_0 +
b_i1 * params.stride_src1_1 +
b_i2 * params.stride_src1_2 +
b_i3 * params.stride_src1_3;
}