mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-11-15 11:17:31 +00:00
* Add parameter buffer pool, batching of submissions, refactor command building/submission
* Add header for linux builds
* Free staged parameter buffers at once
* Format with clang-format
* Fix thread-safe implementation
* Use device implicit synchronization
* Update workflow to use custom release
* Remove testing branch workflow
* some f32 tests passing
* Disable set_rows until it's implemented
* f32 add all tests passing
* Begin work on set_rows
* Work on set rows
* Add error buffers for reporting unsupported SET_ROWS indices
* Remove extra comments
* Add templated addition, clean up code
* Get addition and multiplication working
* Implement rms_norm
* Add get_rows implementation
* Add new get_rows files
* Refactor use of wg size entry
* Fix compilation
* Try manually unrolled q4_0 quant
* Revert "Try manually unrolled q4_0 quant"
This reverts commit 77f8b96515.
* Move to constant max wg size
* Check for tensor size in supports_op
* Vectorize f32 and change default workgroup size
* Move f32 get_rows from < 4 to % 4 != 0
* fix linter errors
* Add in-place tests
---------
Co-authored-by: Neha Abbas <nehaabbas@ReeseLevines-MacBook-Pro.local>
58 lines
1.4 KiB
WebGPU Shading Language
58 lines
1.4 KiB
WebGPU Shading Language
// Input elements. main only reads from this buffer, although it is bound
// with read_write access.
@group(0) @binding(0) var<storage, read_write> src: array<f32>;

// Output elements. main writes one normalized row per invocation here.
@group(0) @binding(1) var<storage, read_write> dst: array<f32>;
|
|
|
|
// Uniform parameter block for the shader. Field order and types define the
// buffer layout, so they must stay exactly as declared.
struct Params {
    // Index of the first element in src / dst (counted in elements, not bytes).
    offset_src: u32,
    offset_dst: u32,

    // Element strides of src for dimensions 1, 2 and 3.
    stride_src1: u32,
    stride_src2: u32,
    stride_src3: u32,

    // Element strides of dst for dimensions 1, 2 and 3.
    stride_dst1: u32,
    stride_dst2: u32,
    stride_dst3: u32,

    // Shape shared by src and dst. ne0 is the row length; ne1 * ne2 * ne3 is
    // the number of rows.
    ne0: u32,
    ne1: u32,
    ne2: u32,
    ne3: u32,

    // Epsilon carried as raw bits; main reinterprets it with bitcast<f32>.
    eps: u32
};
|
|
|
|
// Offsets, strides, shape and epsilon for the current dispatch.
@group(0) @binding(2) var<uniform> params: Params;
|
|
|
|
// Pipeline-overridable workgroup size. It has no default, so a value must be
// provided when the compute pipeline is created.
override wg_size: u32;

// RMS normalization along dimension 0.
//
// Each invocation handles one row of ne0 elements and writes
//     dst[row][j] = src[row][j] / sqrt(mean(src[row][*]^2) + eps)
// where eps is the f32 whose bit pattern is stored in params.eps.
@compute @workgroup_size(wg_size)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
    // Invocations beyond the last row have nothing to do.
    if (gid.x >= params.ne1 * params.ne2 * params.ne3) {
        return;
    }

    // One thread per row: split the flat row index into (i3, i2, i1).
    var i = gid.x;
    let i3 = i / (params.ne2 * params.ne1);
    i = i % (params.ne2 * params.ne1);
    let i2 = i / params.ne1;
    let i1 = i % params.ne1;

    let i_src_row = params.offset_src + i3 * params.stride_src3 + i2 * params.stride_src2 + i1 * params.stride_src1;
    // The destination row starts at the destination offset. Using offset_src
    // here would misplace the output whenever the two offsets differ.
    let i_dst_row = params.offset_dst + i3 * params.stride_dst3 + i2 * params.stride_dst2 + i1 * params.stride_dst1;

    // First pass: accumulate the sum of squares over the row.
    var sum = 0.0f;
    for (var j: u32 = 0; j < params.ne0; j++) {
        sum += src[i_src_row + j] * src[i_src_row + j];
    }

    // eps is passed as raw bits in a u32 field; reinterpret it as f32.
    let eps = bitcast<f32>(params.eps);
    let scale = 1.0/sqrt(sum/f32(params.ne0) + eps);

    // Second pass: write the scaled row.
    for (var j: u32 = 0; j < params.ne0; j++) {
        dst[i_dst_row + j] = scale * src[i_src_row + j];
    }
}
|