Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-27 08:21:30 +00:00)
* Add parameter buffer pool, batching of submissions, refactor command building/submission
* Add header for linux builds
* Free staged parameter buffers at once
* Format with clang-format
* Fix thread-safe implementation
* Use device implicit synchronization
* Update workflow to use custom release
* Remove testing branch workflow
* Some f32 tests passing
* Disable set_rows until it's implemented
* All f32 add tests passing
* Begin work on set_rows
* Work on set rows
* Add error buffers for reporting unsupported SET_ROWS indices
* Remove extra comments
* Add templated addition, clean up code
* Get addition and multiplication working
* Implement rms_norm
* Add get_rows implementation
* Add new get_rows files
* Refactor use of wg size entry
* Fix compilation
* Try manually unrolled q4_0 quant
* Revert "Try manually unrolled q4_0 quant"
This reverts commit 77f8b96515.
* Move to constant max wg size
* Check for tensor size in supports_op
* Vectorize f32 and change default workgroup size
* Move f32 get_rows from < 4 to % 4 != 0
* Fix linter errors
* Add in-place tests
---------
Co-authored-by: Neha Abbas <nehaabbas@ReeseLevines-MacBook-Pro.local>
struct Params {
    ne: u32,

    // offsets in elements
    offset_src0: u32,
    offset_src1: u32,
    offset_dst: u32,

    stride_src1_0: u32,
    stride_src1_1: u32,
    stride_src1_2: u32,
    stride_src1_3: u32,

    a_ne0: u32,
    a_ne1: u32,
    a_ne2: u32,

    b_ne0: u32,
    b_ne1: u32,
    b_ne2: u32,
    b_ne3: u32,
};

fn src1_index(_i: u32) -> u32 {
    var i = _i;
    let a_i3 = i / (params.a_ne2 * params.a_ne1 * params.a_ne0);
    i = i % (params.a_ne2 * params.a_ne1 * params.a_ne0);
    let a_i2 = i / (params.a_ne1 * params.a_ne0);
    i = i % (params.a_ne1 * params.a_ne0);
    let a_i1 = i / params.a_ne0;
    let a_i0 = i % params.a_ne0;

    // handle repetition of b
    // index loops back to the beginning and repeats after elements are exhausted = modulo
    let b_i0 = a_i0 % params.b_ne0;
    let b_i1 = a_i1 % params.b_ne1;
    let b_i2 = a_i2 % params.b_ne2;
    let b_i3 = a_i3 % params.b_ne3;

    // compute index for position in b's flat array
    return b_i0 * params.stride_src1_0 +
           b_i1 * params.stride_src1_1 +
           b_i2 * params.stride_src1_2 +
           b_i3 * params.stride_src1_3;
}
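For context, below is a minimal sketch of how a binary-op compute shader could consume this header. The binding layout (src0, src1, dst, params), the entry point name, and the workgroup size of 256 are assumptions made for illustration, not the repository's actual shader body. Each invocation handles one destination element: src0 is read contiguously, while src1 is read through src1_index so that a smaller src1 wraps around (broadcasts) over src0.

// Illustrative sketch only: the binding layout, entry point and workgroup
// size are assumed for this example, not taken from the repository.
@group(0) @binding(0) var<storage, read> src0: array<f32>;
@group(0) @binding(1) var<storage, read> src1: array<f32>;
@group(0) @binding(2) var<storage, read_write> dst: array<f32>;
@group(0) @binding(3) var<uniform> params: Params;

@compute @workgroup_size(256)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
    // one invocation per destination element
    if (gid.x < params.ne) {
        dst[params.offset_dst + gid.x] =
            src0[params.offset_src0 + gid.x] +
            src1[params.offset_src1 + src1_index(gid.x)];
    }
}

The per-dimension modulo in src1_index is what keeps broadcasting branch-free: when b_neX equals a_neX the modulo is a no-op, and when b_neX is 1 every index in that dimension maps back to element 0 of src1.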