mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-11-05 09:36:52 +00:00
feat: Use a secondary simd_sum instead of a for loop
Branch: GraniteFourPerf
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
This commit is contained in:
@@ -1840,15 +1840,18 @@ kernel void kernel_ssm_scan_f32_group(
|
|||||||
// sum of the individual simd groups.
|
// sum of the individual simd groups.
|
||||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
|
|
||||||
// Sum the simd buckets => threadgroup sum
|
// For simd group 0 at indices < num simd groups, extract the shared
|
||||||
|
// simd sum
|
||||||
sumf = 0.0f;
|
sumf = 0.0f;
|
||||||
for (int64_t i0 = 0; i0 < sgptg; ++i0) {
|
if (sgitg == 0) {
|
||||||
sumf += shared[i0];
|
if (tiisg < sgptg) {
|
||||||
|
sumf = shared[tiisg];
|
||||||
}
|
}
|
||||||
|
sumf = simd_sum(sumf);
|
||||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
if (tiisg == 0) {
|
||||||
|
|
||||||
y[0] = sumf;
|
y[0] = sumf;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// recurse
|
// recurse
|
||||||
s0 = s;
|
s0 = s;
|
||||||
|
|||||||
Reference in New Issue
Block a user