From a80ff183abe4e5a76316257ffa597da41b3b6fa0 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Mon, 6 Oct 2025 14:17:12 +0200 Subject: [PATCH] ggml-cpu : fix leftover handling in ggml_vec_scale_f32 for SVE (#16443) This commit updates the leftover handling in ggml_vec_scale_f32. The motivation for this is that the code currently incorrectly assumes there would be fewer than ggml_f32_epr leftover elements. However, since the main loop processes 2*ggml_f32_epr elements per iteration , there can be up to (2*ggml_f32_epr - 1) leftover elements. The original single-pass leftover code could only process ggml_f32_epr elements, leaving some elements unscaled. Example scenario with 256-bit SVE: ``` ggml_f32_epr = 8 (elements per register) ggml_f32_step = 16 (two registers per iteration) n = 25 np = 16 leftovers = 9 elements (16-24) Original : processes only elements 16-23, misses element 24 This commit : loop processes elements 16-23, then element 24 ``` Refs: https://github.com/ggml-org/llama.cpp/actions/runs/18070620247/job/51419855630 --- ggml/src/ggml-cpu/vec.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index 341e64e64f..f95ca94e54 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -654,11 +654,11 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { } // leftovers // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only - if (np < n) { - svbool_t pg = svwhilelt_b32(np, n); - ay1 = svld1_f32(pg, y + np); + for (int i = np; i < n; i += ggml_f32_epr) { + svbool_t pg = svwhilelt_b32(i, n); + ay1 = svld1_f32(pg, y + i); ay1 = svmul_f32_m(pg, ay1, vx); - svst1_f32(pg, y + np, ay1); + svst1_f32(pg, y + i, ay1); } #elif defined(__riscv_v_intrinsic) for (int i = 0, avl; i < n; i += avl) {