CUDA: Optimize PAD_REFLECT_1D (#15957)

* CUDA: Optimize PAD_REFLECT_1D feat: add more test cases for PAD_REFLECT_1D * use fast_div to improve performance * Apply suggestion from JohannesGaessler Co-authored-by: Johannes Gäßler <johannesg@5d6.de> * Apply suggestion from JohannesGaessler Co-authored-by: Johannes Gäßler <johannesg@5d6.de> * optimize * use a concise expression to further speedup the cuda kernel --------- Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
2025-11-17 11:37:10 +00:00 · 2025-09-18 11:26:03 -07:00
parent 368560a1e3
commit 38dbdf4c05
3 changed files with 76 additions and 54 deletions
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -652,6 +652,14 @@ static __device__ __forceinline__ uint32_t fastmodulo(uint32_t n, const uint3 fa
    return n - fastdiv(n, fastdiv_values) * fastdiv_values.z;
 }

+// Calculate both division and modulo at once, returns <n/divisor, n%divisor>
+static __device__ __forceinline__ uint2 fast_div_modulo(uint32_t n, const uint3 fastdiv_values) {
+    // expects  fastdiv_values to contain <mp, L, divisor> in <x, y, z> (see init_fastdiv_values)
+    const uint32_t div_val = fastdiv(n, fastdiv_values);
+    const uint32_t mod_val = n - div_val * fastdiv_values.z;
+    return make_uint2(div_val, mod_val);
+}
+
 typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, float2 & v);

 static __device__ __forceinline__ float get_alibi_slope(