CUDA: Optimize PAD_REFLECT_1D (#15957)

* CUDA: Optimize PAD_REFLECT_1D
feat: add more test cases for PAD_REFLECT_1D

* use fast_div to improve performance

* Apply suggestion from JohannesGaessler

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Apply suggestion from JohannesGaessler

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* optimize

* use a concise expression to further speedup the cuda kernel

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
This commit is contained in:
Bowen Han
2025-09-18 11:26:03 -07:00
committed by GitHub
parent 368560a1e3
commit 38dbdf4c05
3 changed files with 76 additions and 54 deletions

View File

@@ -652,6 +652,14 @@ static __device__ __forceinline__ uint32_t fastmodulo(uint32_t n, const uint3 fa
return n - fastdiv(n, fastdiv_values) * fastdiv_values.z;
}
// Calculate both division and modulo at once, returns <n/divisor, n%divisor>
static __device__ __forceinline__ uint2 fast_div_modulo(uint32_t n, const uint3 fastdiv_values) {
// expects fastdiv_values to contain <mp, L, divisor> in <x, y, z> (see init_fastdiv_values)
const uint32_t div_val = fastdiv(n, fastdiv_values);
const uint32_t mod_val = n - div_val * fastdiv_values.z;
return make_uint2(div_val, mod_val);
}
typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, float2 & v);
static __device__ __forceinline__ float get_alibi_slope(