Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-11-11 10:36:54 +00:00)
cuda: make im2col a little faster
This commit is contained in:
@@ -133,7 +133,7 @@ static __global__ void im2col_3d_kernel(
|
|||||||
const int64_t ikw = i % KW;
|
const int64_t ikw = i % KW;
|
||||||
|
|
||||||
const int64_t ioh = blockIdx.y;
|
const int64_t ioh = blockIdx.y;
|
||||||
- for (int64_t iz = blockIdx.z; iz < N_OD_IC; iz+=MAX_GRID_DIM_Z) {
+ for (int64_t iz = blockIdx.z; iz < N_OD_IC; iz+=MAX_GRIDDIM_Z) {
|
||||||
const int64_t in = iz / OD_IC;
|
const int64_t in = iz / OD_IC;
|
||||||
const int64_t iod = (iz - in*OD_IC) / IC;
|
const int64_t iod = (iz - in*OD_IC) / IC;
|
||||||
const int64_t iic = iz % IC;
|
const int64_t iic = iz % IC;
|
||||||
@@ -169,7 +169,7 @@ static void im2col_3d_cuda(const float * src, T* dst,
|
|||||||
const int64_t N_OD_IC = N*OD*IC;
|
const int64_t N_OD_IC = N*OD*IC;
|
||||||
const int64_t OD_IC = OD*IC;
|
const int64_t OD_IC = OD*IC;
|
||||||
const int64_t num_blocks = (OW_KD_KH_KW + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
|
const int64_t num_blocks = (OW_KD_KH_KW + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
|
||||||
- dim3 block_nums(num_blocks, OH, MIN(N_OD_IC, MAX_GRID_DIM_Z));
+ dim3 block_nums(num_blocks, OH, MIN(N_OD_IC, MAX_GRIDDIM_Z));
|
||||||
im2col_3d_kernel<<<block_nums, MIN(OW_KD_KH_KW, CUDA_IM2COL_BLOCK_SIZE) , 0, stream>>>(src, dst, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW,
|
im2col_3d_kernel<<<block_nums, MIN(OW_KD_KH_KW, CUDA_IM2COL_BLOCK_SIZE) , 0, stream>>>(src, dst, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW,
|
||||||
OH_OW, KD_KH_KW, ID_IH_IW, KH_KW, IH_IW, IC_KD_KH_KW,
|
OH_OW, KD_KH_KW, ID_IH_IW, KH_KW, IH_IW, IC_KD_KH_KW,
|
||||||
OW_KD_KH_KW, N_OD_IC, OD_IC, s0, s1, s2, p0, p1, p2, d0, d1, d2);
|
OW_KD_KH_KW, N_OD_IC, OD_IC, s0, s1, s2, p0, p1, p2, d0, d1, d2);
|
||||||
|
|||||||
Reference in New Issue
Block a user