CUDA: batched+noncont MMQ, refactor bs>1 MoE code (#13199)

This commit is contained in:
Johannes Gäßler
2025-04-30 23:12:59 +02:00
committed by GitHub
parent 6f67cf1f48
commit e1e8e0991f
9 changed files with 869 additions and 440 deletions

View File

@@ -158,7 +158,7 @@ static __global__ void mul_mat_vec_q(
const int blocks_per_row_x = ncols_x / qk;
constexpr int blocks_per_iter = vdr * nwarps*warp_size / qi;
// The MUL_MAT_ID code path with ids != nullptr is only implemetned for ncols_dst == 1.
// The MUL_MAT_ID code path with ids != nullptr is only implemented for ncols_dst == 1.
const int channel_dst = blockIdx.y;
const int channel_x = ncols_dst == 1 && ids ? ids[channel_dst] : channel_dst / channel_ratio;
const int channel_y = ncols_dst == 1 && ids ? channel_dst % nchannels_y : channel_dst;
@@ -507,7 +507,7 @@ void ggml_cuda_mul_mat_vec_q(
GGML_ASSERT( nb0 == ts_dst);
GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type));
GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for batch size 1.
GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for batch size 1.
const float * src1_d = (const float *) src1->data;
const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr;
@@ -519,7 +519,7 @@ void ggml_cuda_mul_mat_vec_q(
const int64_t s11 = src1->nb[1] / ts_src1;
const int64_t s12 = src1->nb[2] / ts_src1;
const int64_t s13 = src1->nb[3] / ts_src1;
quantize_row_q8_1_cuda(src1_d, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream);
quantize_row_q8_1_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream);
}
const int64_t s01 = src0->nb[1] / ts_src0;