CUDA: fix logic for clearing padding with -ngl 0 (#13320)

This commit is contained in:
Johannes Gäßler
2025-05-05 22:32:13 +02:00
committed by GitHub
parent 233461f812
commit 9070365020
6 changed files with 33 additions and 6 deletions

View File

@@ -163,6 +163,7 @@ void quantize_mmq_q8_1_cuda(
const float * x, const int32_t * ids, void * vy, const ggml_type type_src0,
const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
GGML_ASSERT(ne00 % 4 == 0);
GGML_ASSERT(ne0 % (4*QK8_1) == 0);
const int64_t block_num_x = (ne0 + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ);