mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-11-15 11:17:31 +00:00
CUDA: fix crash with partial offloading of MoE (#13439)
This commit is contained in:
@@ -515,11 +515,11 @@ void ggml_cuda_mul_mat_vec_q(
|
||||
|
||||
// If src0 is a temporary compute buffer, clear any potential padding.
|
||||
if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
|
||||
GGML_ASSERT(!src0->view_src);
|
||||
const size_t size_data = ggml_nbytes(src0);
|
||||
const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
|
||||
if (size_alloc > size_data) {
|
||||
GGML_ASSERT(ggml_is_contiguously_allocated(src0));
|
||||
GGML_ASSERT(!src0->view_src);
|
||||
CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user