mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	vulkan: Replace uses of maxMemoryAllocationSize and VK_WHOLE_SIZE (#16354)
* vulkan: Replace uses of maxMemoryAllocationSize and VK_WHOLE_SIZE

Replace maxMemoryAllocationSize check with maxBufferSize when creating buffers. The maxMemoryAllocationSize limit is a "soft" limit and allocations can succeed beyond that limit. This allows > 4GB buffers to be allocated on some implementations (e.g. NVIDIA) and tensors this large can be used for im2col and mul_mat.

For temporary buffers (prealloc_x/y/etc) check against maxStorageBufferRange. I'm not sure this check is ideal, but we always use these buffers as a single full size binding and the limit may be smaller than maxMemoryAllocationSize or maxBufferSize, so I think this is reasonable.

Replace descriptor range uses of VK_WHOLE_SIZE with a manually computed range. The maxStorageBufferRange may be smaller than the maxBufferSize or maxMemoryAllocationSize (and the Vulkan spec warns about this in a note) and it's invalid usage if VK_WHOLE_SIZE computes a range larger than maxStorageBufferRange.

With this change, it should be possible to generate videos using wan networks in stable-diffusion.cpp.

* vulkan: Add env var GGML_VK_FORCE_MAX_BUFFER_SIZE and use stoull
This commit is contained in:
		| @@ -393,6 +393,7 @@ struct vk_device_struct { | ||||
|     vk::PhysicalDeviceProperties properties; | ||||
|     std::string name; | ||||
|     uint64_t max_memory_allocation_size; | ||||
|     uint64_t max_buffer_size; | ||||
|     uint64_t suballocation_block_size; | ||||
|     bool fp16; | ||||
|     bool bf16; | ||||
| @@ -1563,6 +1564,12 @@ typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx | ||||
|  | ||||
| static void ggml_backend_vk_free(ggml_backend_t backend); | ||||
|  | ||||
| static VkDeviceSize ggml_vk_get_max_buffer_range(const ggml_backend_vk_context * ctx, const vk_buffer &buf, const VkDeviceSize offset) { | ||||
|     const VkDeviceSize range = std::min(VkDeviceSize{buf->size - offset}, | ||||
|                                         VkDeviceSize{ctx->device->properties.limits.maxStorageBufferRange}); | ||||
|     return range; | ||||
| } | ||||
|  | ||||
| // Wait for ctx->fence to be signaled. | ||||
| static void ggml_vk_wait_for_fence(ggml_backend_vk_context * ctx) { | ||||
|     // Use waitForFences while most of the graph executes. Hopefully the CPU can sleep | ||||
| @@ -2012,8 +2019,8 @@ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_pr | ||||
|  | ||||
| static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std::initializer_list<vk::MemoryPropertyFlags> & req_flags_list) { | ||||
|     VK_LOG_DEBUG("ggml_vk_create_buffer(" << device->name << ", " << size << ", " << to_string(req_flags_list.begin()[0]) << ", " << to_string(req_flags_list.begin()[req_flags_list.size()-1]) << ")"); | ||||
|     if (size > device->max_memory_allocation_size) { | ||||
|         throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device memory allocation limit"); | ||||
|     if (size > device->max_buffer_size) { | ||||
|         throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device buffer size limit"); | ||||
|     } | ||||
|  | ||||
|     vk_buffer buf = std::make_shared<vk_buffer_struct>(); | ||||
| @@ -2159,8 +2166,8 @@ static void ggml_vk_destroy_buffer(vk_buffer& buf) { | ||||
|     buf.reset(); | ||||
| } | ||||
|  | ||||
| static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) { | ||||
|     return { buf, 0, VK_WHOLE_SIZE }; | ||||
| static vk_subbuffer ggml_vk_subbuffer(const ggml_backend_vk_context* ctx, const vk_buffer& buf, size_t offset = 0) { | ||||
|     return { buf, offset, ggml_vk_get_max_buffer_range(ctx, buf, offset) }; | ||||
| } | ||||
|  | ||||
| static void ggml_vk_sync_buffers(ggml_backend_vk_context* ctx, vk_context& subctx) { | ||||
| @@ -3853,17 +3860,27 @@ static vk_device ggml_vk_get_device(size_t idx) { | ||||
|         const char* GGML_VK_FORCE_MAX_ALLOCATION_SIZE = getenv("GGML_VK_FORCE_MAX_ALLOCATION_SIZE"); | ||||
|  | ||||
|         if (GGML_VK_FORCE_MAX_ALLOCATION_SIZE != nullptr) { | ||||
|             device->max_memory_allocation_size = std::stoul(GGML_VK_FORCE_MAX_ALLOCATION_SIZE); | ||||
|             device->max_memory_allocation_size = std::stoull(GGML_VK_FORCE_MAX_ALLOCATION_SIZE); | ||||
|         } else if (maintenance4_support) { | ||||
|             device->max_memory_allocation_size = std::min(props3.maxMemoryAllocationSize, props4.maxBufferSize); | ||||
|         } else { | ||||
|             device->max_memory_allocation_size = props3.maxMemoryAllocationSize; | ||||
|         } | ||||
|  | ||||
|         const char* GGML_VK_FORCE_MAX_BUFFER_SIZE = getenv("GGML_VK_FORCE_MAX_BUFFER_SIZE"); | ||||
|  | ||||
|         if (GGML_VK_FORCE_MAX_BUFFER_SIZE != nullptr) { | ||||
|             device->max_buffer_size = std::stoull(GGML_VK_FORCE_MAX_BUFFER_SIZE); | ||||
|         } else if (maintenance4_support) { | ||||
|             device->max_buffer_size = props4.maxBufferSize; | ||||
|         } else { | ||||
|             device->max_buffer_size = device->max_memory_allocation_size; | ||||
|         } | ||||
|  | ||||
|         const char* GGML_VK_SUBALLOCATION_BLOCK_SIZE = getenv("GGML_VK_SUBALLOCATION_BLOCK_SIZE"); | ||||
|  | ||||
|         if (GGML_VK_SUBALLOCATION_BLOCK_SIZE != nullptr) { | ||||
|             device->suballocation_block_size = std::stoul(GGML_VK_SUBALLOCATION_BLOCK_SIZE); | ||||
|             device->suballocation_block_size = std::stoull(GGML_VK_SUBALLOCATION_BLOCK_SIZE); | ||||
|         } else { | ||||
|             // Limit batching of allocations to 1GB by default to avoid fragmentation issues | ||||
|             device->suballocation_block_size = 1024*1024*1024; | ||||
| @@ -6148,9 +6165,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub | ||||
|         } | ||||
|         const uint64_t split_k_size = split_k > 1 ? d_sz * ne12 * ne13 * split_k : 0; | ||||
|         if ( | ||||
|                 (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) || | ||||
|                 (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size) || | ||||
|                 (split_k > 1 && split_k_size > ctx->device->max_memory_allocation_size)) { | ||||
|                 (qx_needs_dequant && x_sz_upd > ctx->device->properties.limits.maxStorageBufferRange) || | ||||
|                 (qy_needs_dequant && y_sz_upd > ctx->device->properties.limits.maxStorageBufferRange) || | ||||
|                 (split_k > 1 && split_k_size > ctx->device->properties.limits.maxStorageBufferRange)) { | ||||
|             GGML_ABORT("Requested preallocation size is too large"); | ||||
|         } | ||||
|         if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) { | ||||
| @@ -6225,7 +6242,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub | ||||
|     } | ||||
|  | ||||
|     if (x_non_contig) { | ||||
|         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }); | ||||
|         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, ggml_vk_subbuffer(ctx, d_Qx, qx_buf_offset), ggml_vk_subbuffer(ctx, d_X, 0)); | ||||
|     } else if (qx_needs_dequant) { | ||||
|         const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; | ||||
|         ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); | ||||
| @@ -6237,7 +6254,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub | ||||
|             if (ctx->prealloc_y_need_sync) { | ||||
|                 ggml_vk_sync_buffers(ctx, subctx); | ||||
|             } | ||||
|             ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); | ||||
|             ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0)); | ||||
|             ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get(); | ||||
|             ctx->prealloc_y_last_tensor_used = src1; | ||||
|         } | ||||
| @@ -6248,7 +6265,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub | ||||
|             if (ctx->prealloc_y_need_sync) { | ||||
|                 ggml_vk_sync_buffers(ctx, subctx); | ||||
|             } | ||||
|             ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13, true); | ||||
|             ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0), y_ne * ne12 * ne13, true); | ||||
|             ctx->prealloc_y_last_pipeline_used = to_q8_1.get(); | ||||
|             ctx->prealloc_y_last_tensor_used = src1; | ||||
|         } | ||||
| @@ -6270,14 +6287,11 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub | ||||
|         y_sz_total = CEIL_DIV(y_sz_total, 144) * 144; | ||||
|     } | ||||
|  | ||||
|     // No bounds checking is needed for dst. This is basically VK_WHOLE_SIZE but clamped to maxStorageBufferRange. | ||||
|     VkDeviceSize d_range = std::min(VkDeviceSize{d_D->size - d_buf_offset}, VkDeviceSize{ctx->device->properties.limits.maxStorageBufferRange}); | ||||
|  | ||||
|     // compute | ||||
|     ggml_vk_matmul( | ||||
|         ctx, subctx, pipeline, | ||||
|         { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz_total }, | ||||
|         { d_D, d_buf_offset, d_range }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, | ||||
|         ggml_vk_subbuffer(ctx, d_D, d_buf_offset), { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, | ||||
|         ne01, ne11, ne10, | ||||
|         ne10, ne10, stride_d, stride_batch_x, stride_batch_y, stride_batch_d, | ||||
|         split_k, ne12*ne13, ne02, ne12, r2, r3, padded_n | ||||
| @@ -6444,8 +6458,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& | ||||
|             y_sz_upd = CEIL_DIV(y_sz_upd, 144) * 144; | ||||
|         } | ||||
|         if ( | ||||
|                 (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) || | ||||
|                 (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size)) { | ||||
|                 (qx_needs_dequant && x_sz_upd > ctx->device->properties.limits.maxStorageBufferRange) || | ||||
|                 (qy_needs_dequant && y_sz_upd > ctx->device->properties.limits.maxStorageBufferRange)) { | ||||
|             GGML_ABORT("Requested preallocation size is too large"); | ||||
|         } | ||||
|         if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) { | ||||
| @@ -6510,7 +6524,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& | ||||
|         } | ||||
|  | ||||
|         GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment)); | ||||
|         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }); | ||||
|         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, ggml_vk_subbuffer(ctx, d_Qx, qx_buf_offset), ggml_vk_subbuffer(ctx, d_X, 0)); | ||||
|     } | ||||
|     if (y_non_contig) { | ||||
|         GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne); | ||||
| @@ -6519,7 +6533,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& | ||||
|             if (ctx->prealloc_y_need_sync) { | ||||
|                 ggml_vk_sync_buffers(ctx, subctx); | ||||
|             } | ||||
|             ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); | ||||
|             ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0)); | ||||
|             ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get(); | ||||
|             ctx->prealloc_y_last_tensor_used = src1; | ||||
|         } | ||||
| @@ -6530,7 +6544,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& | ||||
|             if (ctx->prealloc_y_need_sync) { | ||||
|                 ggml_vk_sync_buffers(ctx, subctx); | ||||
|             } | ||||
|             ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13, true); | ||||
|             ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0), y_ne * ne12 * ne13, true); | ||||
|             ctx->prealloc_y_last_pipeline_used = to_q8_1.get(); | ||||
|             ctx->prealloc_y_last_tensor_used = src1; | ||||
|         } | ||||
| @@ -6929,8 +6943,8 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& | ||||
|         const uint64_t x_sz_upd = x_sz * ne02 * ne03; | ||||
|         const uint64_t y_sz_upd = y_sz * ne12 * ne13; | ||||
|         if ( | ||||
|                 (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) || | ||||
|                 (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size)) { | ||||
|                 (qx_needs_dequant && x_sz_upd > ctx->device->properties.limits.maxStorageBufferRange) || | ||||
|                 (qy_needs_dequant && y_sz_upd > ctx->device->properties.limits.maxStorageBufferRange)) { | ||||
|             GGML_ABORT("Requested preallocation size is too large"); | ||||
|         } | ||||
|         if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) { | ||||
| @@ -6997,7 +7011,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& | ||||
|     } | ||||
|  | ||||
|     if (x_non_contig) { | ||||
|         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }); | ||||
|         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, ggml_vk_subbuffer(ctx, d_Qx, qx_buf_offset), ggml_vk_subbuffer(ctx, d_X, 0)); | ||||
|     } else if (qx_needs_dequant) { | ||||
|         const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; | ||||
|         ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, | ||||
| @@ -7010,7 +7024,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& | ||||
|             if (ctx->prealloc_y_need_sync) { | ||||
|                 ggml_vk_sync_buffers(ctx, subctx); | ||||
|             } | ||||
|             ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); | ||||
|             ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0)); | ||||
|             ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get(); | ||||
|             ctx->prealloc_y_last_tensor_used = src1; | ||||
|         } | ||||
| @@ -7143,8 +7157,8 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte | ||||
|         const uint64_t x_sz_upd = x_sz * ne02 * ne03; | ||||
|         const uint64_t y_sz_upd = y_sz * ne12 * ne13; | ||||
|         if ( | ||||
|                 (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) || | ||||
|                 (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size)) { | ||||
|                 (qx_needs_dequant && x_sz_upd > ctx->device->properties.limits.maxStorageBufferRange) || | ||||
|                 (qy_needs_dequant && y_sz_upd > ctx->device->properties.limits.maxStorageBufferRange)) { | ||||
|             GGML_ABORT("Requested preallocation size is too large"); | ||||
|         } | ||||
|         if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) { | ||||
| @@ -7210,7 +7224,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte | ||||
|  | ||||
|     if (x_non_contig) { | ||||
|         GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment)); | ||||
|         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }); | ||||
|         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, ggml_vk_subbuffer(ctx, d_Qx, qx_buf_offset), ggml_vk_subbuffer(ctx, d_X, 0)); | ||||
|     } | ||||
|     if (y_non_contig) { | ||||
|         GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne); | ||||
| @@ -7219,7 +7233,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte | ||||
|             if (ctx->prealloc_y_need_sync) { | ||||
|                 ggml_vk_sync_buffers(ctx, subctx); | ||||
|             } | ||||
|             ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); | ||||
|             ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0)); | ||||
|             ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get(); | ||||
|             ctx->prealloc_y_last_tensor_used = src1; | ||||
|         } | ||||
| @@ -7494,7 +7508,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx | ||||
|     // Reserve space for split_k temporaries. For each split x batch, we need to store the O matrix (D x ne1) | ||||
|     // and the per-row m and L values (ne1 rows). We store all the matrices first, followed by the rows. | ||||
|     const uint64_t split_k_size = split_k > 1 ? (HSV * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k * ne3 : 0; | ||||
|     if (split_k_size > ctx->device->max_memory_allocation_size) { | ||||
|     if (split_k_size > ctx->device->properties.limits.maxStorageBufferRange) { | ||||
|         GGML_ABORT("Requested preallocation size is too large"); | ||||
|     } | ||||
|     if (ctx->prealloc_size_split_k < split_k_size) { | ||||
| @@ -7616,12 +7630,12 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx | ||||
|  | ||||
|         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, | ||||
|                                     { | ||||
|                                         vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE}, | ||||
|                                         vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE}, | ||||
|                                         vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE}, | ||||
|                                         vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE}, | ||||
|                                         vk_subbuffer{d_S, s_buf_offset, VK_WHOLE_SIZE}, | ||||
|                                         vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE}, | ||||
|                                         ggml_vk_subbuffer(ctx, d_Q, q_buf_offset), | ||||
|                                         ggml_vk_subbuffer(ctx, d_K, k_buf_offset), | ||||
|                                         ggml_vk_subbuffer(ctx, d_V, v_buf_offset), | ||||
|                                         ggml_vk_subbuffer(ctx, d_M, m_buf_offset), | ||||
|                                         ggml_vk_subbuffer(ctx, d_S, s_buf_offset), | ||||
|                                         ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0), | ||||
|                                     }, | ||||
|                                     // We only use split_k when group query attention is enabled, which means | ||||
|                                     // there's no more than one tile of rows (i.e. workgroups_x would have been | ||||
| @@ -7633,21 +7647,21 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx | ||||
|         const std::array<uint32_t, 5> pc2 = { HSV, (uint32_t)ne1, (uint32_t)ne3, split_k, (sinks != nullptr) }; | ||||
|         ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_flash_attn_split_k_reduce, | ||||
|                                     { | ||||
|                                         vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE}, | ||||
|                                         vk_subbuffer{d_S, s_buf_offset, VK_WHOLE_SIZE}, | ||||
|                                         vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE}, | ||||
|                                         ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0), | ||||
|                                         ggml_vk_subbuffer(ctx, d_S, s_buf_offset), | ||||
|                                         ggml_vk_subbuffer(ctx, d_D, d_buf_offset), | ||||
|                                     }, | ||||
|                                     pc2, { (uint32_t)ne1, HSV, (uint32_t)ne3 }); | ||||
|         ctx->prealloc_split_k_need_sync = true; | ||||
|     } else { | ||||
|         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, | ||||
|                                     { | ||||
|                                         vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE}, | ||||
|                                         vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE}, | ||||
|                                         vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE}, | ||||
|                                         vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE}, | ||||
|                                         vk_subbuffer{d_S, s_buf_offset, VK_WHOLE_SIZE}, | ||||
|                                         vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE}, | ||||
|                                         ggml_vk_subbuffer(ctx, d_Q, q_buf_offset), | ||||
|                                         ggml_vk_subbuffer(ctx, d_K, k_buf_offset), | ||||
|                                         ggml_vk_subbuffer(ctx, d_V, v_buf_offset), | ||||
|                                         ggml_vk_subbuffer(ctx, d_M, m_buf_offset), | ||||
|                                         ggml_vk_subbuffer(ctx, d_S, s_buf_offset), | ||||
|                                         ggml_vk_subbuffer(ctx, d_D, d_buf_offset), | ||||
|                                     }, | ||||
|                                     pc, { workgroups_x, workgroups_y, workgroups_z }); | ||||
|     } | ||||
| @@ -8356,18 +8370,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     uint64_t x_sz = ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0; | ||||
|     uint64_t y_sz = use_src1 ? ggml_type_size(src1->type) * ne1 : 0; | ||||
|     uint64_t z_sz = use_src2 ? ggml_type_size(src2->type) * ne2 : 0; | ||||
|     uint64_t d_sz = ggml_type_size(dst->type) * ned; | ||||
|  | ||||
|     vk_buffer d_D = dst_buf_ctx->dev_buffer; | ||||
|  | ||||
|     // Workaround for tiny tensor inputs on ROPE | ||||
|     if (op == GGML_OP_ROPE && use_src1 && y_sz > d_D->size) { | ||||
|         y_sz = VK_WHOLE_SIZE; | ||||
|     } | ||||
|  | ||||
|     GGML_ASSERT(d_D != nullptr); | ||||
|     uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; | ||||
|     if(!src0_uma) { | ||||
| @@ -8392,26 +8396,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co | ||||
|     z_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1); | ||||
|     d_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1); | ||||
|  | ||||
|     if (op_supports_incontiguous) { | ||||
|         x_sz = ggml_nbytes(src0) + get_misalign_bytes(ctx, src0); | ||||
|         y_sz = use_src1 ? ggml_nbytes(src1) + get_misalign_bytes(ctx, src1) : 0; | ||||
|         z_sz = use_src2 ? ggml_nbytes(src2) + get_misalign_bytes(ctx, src2) : 0; | ||||
|         d_sz = ggml_nbytes(dst) + get_misalign_bytes(ctx, dst); | ||||
|  | ||||
|         if (x_buf_offset + x_sz >= d_X->size) { | ||||
|             x_sz = VK_WHOLE_SIZE; | ||||
|         } | ||||
|         if (use_src1 && y_buf_offset + y_sz >= d_Y->size) { | ||||
|             y_sz = VK_WHOLE_SIZE; | ||||
|         } | ||||
|         if (use_src2 && z_buf_offset + z_sz >= d_Z->size) { | ||||
|             z_sz = VK_WHOLE_SIZE; | ||||
|         } | ||||
|         if (d_buf_offset + d_sz >= d_D->size) { | ||||
|             d_sz = VK_WHOLE_SIZE; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     std::array<uint32_t, 3> elements; | ||||
|  | ||||
|     // Single call if dimension 2 is contiguous | ||||
| @@ -8602,19 +8586,31 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co | ||||
|         break; | ||||
|     } | ||||
|  | ||||
|     if (!op_supports_incontiguous) { | ||||
|         if (x_sz != VK_WHOLE_SIZE) { | ||||
|             x_sz *= ne02 * ne03; | ||||
|     uint64_t x_sz, y_sz, z_sz, d_sz; | ||||
|  | ||||
|     if (op_supports_incontiguous) { | ||||
|         x_sz = ggml_nbytes(src0) + get_misalign_bytes(ctx, src0); | ||||
|         y_sz = use_src1 ? ggml_nbytes(src1) + get_misalign_bytes(ctx, src1) : 0; | ||||
|         z_sz = use_src2 ? ggml_nbytes(src2) + get_misalign_bytes(ctx, src2) : 0; | ||||
|         d_sz = ggml_nbytes(dst) + get_misalign_bytes(ctx, dst); | ||||
|  | ||||
|         if (x_buf_offset + x_sz >= d_X->size) { | ||||
|             x_sz = ggml_vk_get_max_buffer_range(ctx, d_X, x_buf_offset); | ||||
|         } | ||||
|         if (use_src1 && y_sz != VK_WHOLE_SIZE) { | ||||
|             y_sz *= ne12 * ne13; | ||||
|         if (use_src1 && y_buf_offset + y_sz >= d_Y->size) { | ||||
|             y_sz = ggml_vk_get_max_buffer_range(ctx, d_Y, y_buf_offset); | ||||
|         } | ||||
|         if (use_src2 && z_sz != VK_WHOLE_SIZE) { | ||||
|             z_sz *= ne22 * ne23; | ||||
|         if (use_src2 && z_buf_offset + z_sz >= d_Z->size) { | ||||
|             z_sz = ggml_vk_get_max_buffer_range(ctx, d_Z, z_buf_offset); | ||||
|         } | ||||
|         if (d_sz != VK_WHOLE_SIZE) { | ||||
|             d_sz *= ned2 * ned3; | ||||
|         if (d_buf_offset + d_sz >= d_D->size) { | ||||
|             d_sz = ggml_vk_get_max_buffer_range(ctx, d_D, d_buf_offset); | ||||
|         } | ||||
|     } else { | ||||
|         x_sz = ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0 * ne02 * ne03; | ||||
|         y_sz = use_src1 ? ggml_type_size(src1->type) * ne1 * ne12 * ne13 : 0; | ||||
|         z_sz = use_src2 ? ggml_type_size(src2->type) * ne2 * ne22 * ne23 : 0; | ||||
|         d_sz = ggml_type_size(dst->type) * ned * ned2 * ned3; | ||||
|     } | ||||
|  | ||||
|     if (op == GGML_OP_ADD || op == GGML_OP_RMS_NORM) { | ||||
| @@ -8624,7 +8620,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co | ||||
|             { vk_subbuffer{ d_X, x_buf_offset, x_sz }, | ||||
|               vk_subbuffer{ d_Y, y_buf_offset, y_sz }, | ||||
|               vk_subbuffer{ d_D, d_buf_offset, d_sz }, | ||||
|               vk_subbuffer{ d_A, a_buf_offset, VK_WHOLE_SIZE }, | ||||
|               ggml_vk_subbuffer(ctx, d_A, a_buf_offset), | ||||
|             }, pc, elements); | ||||
|     } else if (op == GGML_OP_GLU) { | ||||
|         // Empty src1 is possible in glu, but the shader needs a buffer | ||||
| @@ -8817,18 +8813,18 @@ static void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx, | ||||
|     static_assert(MAX_PARAMETER_COUNT == 12); | ||||
|     ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, | ||||
|         { | ||||
|             vk_subbuffer{ buf[0], offset[0], VK_WHOLE_SIZE }, | ||||
|             vk_subbuffer{ buf[1], offset[1], VK_WHOLE_SIZE }, | ||||
|             vk_subbuffer{ buf[2], offset[2], VK_WHOLE_SIZE }, | ||||
|             vk_subbuffer{ buf[3], offset[3], VK_WHOLE_SIZE }, | ||||
|             vk_subbuffer{ buf[4], offset[4], VK_WHOLE_SIZE }, | ||||
|             vk_subbuffer{ buf[5], offset[5], VK_WHOLE_SIZE }, | ||||
|             vk_subbuffer{ buf[6], offset[6], VK_WHOLE_SIZE }, | ||||
|             vk_subbuffer{ buf[7], offset[7], VK_WHOLE_SIZE }, | ||||
|             vk_subbuffer{ buf[8], offset[8], VK_WHOLE_SIZE }, | ||||
|             vk_subbuffer{ buf[9], offset[9], VK_WHOLE_SIZE }, | ||||
|             vk_subbuffer{ buf[10], offset[10], VK_WHOLE_SIZE }, | ||||
|             vk_subbuffer{ buf[11], offset[11], VK_WHOLE_SIZE }, | ||||
|             ggml_vk_subbuffer(ctx, buf[0], offset[0]), | ||||
|             ggml_vk_subbuffer(ctx, buf[1], offset[1]), | ||||
|             ggml_vk_subbuffer(ctx, buf[2], offset[2]), | ||||
|             ggml_vk_subbuffer(ctx, buf[3], offset[3]), | ||||
|             ggml_vk_subbuffer(ctx, buf[4], offset[4]), | ||||
|             ggml_vk_subbuffer(ctx, buf[5], offset[5]), | ||||
|             ggml_vk_subbuffer(ctx, buf[6], offset[6]), | ||||
|             ggml_vk_subbuffer(ctx, buf[7], offset[7]), | ||||
|             ggml_vk_subbuffer(ctx, buf[8], offset[8]), | ||||
|             ggml_vk_subbuffer(ctx, buf[9], offset[9]), | ||||
|             ggml_vk_subbuffer(ctx, buf[10], offset[10]), | ||||
|             ggml_vk_subbuffer(ctx, buf[11], offset[11]), | ||||
|         }, pc, elements); | ||||
| } | ||||
|  | ||||
| @@ -10002,7 +9998,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t | ||||
|     ggml_vk_ctx_begin(ctx->device, subctx); | ||||
|     for (size_t i = 0; i < num_it; i++) { | ||||
|         ggml_vk_matmul( | ||||
|             ctx, subctx, p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k), | ||||
|             ctx, subctx, p, ggml_vk_subbuffer(ctx, d_X), ggml_vk_subbuffer(ctx, d_Y), ggml_vk_subbuffer(ctx, d_D), ggml_vk_subbuffer(ctx, ctx->prealloc_split_k), | ||||
|             m, n, k, | ||||
|             k, k, m, k*m, k*n, m*n, | ||||
|             split_k, batch, batch, batch, 1, 1, n | ||||
| @@ -10313,7 +10309,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_ | ||||
| // | ||||
| //     vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); | ||||
| //     ggml_vk_ctx_begin(ctx->device, subctx); | ||||
| //     ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(x_buf), ggml_vk_subbuffer(qx_buf), ne); | ||||
| //     ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(ctx, x_buf), ggml_vk_subbuffer(ctx, qx_buf), ne); | ||||
| //     ggml_vk_ctx_end(subctx); | ||||
| // | ||||
| //     auto begin = std::chrono::high_resolution_clock::now(); | ||||
|   | ||||
		Reference in New Issue
	
	Block a user