Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)
	vulkan: Replace uses of maxMemoryAllocationSize and VK_WHOLE_SIZE (#16354)
* vulkan: Replace uses of maxMemoryAllocationSize and VK_WHOLE_SIZE

Replace the maxMemoryAllocationSize check with maxBufferSize when creating buffers. The maxMemoryAllocationSize limit is a "soft" limit and allocations can succeed beyond it. This allows > 4GB buffers to be allocated on some implementations (e.g. NVIDIA), and tensors this large can be used for im2col and mul_mat.

For temporary buffers (prealloc_x/y/etc) check against maxStorageBufferRange. I'm not sure this check is ideal, but we always use these buffers as a single full-size binding and the limit may be smaller than maxMemoryAllocationSize or maxBufferSize, so I think this is reasonable.

Replace descriptor range uses of VK_WHOLE_SIZE with a manually computed range. The maxStorageBufferRange may be smaller than maxBufferSize or maxMemoryAllocationSize (and the Vulkan spec warns about this in a note), and it is invalid usage if VK_WHOLE_SIZE computes a range larger than maxStorageBufferRange.

With this change, it should be possible to generate videos using wan networks in stable-diffusion.cpp.

* vulkan: Add env var GGML_VK_FORCE_MAX_BUFFER_SIZE and use stoull
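Background for the limits involved (an illustrative sketch, not code from this commit): maxMemoryAllocationSize comes from VkPhysicalDeviceMaintenance3Properties, maxBufferSize from VkPhysicalDeviceMaintenance4Properties, and maxStorageBufferRange from the core device limits. The helper names query_buffer_limits and binding_range below are hypothetical; the query path via vkGetPhysicalDeviceProperties2 assumes a device exposing maintenance4 (or Vulkan 1.3).

// Illustrative sketch only: query the three limits discussed above and clamp a
// storage-buffer binding range instead of passing VK_WHOLE_SIZE.
#include <vulkan/vulkan.h>
#include <algorithm>

struct buffer_limits {
    VkDeviceSize max_memory_allocation_size; // maintenance3, a "soft" allocation limit
    VkDeviceSize max_buffer_size;            // maintenance4, hard limit on buffer creation
    VkDeviceSize max_storage_buffer_range;   // largest range a single descriptor may cover
};

static buffer_limits query_buffer_limits(VkPhysicalDevice phys_dev) {
    VkPhysicalDeviceMaintenance4Properties maint4 = {};
    maint4.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_4_PROPERTIES;

    VkPhysicalDeviceMaintenance3Properties maint3 = {};
    maint3.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES;
    maint3.pNext = &maint4;

    VkPhysicalDeviceProperties2 props2 = {};
    props2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
    props2.pNext = &maint3;

    vkGetPhysicalDeviceProperties2(phys_dev, &props2);

    return { maint3.maxMemoryAllocationSize,
             maint4.maxBufferSize,
             props2.properties.limits.maxStorageBufferRange };
}

// A descriptor binding may not exceed maxStorageBufferRange even if the buffer
// itself is larger, so clamp the range rather than using VK_WHOLE_SIZE.
static VkDeviceSize binding_range(const buffer_limits & lim, VkDeviceSize buf_size, VkDeviceSize offset) {
    return std::min(buf_size - offset, lim.max_storage_buffer_range);
}

The same clamp is what the new ggml_vk_get_max_buffer_range helper in the diff below performs on a vk_buffer binding.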
@@ -393,6 +393,7 @@ struct vk_device_struct {
     vk::PhysicalDeviceProperties properties;
     std::string name;
     uint64_t max_memory_allocation_size;
+    uint64_t max_buffer_size;
     uint64_t suballocation_block_size;
     bool fp16;
     bool bf16;
@@ -1563,6 +1564,12 @@ typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx

 static void ggml_backend_vk_free(ggml_backend_t backend);

+static VkDeviceSize ggml_vk_get_max_buffer_range(const ggml_backend_vk_context * ctx, const vk_buffer &buf, const VkDeviceSize offset) {
+    const VkDeviceSize range = std::min(VkDeviceSize{buf->size - offset},
+                                        VkDeviceSize{ctx->device->properties.limits.maxStorageBufferRange});
+    return range;
+}
+
 // Wait for ctx->fence to be signaled.
 static void ggml_vk_wait_for_fence(ggml_backend_vk_context * ctx) {
     // Use waitForFences while most of the graph executes. Hopefully the CPU can sleep
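Worked example (values chosen for illustration, not taken from the repo): with properties.limits.maxStorageBufferRange = 4294967295 and a 6442450944-byte (6 GiB) buffer bound at offset 0, the helper above returns 4294967295, keeping the descriptor range within the limit even though the buffer itself is larger; for buffers smaller than the limit the clamp is a no-op and the full remaining size is used.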
@@ -2012,8 +2019,8 @@ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_pr

 static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std::initializer_list<vk::MemoryPropertyFlags> & req_flags_list) {
     VK_LOG_DEBUG("ggml_vk_create_buffer(" << device->name << ", " << size << ", " << to_string(req_flags_list.begin()[0]) << ", " << to_string(req_flags_list.begin()[req_flags_list.size()-1]) << ")");
-    if (size > device->max_memory_allocation_size) {
-        throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device memory allocation limit");
+    if (size > device->max_buffer_size) {
+        throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device buffer size limit");
     }

     vk_buffer buf = std::make_shared<vk_buffer_struct>();
@@ -2159,8 +2166,8 @@ static void ggml_vk_destroy_buffer(vk_buffer& buf) {
     buf.reset();
 }

-static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
-    return { buf, 0, VK_WHOLE_SIZE };
+static vk_subbuffer ggml_vk_subbuffer(const ggml_backend_vk_context* ctx, const vk_buffer& buf, size_t offset = 0) {
+    return { buf, offset, ggml_vk_get_max_buffer_range(ctx, buf, offset) };
 }

 static void ggml_vk_sync_buffers(ggml_backend_vk_context* ctx, vk_context& subctx) {
@@ -3853,17 +3860,27 @@ static vk_device ggml_vk_get_device(size_t idx) {
         const char* GGML_VK_FORCE_MAX_ALLOCATION_SIZE = getenv("GGML_VK_FORCE_MAX_ALLOCATION_SIZE");

         if (GGML_VK_FORCE_MAX_ALLOCATION_SIZE != nullptr) {
-            device->max_memory_allocation_size = std::stoul(GGML_VK_FORCE_MAX_ALLOCATION_SIZE);
+            device->max_memory_allocation_size = std::stoull(GGML_VK_FORCE_MAX_ALLOCATION_SIZE);
         } else if (maintenance4_support) {
             device->max_memory_allocation_size = std::min(props3.maxMemoryAllocationSize, props4.maxBufferSize);
         } else {
             device->max_memory_allocation_size = props3.maxMemoryAllocationSize;
         }

+        const char* GGML_VK_FORCE_MAX_BUFFER_SIZE = getenv("GGML_VK_FORCE_MAX_BUFFER_SIZE");
+
+        if (GGML_VK_FORCE_MAX_BUFFER_SIZE != nullptr) {
+            device->max_buffer_size = std::stoull(GGML_VK_FORCE_MAX_BUFFER_SIZE);
+        } else if (maintenance4_support) {
+            device->max_buffer_size = props4.maxBufferSize;
+        } else {
+            device->max_buffer_size = device->max_memory_allocation_size;
+        }
+
         const char* GGML_VK_SUBALLOCATION_BLOCK_SIZE = getenv("GGML_VK_SUBALLOCATION_BLOCK_SIZE");

         if (GGML_VK_SUBALLOCATION_BLOCK_SIZE != nullptr) {
-            device->suballocation_block_size = std::stoul(GGML_VK_SUBALLOCATION_BLOCK_SIZE);
+            device->suballocation_block_size = std::stoull(GGML_VK_SUBALLOCATION_BLOCK_SIZE);
         } else {
             // Limit batching of allocations to 1GB by default to avoid fragmentation issues
             device->suballocation_block_size = 1024*1024*1024;
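Hypothetical usage note: running with GGML_VK_FORCE_MAX_BUFFER_SIZE=2147483648 (an example value, not from the commit) would cap ggml_vk_create_buffer at 2 GiB on that device. The value is parsed with std::stoull, so it must be a plain decimal byte count; the switch from std::stoul to std::stoull here and for the other overrides presumably lets values larger than 32 bits parse correctly on platforms where unsigned long is 32-bit.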
@@ -6148,9 +6165,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
         }
         const uint64_t split_k_size = split_k > 1 ? d_sz * ne12 * ne13 * split_k : 0;
         if (
-                (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
-                (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size) ||
-                (split_k > 1 && split_k_size > ctx->device->max_memory_allocation_size)) {
+                (qx_needs_dequant && x_sz_upd > ctx->device->properties.limits.maxStorageBufferRange) ||
+                (qy_needs_dequant && y_sz_upd > ctx->device->properties.limits.maxStorageBufferRange) ||
+                (split_k > 1 && split_k_size > ctx->device->properties.limits.maxStorageBufferRange)) {
             GGML_ABORT("Requested preallocation size is too large");
         }
         if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
@@ -6225,7 +6242,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     }

     if (x_non_contig) {
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
+        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, ggml_vk_subbuffer(ctx, d_Qx, qx_buf_offset), ggml_vk_subbuffer(ctx, d_X, 0));
     } else if (qx_needs_dequant) {
         const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
         ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
@@ -6237,7 +6254,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
             if (ctx->prealloc_y_need_sync) {
                 ggml_vk_sync_buffers(ctx, subctx);
             }
-            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0));
             ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
             ctx->prealloc_y_last_tensor_used = src1;
         }
@@ -6248,7 +6265,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
             if (ctx->prealloc_y_need_sync) {
                 ggml_vk_sync_buffers(ctx, subctx);
             }
-            ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13, true);
+            ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0), y_ne * ne12 * ne13, true);
             ctx->prealloc_y_last_pipeline_used = to_q8_1.get();
             ctx->prealloc_y_last_tensor_used = src1;
         }
@@ -6270,14 +6287,11 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
         y_sz_total = CEIL_DIV(y_sz_total, 144) * 144;
     }

-    // No bounds checking is needed for dst. This is basically VK_WHOLE_SIZE but clamped to maxStorageBufferRange.
-    VkDeviceSize d_range = std::min(VkDeviceSize{d_D->size - d_buf_offset}, VkDeviceSize{ctx->device->properties.limits.maxStorageBufferRange});
-
     // compute
     ggml_vk_matmul(
         ctx, subctx, pipeline,
         { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz_total },
-        { d_D, d_buf_offset, d_range }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k },
+        ggml_vk_subbuffer(ctx, d_D, d_buf_offset), { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k },
         ne01, ne11, ne10,
         ne10, ne10, stride_d, stride_batch_x, stride_batch_y, stride_batch_d,
         split_k, ne12*ne13, ne02, ne12, r2, r3, padded_n
@@ -6444,8 +6458,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
             y_sz_upd = CEIL_DIV(y_sz_upd, 144) * 144;
         }
         if (
-                (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
-                (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size)) {
+                (qx_needs_dequant && x_sz_upd > ctx->device->properties.limits.maxStorageBufferRange) ||
+                (qy_needs_dequant && y_sz_upd > ctx->device->properties.limits.maxStorageBufferRange)) {
             GGML_ABORT("Requested preallocation size is too large");
         }
         if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
@@ -6510,7 +6524,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
         }

         GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
+        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, ggml_vk_subbuffer(ctx, d_Qx, qx_buf_offset), ggml_vk_subbuffer(ctx, d_X, 0));
     }
     if (y_non_contig) {
         GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
@@ -6519,7 +6533,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
             if (ctx->prealloc_y_need_sync) {
                 ggml_vk_sync_buffers(ctx, subctx);
             }
-            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0));
             ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
             ctx->prealloc_y_last_tensor_used = src1;
         }
@@ -6530,7 +6544,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
             if (ctx->prealloc_y_need_sync) {
                 ggml_vk_sync_buffers(ctx, subctx);
             }
-            ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13, true);
+            ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0), y_ne * ne12 * ne13, true);
             ctx->prealloc_y_last_pipeline_used = to_q8_1.get();
             ctx->prealloc_y_last_tensor_used = src1;
         }
@@ -6929,8 +6943,8 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
         const uint64_t x_sz_upd = x_sz * ne02 * ne03;
         const uint64_t y_sz_upd = y_sz * ne12 * ne13;
         if (
-                (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
-                (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size)) {
+                (qx_needs_dequant && x_sz_upd > ctx->device->properties.limits.maxStorageBufferRange) ||
+                (qy_needs_dequant && y_sz_upd > ctx->device->properties.limits.maxStorageBufferRange)) {
             GGML_ABORT("Requested preallocation size is too large");
         }
         if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
@@ -6997,7 +7011,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
     }

     if (x_non_contig) {
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
+        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, ggml_vk_subbuffer(ctx, d_Qx, qx_buf_offset), ggml_vk_subbuffer(ctx, d_X, 0));
     } else if (qx_needs_dequant) {
         const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
         ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0,
@@ -7010,7 +7024,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
             if (ctx->prealloc_y_need_sync) {
                 ggml_vk_sync_buffers(ctx, subctx);
             }
-            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0));
             ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
             ctx->prealloc_y_last_tensor_used = src1;
         }
@@ -7143,8 +7157,8 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
         const uint64_t x_sz_upd = x_sz * ne02 * ne03;
         const uint64_t y_sz_upd = y_sz * ne12 * ne13;
         if (
-                (qx_needs_dequant && x_sz_upd > ctx->device->max_memory_allocation_size) ||
-                (qy_needs_dequant && y_sz_upd > ctx->device->max_memory_allocation_size)) {
+                (qx_needs_dequant && x_sz_upd > ctx->device->properties.limits.maxStorageBufferRange) ||
+                (qy_needs_dequant && y_sz_upd > ctx->device->properties.limits.maxStorageBufferRange)) {
             GGML_ABORT("Requested preallocation size is too large");
         }
         if (qx_needs_dequant && ctx->prealloc_size_x < x_sz_upd) {
@@ -7210,7 +7224,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte

     if (x_non_contig) {
         GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
+        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, ggml_vk_subbuffer(ctx, d_Qx, qx_buf_offset), ggml_vk_subbuffer(ctx, d_X, 0));
     }
     if (y_non_contig) {
         GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
@@ -7219,7 +7233,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
             if (ctx->prealloc_y_need_sync) {
                 ggml_vk_sync_buffers(ctx, subctx);
             }
-            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, ggml_vk_subbuffer(ctx, d_Qy, qy_buf_offset), ggml_vk_subbuffer(ctx, d_Y, 0));
             ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1.get();
             ctx->prealloc_y_last_tensor_used = src1;
         }
@@ -7494,7 +7508,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
     // Reserve space for split_k temporaries. For each split x batch, we need to store the O matrix (D x ne1)
     // and the per-row m and L values (ne1 rows). We store all the matrices first, followed by the rows.
     const uint64_t split_k_size = split_k > 1 ? (HSV * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k * ne3 : 0;
-    if (split_k_size > ctx->device->max_memory_allocation_size) {
+    if (split_k_size > ctx->device->properties.limits.maxStorageBufferRange) {
         GGML_ABORT("Requested preallocation size is too large");
     }
     if (ctx->prealloc_size_split_k < split_k_size) {
@@ -7616,12 +7630,12 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx

         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
                                     {
-                                        vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE},
-                                        vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE},
-                                        vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE},
-                                        vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
-                                        vk_subbuffer{d_S, s_buf_offset, VK_WHOLE_SIZE},
-                                        vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
+                                        ggml_vk_subbuffer(ctx, d_Q, q_buf_offset),
+                                        ggml_vk_subbuffer(ctx, d_K, k_buf_offset),
+                                        ggml_vk_subbuffer(ctx, d_V, v_buf_offset),
+                                        ggml_vk_subbuffer(ctx, d_M, m_buf_offset),
+                                        ggml_vk_subbuffer(ctx, d_S, s_buf_offset),
+                                        ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0),
                                     },
                                     // We only use split_k when group query attention is enabled, which means
                                     // there's no more than one tile of rows (i.e. workgroups_x would have been
@@ -7633,21 +7647,21 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
         const std::array<uint32_t, 5> pc2 = { HSV, (uint32_t)ne1, (uint32_t)ne3, split_k, (sinks != nullptr) };
         ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_flash_attn_split_k_reduce,
                                     {
-                                        vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
-                                        vk_subbuffer{d_S, s_buf_offset, VK_WHOLE_SIZE},
-                                        vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
+                                        ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0),
+                                        ggml_vk_subbuffer(ctx, d_S, s_buf_offset),
+                                        ggml_vk_subbuffer(ctx, d_D, d_buf_offset),
                                     },
                                     pc2, { (uint32_t)ne1, HSV, (uint32_t)ne3 });
         ctx->prealloc_split_k_need_sync = true;
     } else {
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
                                     {
-                                        vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE},
-                                        vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE},
-                                        vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE},
-                                        vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
-                                        vk_subbuffer{d_S, s_buf_offset, VK_WHOLE_SIZE},
-                                        vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
+                                        ggml_vk_subbuffer(ctx, d_Q, q_buf_offset),
+                                        ggml_vk_subbuffer(ctx, d_K, k_buf_offset),
+                                        ggml_vk_subbuffer(ctx, d_V, v_buf_offset),
+                                        ggml_vk_subbuffer(ctx, d_M, m_buf_offset),
+                                        ggml_vk_subbuffer(ctx, d_S, s_buf_offset),
+                                        ggml_vk_subbuffer(ctx, d_D, d_buf_offset),
                                     },
                                     pc, { workgroups_x, workgroups_y, workgroups_z });
     }
@@ -8356,18 +8370,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
         }
     }

-    uint64_t x_sz = ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0;
-    uint64_t y_sz = use_src1 ? ggml_type_size(src1->type) * ne1 : 0;
-    uint64_t z_sz = use_src2 ? ggml_type_size(src2->type) * ne2 : 0;
-    uint64_t d_sz = ggml_type_size(dst->type) * ned;
-
     vk_buffer d_D = dst_buf_ctx->dev_buffer;

-    // Workaround for tiny tensor inputs on ROPE
-    if (op == GGML_OP_ROPE && use_src1 && y_sz > d_D->size) {
-        y_sz = VK_WHOLE_SIZE;
-    }
-
     GGML_ASSERT(d_D != nullptr);
     uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs;
     if(!src0_uma) {
@@ -8392,26 +8396,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
     z_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
     d_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);

-    if (op_supports_incontiguous) {
-        x_sz = ggml_nbytes(src0) + get_misalign_bytes(ctx, src0);
-        y_sz = use_src1 ? ggml_nbytes(src1) + get_misalign_bytes(ctx, src1) : 0;
-        z_sz = use_src2 ? ggml_nbytes(src2) + get_misalign_bytes(ctx, src2) : 0;
-        d_sz = ggml_nbytes(dst) + get_misalign_bytes(ctx, dst);
-
-        if (x_buf_offset + x_sz >= d_X->size) {
-            x_sz = VK_WHOLE_SIZE;
-        }
-        if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
-            y_sz = VK_WHOLE_SIZE;
-        }
-        if (use_src2 && z_buf_offset + z_sz >= d_Z->size) {
-            z_sz = VK_WHOLE_SIZE;
-        }
-        if (d_buf_offset + d_sz >= d_D->size) {
-            d_sz = VK_WHOLE_SIZE;
-        }
-    }
-
     std::array<uint32_t, 3> elements;

     // Single call if dimension 2 is contiguous
@@ -8602,19 +8586,31 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
         break;
     }

-    if (!op_supports_incontiguous) {
-        if (x_sz != VK_WHOLE_SIZE) {
-            x_sz *= ne02 * ne03;
-        }
-        if (use_src1 && y_sz != VK_WHOLE_SIZE) {
-            y_sz *= ne12 * ne13;
-        }
-        if (use_src2 && z_sz != VK_WHOLE_SIZE) {
-            z_sz *= ne22 * ne23;
-        }
-        if (d_sz != VK_WHOLE_SIZE) {
-            d_sz *= ned2 * ned3;
-        }
+    uint64_t x_sz, y_sz, z_sz, d_sz;
+
+    if (op_supports_incontiguous) {
+        x_sz = ggml_nbytes(src0) + get_misalign_bytes(ctx, src0);
+        y_sz = use_src1 ? ggml_nbytes(src1) + get_misalign_bytes(ctx, src1) : 0;
+        z_sz = use_src2 ? ggml_nbytes(src2) + get_misalign_bytes(ctx, src2) : 0;
+        d_sz = ggml_nbytes(dst) + get_misalign_bytes(ctx, dst);
+
+        if (x_buf_offset + x_sz >= d_X->size) {
+            x_sz = ggml_vk_get_max_buffer_range(ctx, d_X, x_buf_offset);
+        }
+        if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
+            y_sz = ggml_vk_get_max_buffer_range(ctx, d_Y, y_buf_offset);
+        }
+        if (use_src2 && z_buf_offset + z_sz >= d_Z->size) {
+            z_sz = ggml_vk_get_max_buffer_range(ctx, d_Z, z_buf_offset);
+        }
+        if (d_buf_offset + d_sz >= d_D->size) {
+            d_sz = ggml_vk_get_max_buffer_range(ctx, d_D, d_buf_offset);
+        }
+    } else {
+        x_sz = ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0 * ne02 * ne03;
+        y_sz = use_src1 ? ggml_type_size(src1->type) * ne1 * ne12 * ne13 : 0;
+        z_sz = use_src2 ? ggml_type_size(src2->type) * ne2 * ne22 * ne23 : 0;
+        d_sz = ggml_type_size(dst->type) * ned * ned2 * ned3;
     }

     if (op == GGML_OP_ADD || op == GGML_OP_RMS_NORM) {
@@ -8624,7 +8620,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
             { vk_subbuffer{ d_X, x_buf_offset, x_sz },
               vk_subbuffer{ d_Y, y_buf_offset, y_sz },
               vk_subbuffer{ d_D, d_buf_offset, d_sz },
-              vk_subbuffer{ d_A, a_buf_offset, VK_WHOLE_SIZE },
+              ggml_vk_subbuffer(ctx, d_A, a_buf_offset),
             }, pc, elements);
     } else if (op == GGML_OP_GLU) {
         // Empty src1 is possible in glu, but the shader needs a buffer
@@ -8817,18 +8813,18 @@ static void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx,
     static_assert(MAX_PARAMETER_COUNT == 12);
     ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
         {
-            vk_subbuffer{ buf[0], offset[0], VK_WHOLE_SIZE },
-            vk_subbuffer{ buf[1], offset[1], VK_WHOLE_SIZE },
-            vk_subbuffer{ buf[2], offset[2], VK_WHOLE_SIZE },
-            vk_subbuffer{ buf[3], offset[3], VK_WHOLE_SIZE },
-            vk_subbuffer{ buf[4], offset[4], VK_WHOLE_SIZE },
-            vk_subbuffer{ buf[5], offset[5], VK_WHOLE_SIZE },
-            vk_subbuffer{ buf[6], offset[6], VK_WHOLE_SIZE },
-            vk_subbuffer{ buf[7], offset[7], VK_WHOLE_SIZE },
-            vk_subbuffer{ buf[8], offset[8], VK_WHOLE_SIZE },
-            vk_subbuffer{ buf[9], offset[9], VK_WHOLE_SIZE },
-            vk_subbuffer{ buf[10], offset[10], VK_WHOLE_SIZE },
-            vk_subbuffer{ buf[11], offset[11], VK_WHOLE_SIZE },
+            ggml_vk_subbuffer(ctx, buf[0], offset[0]),
+            ggml_vk_subbuffer(ctx, buf[1], offset[1]),
+            ggml_vk_subbuffer(ctx, buf[2], offset[2]),
+            ggml_vk_subbuffer(ctx, buf[3], offset[3]),
+            ggml_vk_subbuffer(ctx, buf[4], offset[4]),
+            ggml_vk_subbuffer(ctx, buf[5], offset[5]),
+            ggml_vk_subbuffer(ctx, buf[6], offset[6]),
+            ggml_vk_subbuffer(ctx, buf[7], offset[7]),
+            ggml_vk_subbuffer(ctx, buf[8], offset[8]),
+            ggml_vk_subbuffer(ctx, buf[9], offset[9]),
+            ggml_vk_subbuffer(ctx, buf[10], offset[10]),
+            ggml_vk_subbuffer(ctx, buf[11], offset[11]),
         }, pc, elements);
 }

@@ -10002,7 +9998,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     ggml_vk_ctx_begin(ctx->device, subctx);
     for (size_t i = 0; i < num_it; i++) {
         ggml_vk_matmul(
-            ctx, subctx, p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k),
+            ctx, subctx, p, ggml_vk_subbuffer(ctx, d_X), ggml_vk_subbuffer(ctx, d_Y), ggml_vk_subbuffer(ctx, d_D), ggml_vk_subbuffer(ctx, ctx->prealloc_split_k),
             m, n, k,
             k, k, m, k*m, k*n, m*n,
             split_k, batch, batch, batch, 1, 1, n
@@ -10313,7 +10309,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
 //
 //     vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
 //     ggml_vk_ctx_begin(ctx->device, subctx);
-//     ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(x_buf), ggml_vk_subbuffer(qx_buf), ne);
+//     ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(ctx, x_buf), ggml_vk_subbuffer(ctx, qx_buf), ne);
 //     ggml_vk_ctx_end(subctx);
 //
 //     auto begin = std::chrono::high_resolution_clock::now();
Author: Jeff Bolz