mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-31 08:51:55 +00:00 
			
		
		
		
	Optimize Vulkan backend for better CPU performance and less GPU synchronization overhead. (#8943)
* Optimize Vulkan backend for better CPU performance and less GPU synchronization overhead. - Allocation overhead for the temporary std::vectors was easily detectable with a sampling profiler and simple to remove. - ggml_vk_sync_buffer introduce a full pipeline sync which has a significant cost on the GPU side, sometimes larger than the actual kernel execution. Adding only barriers for shader read/writes and transfers seems to be sufficient looking at the code which either launches compute kernels or copies tensors. * Fix small typo --------- Co-authored-by: 0cc4m <picard12@live.de>
This commit is contained in:
		| @@ -268,6 +268,10 @@ struct vk_subbuffer { | |||||||
|     vk_buffer buffer; |     vk_buffer buffer; | ||||||
|     uint64_t offset; |     uint64_t offset; | ||||||
|     uint64_t size; |     uint64_t size; | ||||||
|  |  | ||||||
|  |     operator vk::DescriptorBufferInfo() const { | ||||||
|  |         return { buffer->buffer, offset, size }; | ||||||
|  |     } | ||||||
| }; | }; | ||||||
|  |  | ||||||
| struct vk_semaphore { | struct vk_semaphore { | ||||||
| @@ -1063,13 +1067,14 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) { | |||||||
|  |  | ||||||
| static void ggml_vk_sync_buffers(vk_context& ctx) { | static void ggml_vk_sync_buffers(vk_context& ctx) { | ||||||
|     VK_LOG_DEBUG("ggml_vk_sync_buffers()"); |     VK_LOG_DEBUG("ggml_vk_sync_buffers()"); | ||||||
|     const std::vector<vk::MemoryBarrier> mem_barriers{ { { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite }, { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite } } }; |  | ||||||
|  |  | ||||||
|     ctx->s->buffer.pipelineBarrier( |     ctx->s->buffer.pipelineBarrier( | ||||||
|         ctx->q->stage_flags, |         ctx->q->stage_flags, | ||||||
|         ctx->q->stage_flags, |         ctx->q->stage_flags, | ||||||
|         {}, |         {}, | ||||||
|         mem_barriers, |         { { | ||||||
|  |           {vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite}, | ||||||
|  |           {vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite} | ||||||
|  |         } }, | ||||||
|         {}, |         {}, | ||||||
|         {} |         {} | ||||||
|     ); |     ); | ||||||
| @@ -2420,28 +2425,23 @@ static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bo | |||||||
|     return s; |     return s; | ||||||
| } | } | ||||||
|  |  | ||||||
| static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline, std::vector<vk_subbuffer>&& buffers, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) { |  | ||||||
|  |  | ||||||
|  | static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list<vk::DescriptorBufferInfo> const& descriptor_buffer_infos, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) { | ||||||
|     const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]); |     const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]); | ||||||
|     const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]); |     const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]); | ||||||
|     const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]); |     const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]); | ||||||
|     VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {"; |     VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {"; | ||||||
|     for (auto& buffer : buffers) { |     for (auto& buffer : descriptor_buffer_infos) { | ||||||
|         std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.size << "), "; |         std::cerr << "(" << buffer << ", " << buffer.offset << ", " << buffer.size << "), "; | ||||||
|     } |     } | ||||||
|     std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))"); |     std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))"); | ||||||
|     std::vector<vk::DescriptorBufferInfo> descriptor_buffer_infos; |  | ||||||
|     std::vector<vk::WriteDescriptorSet> write_descriptor_sets; |  | ||||||
|     GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size()); |     GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size()); | ||||||
|     GGML_ASSERT(buffers.size() == pipeline->parameter_count); |     GGML_ASSERT(descriptor_buffer_infos.size() == pipeline->parameter_count); | ||||||
|     vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++]; |  | ||||||
|     for (uint32_t i = 0; i < pipeline->parameter_count; i++) { |  | ||||||
|         descriptor_buffer_infos.push_back({buffers[i].buffer->buffer, buffers[i].offset, buffers[i].size}); |  | ||||||
|     } |  | ||||||
|     for (uint32_t i = 0; i < pipeline->parameter_count; i++) { |  | ||||||
|         write_descriptor_sets.push_back({descriptor_set, i, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &descriptor_buffer_infos[i]}); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     ctx->device->device.updateDescriptorSets(write_descriptor_sets, {}); |     vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++]; | ||||||
|  |     vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() }; | ||||||
|  |     ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {}); | ||||||
|  |  | ||||||
|     subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants); |     subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants); | ||||||
|     subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline); |     subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline); | ||||||
| @@ -3123,7 +3123,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub | |||||||
|     } else if (qx_needs_dequant) { |     } else if (qx_needs_dequant) { | ||||||
|         const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; |         const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; | ||||||
|         ggml_vk_sync_buffers(subctx); |         ggml_vk_sync_buffers(subctx); | ||||||
|         ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); |         ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); | ||||||
|     } |     } | ||||||
|     if (y_non_contig) { |     if (y_non_contig) { | ||||||
|         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); |         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); | ||||||
| @@ -3312,7 +3312,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& | |||||||
|     }; |     }; | ||||||
|     ggml_vk_sync_buffers(subctx); |     ggml_vk_sync_buffers(subctx); | ||||||
|     ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, |     ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, | ||||||
|                               { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23} }, |                               { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23} }, | ||||||
|                               sizeof(vk_mat_vec_push_constants), &pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z }); |                               sizeof(vk_mat_vec_push_constants), &pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z }); | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -3384,7 +3384,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c | |||||||
|     // compute |     // compute | ||||||
|     const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) }; |     const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) }; | ||||||
|     ggml_vk_sync_buffers(subctx); |     ggml_vk_sync_buffers(subctx); | ||||||
|     ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); |     ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); | ||||||
| } | } | ||||||
|  |  | ||||||
| static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { | static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { | ||||||
| @@ -3459,7 +3459,8 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con | |||||||
|     // compute |     // compute | ||||||
|     const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) }; |     const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) }; | ||||||
|     ggml_vk_sync_buffers(subctx); |     ggml_vk_sync_buffers(subctx); | ||||||
|     ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); |     ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, | ||||||
|  |         { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); | ||||||
| } | } | ||||||
|  |  | ||||||
| static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { | static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { | ||||||
| @@ -3634,7 +3635,8 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& | |||||||
|     } else if (qx_needs_dequant) { |     } else if (qx_needs_dequant) { | ||||||
|         const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; |         const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; | ||||||
|         ggml_vk_sync_buffers(subctx); |         ggml_vk_sync_buffers(subctx); | ||||||
|         ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); |         ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, | ||||||
|  |             { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); | ||||||
|     } |     } | ||||||
|     if (y_non_contig) { |     if (y_non_contig) { | ||||||
|         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); |         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); | ||||||
| @@ -3834,7 +3836,8 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte | |||||||
|     }; |     }; | ||||||
|     ggml_vk_sync_buffers(subctx); |     ggml_vk_sync_buffers(subctx); | ||||||
|     ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, |     ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, | ||||||
|         { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23}, { d_ids, ids_buf_offset, ids_sz } }, |         { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, | ||||||
|  |         vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23}, vk_subbuffer{ d_ids, ids_buf_offset, ids_sz } }, | ||||||
|         sizeof(vk_mat_vec_id_push_constants), &pc, { groups_x, (uint32_t)nei0, groups_z }); |         sizeof(vk_mat_vec_id_push_constants), &pc, { groups_x, (uint32_t)nei0, groups_z }); | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -4381,7 +4384,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co | |||||||
|             } |             } | ||||||
|  |  | ||||||
|             ggml_vk_sync_buffers(subctx); |             ggml_vk_sync_buffers(subctx); | ||||||
|             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); |             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); | ||||||
|         } else if (op == GGML_OP_ROPE) { |         } else if (op == GGML_OP_ROPE) { | ||||||
|             // Empty src2 is possible in rope, but the shader needs a buffer |             // Empty src2 is possible in rope, but the shader needs a buffer | ||||||
|             vk_subbuffer subbuf_z; |             vk_subbuffer subbuf_z; | ||||||
| @@ -4392,20 +4395,20 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co | |||||||
|             } |             } | ||||||
|  |  | ||||||
|             ggml_vk_sync_buffers(subctx); |             ggml_vk_sync_buffers(subctx); | ||||||
|             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); |             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); | ||||||
|         } else if (op == GGML_OP_IM2COL) { |         } else if (op == GGML_OP_IM2COL) { | ||||||
|             // im2col uses only src1 and dst buffers |             // im2col uses only src1 and dst buffers | ||||||
|             ggml_vk_sync_buffers(subctx); |             ggml_vk_sync_buffers(subctx); | ||||||
|             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); |             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); | ||||||
|         } else if (use_src2) { |         } else if (use_src2) { | ||||||
|             ggml_vk_sync_buffers(subctx); |             ggml_vk_sync_buffers(subctx); | ||||||
|             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); |             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); | ||||||
|         } else if (use_src1) { |         } else if (use_src1) { | ||||||
|             ggml_vk_sync_buffers(subctx); |             ggml_vk_sync_buffers(subctx); | ||||||
|             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); |             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); | ||||||
|         } else { |         } else { | ||||||
|             ggml_vk_sync_buffers(subctx); |             ggml_vk_sync_buffers(subctx); | ||||||
|             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); |             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); | ||||||
|         } |         } | ||||||
|     } else { |     } else { | ||||||
|         GGML_ASSERT(op != GGML_OP_SOFT_MAX); |         GGML_ASSERT(op != GGML_OP_SOFT_MAX); | ||||||
| @@ -4442,10 +4445,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co | |||||||
|  |  | ||||||
|                 if (use_src1) { |                 if (use_src1) { | ||||||
|                     ggml_vk_sync_buffers(subctx); |                     ggml_vk_sync_buffers(subctx); | ||||||
|                     ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_Y, y_buf_offset + y_offset, y_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); |                     ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset + x_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset + y_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); | ||||||
|                 } else { |                 } else { | ||||||
|                     ggml_vk_sync_buffers(subctx); |                     ggml_vk_sync_buffers(subctx); | ||||||
|                     ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); |                     ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset + x_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Markus Tavenrath
					Markus Tavenrath