mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-31 08:51:55 +00:00 
			
		
		
		
	Optimize Vulkan backend for better CPU performance and less GPU synchronization overhead. (#8943)
* Optimize Vulkan backend for better CPU performance and less GPU synchronization overhead. - Allocation overhead for the temporary std::vectors was easily detectable with a sampling profiler and simple to remove. - ggml_vk_sync_buffer introduce a full pipeline sync which has a significant cost on the GPU side, sometimes larger than the actual kernel execution. Adding only barriers for shader read/writes and transfers seems to be sufficient looking at the code which either launches compute kernels or copies tensors. * Fix small typo --------- Co-authored-by: 0cc4m <picard12@live.de>
This commit is contained in:
		| @@ -268,6 +268,10 @@ struct vk_subbuffer { | ||||
|     vk_buffer buffer; | ||||
|     uint64_t offset; | ||||
|     uint64_t size; | ||||
|  | ||||
|     operator vk::DescriptorBufferInfo() const { | ||||
|         return { buffer->buffer, offset, size }; | ||||
|     } | ||||
| }; | ||||
|  | ||||
| struct vk_semaphore { | ||||
| @@ -1063,13 +1067,14 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) { | ||||
|  | ||||
| static void ggml_vk_sync_buffers(vk_context& ctx) { | ||||
|     VK_LOG_DEBUG("ggml_vk_sync_buffers()"); | ||||
|     const std::vector<vk::MemoryBarrier> mem_barriers{ { { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite }, { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite } } }; | ||||
|  | ||||
|     ctx->s->buffer.pipelineBarrier( | ||||
|         ctx->q->stage_flags, | ||||
|         ctx->q->stage_flags, | ||||
|         {}, | ||||
|         mem_barriers, | ||||
|         { { | ||||
|           {vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite}, | ||||
|           {vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite} | ||||
|         } }, | ||||
|         {}, | ||||
|         {} | ||||
|     ); | ||||
| @@ -2420,28 +2425,23 @@ static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bo | ||||
|     return s; | ||||
| } | ||||
|  | ||||
| static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline, std::vector<vk_subbuffer>&& buffers, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) { | ||||
|  | ||||
|  | ||||
| static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list<vk::DescriptorBufferInfo> const& descriptor_buffer_infos, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) { | ||||
|     const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]); | ||||
|     const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]); | ||||
|     const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]); | ||||
|     VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {"; | ||||
|     for (auto& buffer : buffers) { | ||||
|         std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.size << "), "; | ||||
|     for (auto& buffer : descriptor_buffer_infos) { | ||||
|         std::cerr << "(" << buffer << ", " << buffer.offset << ", " << buffer.size << "), "; | ||||
|     } | ||||
|     std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))"); | ||||
|     std::vector<vk::DescriptorBufferInfo> descriptor_buffer_infos; | ||||
|     std::vector<vk::WriteDescriptorSet> write_descriptor_sets; | ||||
|     GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size()); | ||||
|     GGML_ASSERT(buffers.size() == pipeline->parameter_count); | ||||
|     vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++]; | ||||
|     for (uint32_t i = 0; i < pipeline->parameter_count; i++) { | ||||
|         descriptor_buffer_infos.push_back({buffers[i].buffer->buffer, buffers[i].offset, buffers[i].size}); | ||||
|     } | ||||
|     for (uint32_t i = 0; i < pipeline->parameter_count; i++) { | ||||
|         write_descriptor_sets.push_back({descriptor_set, i, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &descriptor_buffer_infos[i]}); | ||||
|     } | ||||
|     GGML_ASSERT(descriptor_buffer_infos.size() == pipeline->parameter_count); | ||||
|  | ||||
|     ctx->device->device.updateDescriptorSets(write_descriptor_sets, {}); | ||||
|     vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++]; | ||||
|     vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() }; | ||||
|     ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {}); | ||||
|  | ||||
|     subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants); | ||||
|     subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline); | ||||
| @@ -3123,7 +3123,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub | ||||
|     } else if (qx_needs_dequant) { | ||||
|         const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; | ||||
|         ggml_vk_sync_buffers(subctx); | ||||
|         ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); | ||||
|         ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); | ||||
|     } | ||||
|     if (y_non_contig) { | ||||
|         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); | ||||
| @@ -3312,7 +3312,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& | ||||
|     }; | ||||
|     ggml_vk_sync_buffers(subctx); | ||||
|     ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, | ||||
|                               { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23} }, | ||||
|                               { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23} }, | ||||
|                               sizeof(vk_mat_vec_push_constants), &pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z }); | ||||
| } | ||||
|  | ||||
| @@ -3384,7 +3384,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c | ||||
|     // compute | ||||
|     const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) }; | ||||
|     ggml_vk_sync_buffers(subctx); | ||||
|     ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); | ||||
|     ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); | ||||
| } | ||||
|  | ||||
| static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { | ||||
| @@ -3459,7 +3459,8 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con | ||||
|     // compute | ||||
|     const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) }; | ||||
|     ggml_vk_sync_buffers(subctx); | ||||
|     ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); | ||||
|     ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, | ||||
|         { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); | ||||
| } | ||||
|  | ||||
| static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { | ||||
| @@ -3634,7 +3635,8 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& | ||||
|     } else if (qx_needs_dequant) { | ||||
|         const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; | ||||
|         ggml_vk_sync_buffers(subctx); | ||||
|         ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); | ||||
|         ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, | ||||
|             { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); | ||||
|     } | ||||
|     if (y_non_contig) { | ||||
|         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); | ||||
| @@ -3834,7 +3836,8 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte | ||||
|     }; | ||||
|     ggml_vk_sync_buffers(subctx); | ||||
|     ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, | ||||
|         { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23}, { d_ids, ids_buf_offset, ids_sz } }, | ||||
|         { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, | ||||
|         vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23}, vk_subbuffer{ d_ids, ids_buf_offset, ids_sz } }, | ||||
|         sizeof(vk_mat_vec_id_push_constants), &pc, { groups_x, (uint32_t)nei0, groups_z }); | ||||
| } | ||||
|  | ||||
| @@ -4381,7 +4384,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co | ||||
|             } | ||||
|  | ||||
|             ggml_vk_sync_buffers(subctx); | ||||
|             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); | ||||
|             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); | ||||
|         } else if (op == GGML_OP_ROPE) { | ||||
|             // Empty src2 is possible in rope, but the shader needs a buffer | ||||
|             vk_subbuffer subbuf_z; | ||||
| @@ -4392,20 +4395,20 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co | ||||
|             } | ||||
|  | ||||
|             ggml_vk_sync_buffers(subctx); | ||||
|             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); | ||||
|             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); | ||||
|         } else if (op == GGML_OP_IM2COL) { | ||||
|             // im2col uses only src1 and dst buffers | ||||
|             ggml_vk_sync_buffers(subctx); | ||||
|             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); | ||||
|             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); | ||||
|         } else if (use_src2) { | ||||
|             ggml_vk_sync_buffers(subctx); | ||||
|             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); | ||||
|             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); | ||||
|         } else if (use_src1) { | ||||
|             ggml_vk_sync_buffers(subctx); | ||||
|             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); | ||||
|             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); | ||||
|         } else { | ||||
|             ggml_vk_sync_buffers(subctx); | ||||
|             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); | ||||
|             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); | ||||
|         } | ||||
|     } else { | ||||
|         GGML_ASSERT(op != GGML_OP_SOFT_MAX); | ||||
| @@ -4442,10 +4445,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co | ||||
|  | ||||
|                 if (use_src1) { | ||||
|                     ggml_vk_sync_buffers(subctx); | ||||
|                     ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_Y, y_buf_offset + y_offset, y_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); | ||||
|                     ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset + x_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset + y_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); | ||||
|                 } else { | ||||
|                     ggml_vk_sync_buffers(subctx); | ||||
|                     ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); | ||||
|                     ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset + x_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Markus Tavenrath
					Markus Tavenrath