Mirror of https://github.com/ggml-org/llama.cpp.git
	Overlap cmdbuffer creation and cmdbuffer execution in Vulkan backend by submitting smaller cmdbuffers early. (#9118)
* Overlap cmdbuffer creation and cmdbuffer execution in Vulkan backend by submitting smaller cmdbuffers early.
* fix compile issues
* Fix issues where the last submit wasn't executed or handled properly.
* remove trailing whitespace
* Repair GGML_VULKAN_CHECK_RESULTS
* Increase submit counter only if actual work has been submitted and increase submit count to 100.
* Fix some nodes are not checked with GGML_VULKAN_CHECK_RESULTS enabled.
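
For orientation, the core idea of the change is sketched below in plain, self-contained C++. record_node and submit_batch are hypothetical stand-ins for ggml_vk_build_graph and ggml_vk_submit, not part of the backend API; the point is only the batching pattern: instead of recording command buffers for the whole graph and submitting once at the end, work is flushed to the GPU roughly every 100 nodes so CPU-side command buffer recording overlaps with GPU execution, and only the final submit is waited on.

// A minimal sketch (not the actual backend code) of submitting in batches.
#include <cstdio>

// Stand-in for ggml_vk_build_graph(): records one node, returns true if it
// actually enqueued GPU work (empty/no-op nodes return false).
static bool record_node(int node_idx) {
    std::printf("record node %d\n", node_idx);
    return true;
}

// Stand-in for ggml_vk_submit(): hands a recorded batch to the GPU queue.
// Only the final batch is waited on with a fence.
static void submit_batch(int first, int last, bool wait) {
    std::printf("submit nodes [%d, %d]%s\n", first, last, wait ? " and wait on fence" : "");
}

int main() {
    const int n_nodes = 250;
    const int last_node = n_nodes - 1;
    constexpr int submit_count = 100;   // batch size used by the commit

    bool first_node_in_batch = true;    // next node starts a new batch
    int  submit_node_idx = 0;           // first node of the current batch
    int  submitted_nodes = 0;

    for (int i = 0; i < n_nodes; i++) {
        if (first_node_in_batch) {
            submit_node_idx = i;
        }

        // flush early every submit_count nodes; always flush at the last node
        const bool submit = (submitted_nodes >= submit_count) || (i == last_node);

        if (record_node(i)) {
            ++submitted_nodes;
            first_node_in_batch = false;
        }

        if (submit) {
            submit_batch(submit_node_idx, i, /*wait=*/i == last_node);
            first_node_in_batch = true;
            submitted_nodes = 0;
        }
    }
    return 0;
}

The real loop lives in ggml_backend_vk_graph_compute in the last hunk below; there the submit happens inside ggml_vk_build_graph when the submit flag is set.
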
@@ -787,6 +787,9 @@ static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, s
 
 static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
     if (ctx->seqs.empty()) {
+        if (fence) {
+            ctx->q->queue.submit({}, fence);
+        }
         return;
     }
     VK_LOG_DEBUG("ggml_vk_submit(" << ctx << ", " << fence << ")");
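
The hunk above closes a hang that the early-submit scheme would otherwise introduce: once only the last batch carries a fence, a context whose sequence list is empty must still signal that fence, or the final waitForFences would block forever. A toy model of the invariant, using a condition-variable "fence" as a hypothetical stand-in for vk::Fence (nothing here is the real Vulkan API):

#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <vector>

// Toy fence standing in for vk::Fence: signaled once, waited on by the host.
struct Fence {
    std::mutex m;
    std::condition_variable cv;
    bool signaled = false;
    void signal() { { std::lock_guard<std::mutex> lk(m); signaled = true; } cv.notify_all(); }
    void wait()   { std::unique_lock<std::mutex> lk(m); cv.wait(lk, [this] { return signaled; }); }
};

// Stand-in for ggml_vk_submit(): when there is no queued work, the fence must
// still be signaled (the real code does ctx->q->queue.submit({}, fence)),
// otherwise a later waitForFences-style wait would block forever.
static void submit(const std::vector<int>& work, Fence* fence) {
    if (work.empty()) {
        if (fence) {
            fence->signal();
        }
        return;
    }
    for (int w : work) {
        std::printf("execute %d\n", w);   // real code: the GPU executes, then signals
    }
    if (fence) {
        fence->signal();
    }
}

int main() {
    Fence fence;
    submit({}, &fence);   // empty batch: without the early signal, wait() would hang
    fence.wait();
    std::printf("done\n");
    return 0;
}

In the real backend the signal comes from the device when the (possibly empty) submission completes; the point is only that the fence is always attached to some submission.
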
@@ -5658,11 +5661,15 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
     }
 }
 
-static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, bool last_node, bool dryrun){
+static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence);
+
+// Returns true if node has enqueued work into the queue, false otherwise
+// If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution.
+static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
 
     if (ggml_is_empty(node) || extra == nullptr) {
-        return;
+        return false;
     }
 
     VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")");
@@ -5679,7 +5686,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_PERMUTE:
     case GGML_OP_TRANSPOSE:
     case GGML_OP_NONE:
-        return;
+        return false;
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(node)) {
         case GGML_UNARY_OP_SILU:
@@ -5689,7 +5696,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         case GGML_UNARY_OP_TANH:
             break;
         default:
-            return;
+            return false;
         }
         break;
     case GGML_OP_REPEAT:
@@ -5726,7 +5733,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     default:
         std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
         GGML_ABORT("fatal error");
-        return;
+        return false;
     }
 
     vk_context compute_ctx;
@@ -5826,7 +5833,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
             ggml_vk_unary(ctx, compute_ctx, src0, node, dryrun);
             break;
         default:
-            return;
+            return false;
         }
         break;
     case GGML_OP_DIAG_MASK_INF:
@@ -5870,11 +5877,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 
         break;
     default:
-        return;
+        return false;
     }
 
     if (dryrun) {
-        return;
+        return false;
     }
 
     ctx->tensor_ctxs[node_idx] = compute_ctx;
@@ -5885,14 +5892,34 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     last_node = true;
 #endif
 
-    if (last_node) {
+    if (submit || last_node) {
         ggml_vk_ctx_end(compute_ctx);
-        compute_ctx->exit_tensor_idx = node_idx;
+
+        // TODO probably it'd be better to pass a exit_node flag to ggml_vk_compute_forward
+        if (last_node) {
+            compute_ctx->exit_tensor_idx = node_idx_begin;
+        }
+        else {
+            compute_ctx->exit_tensor_idx = -1;
+        }
+
         ctx->compute_ctx.reset();
+
+        bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false);
+        if (!ok) {
+            if (node->op == GGML_OP_UNARY) {
+                std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
+            }
+            else {
+                std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
+            }
+        }
+
     }
+    return true;
 }
 
-static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx){
+static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true){
     ggml_tensor_extra_gpu * extra = nullptr;
 
     switch (tensor->op) {
@@ -5960,40 +5987,38 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
 
     VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")");
 
-#ifdef GGML_VULKAN_CHECK_RESULTS
-    ggml_vk_check_results_0(tensor);
-#endif
-
     vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock();
 
-#ifdef GGML_VULKAN_PERF
-    std::chrono::steady_clock::time_point start;
-#endif // GGML_VULKAN_PERF
+    // always wait for the GPU work to be done for the last submit
+    if (tensor_idx == subctx->exit_tensor_idx) {
+        use_fence = true;
+    }
 
     // Only run if ctx hasn't been submitted yet
     if (!subctx->seqs.empty()) {
+#ifdef GGML_VULKAN_CHECK_RESULTS
+        ggml_vk_check_results_0(tensor);
+        use_fence = true;
+#endif
+
         // Do staging buffer copies
         for (auto& cpy : subctx->in_memcpys) {
            memcpy(cpy.dst, cpy.src, cpy.n);
         }
 
-#ifdef GGML_VULKAN_PERF
-        start = std::chrono::steady_clock::now();
-#endif // GGML_VULKAN_PERF
+        ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
 
-        ggml_vk_submit(subctx, ctx->fence);
+        if (use_fence) {
+            VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
+
+            ctx->device->device.resetFences({ ctx->fence });
+        }
+#ifdef GGML_VULKAN_CHECK_RESULTS
+        ggml_vk_check_results_1(tensor);
+#endif
     }
 
     if (tensor_idx == subctx->exit_tensor_idx) {
-        VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
-
-#ifdef GGML_VULKAN_PERF
-        auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start);
-        ctx->device->perf_logger->log_timing(tensor, duration.count());
-#endif // GGML_VULKAN_PERF
-
-        ctx->device->device.resetFences({ ctx->fence });
-
         // Do staging buffer copies
         for (auto& cpy : subctx->out_memcpys) {
             memcpy(cpy.dst, cpy.src, cpy.n);
@@ -6482,7 +6507,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, 0, true);
+        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
     }
     ggml_vk_preallocate_buffers(ctx);
     ggml_pipeline_allocate_descriptor_sets(ctx->device);
@@ -6497,31 +6522,36 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
     // Reserve tensor context space for all nodes
     ctx->tensor_ctxs.resize(cgraph->n_nodes);
 
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, i == last_node, false);
-    }
+    bool first_node_in_batch = true; // true if next node will be first node in a batch
+    int submit_node_idx = 0; // index to first node in a batch
 
+    // submit work every submit_count node to overlap CPU cmdbuffer generation with GPU execution
+    constexpr int submit_count = 100;
+    int submitted_nodes = 0;
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_tensor * node = cgraph->nodes[i];
-
-        if (ggml_vk_is_empty(node)) {
-            continue;
+        if (first_node_in_batch) {
+            submit_node_idx = i;
         }
 
-        bool ok = ggml_vk_compute_forward(ctx, node, i);
-        if (!ok) {
-            if (node->op == GGML_OP_UNARY) {
-                std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
-            } else {
-                std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
+        bool submit = (submitted_nodes >= submit_count) || (i == last_node);
+
+        bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
+
+        if (enqueued) {
+            ++submitted_nodes;
+
+#ifndef GGML_VULKAN_CHECK_RESULTS
+            if (first_node_in_batch) {
+                first_node_in_batch = false;
             }
-        }
-#ifdef GGML_VULKAN_CHECK_RESULTS
-        else {
-            ggml_vk_check_results_1(node);
-        }
 #endif
-        GGML_ASSERT(ok);
+        }
+
+        if (submit) {
+            first_node_in_batch = true;
+            submitted_nodes = 0;
+        }
     }
 
 #ifdef GGML_VULKAN_PERF
Markus Tavenrath