mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-30 08:42:00 +00:00)
	vulkan: Hybrid waitForFences/getFenceStatus to reduce fence latency (#12630)
There seems to be a bubble waking up from waitForFences, which costs a few percent of performance and also increases variance in performance. This change inserts an "almost_ready" fence when the graph is about 80% complete. We waitForFences on the almost_ready fence and then spin (with _mm_pause) waiting for the final fence to be signaled.
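Not part of the commit, but to make the hybrid idea concrete: the same wait shape can be sketched in portable C++, with a condition variable standing in for the blocking waitForFences on the almost_ready fence and an atomic flag standing in for the final fence. All names below are illustrative only.

// Standalone sketch of the hybrid wait pattern (not from the commit).
// A worker thread "signals" two stand-in fences the way the GPU would.
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <mutex>
#include <thread>

#if defined(__x86_64__) || defined(__i386__)
#include <immintrin.h>
#define YIELD() _mm_pause()
#else
#define YIELD()
#endif

int main() {
    std::mutex m;
    std::condition_variable cv;
    bool almost_ready = false;      // stands in for almost_ready_fence
    std::atomic<bool> done{false};  // stands in for the final fence

    std::thread gpu([&] {
        std::this_thread::sleep_for(std::chrono::milliseconds(8)); // ~80% of the graph
        { std::lock_guard<std::mutex> lk(m); almost_ready = true; }
        cv.notify_one();
        std::this_thread::sleep_for(std::chrono::milliseconds(2)); // the last ~20%
        done.store(true, std::memory_order_release);
    });

    // A cheap blocking wait covers most of the execution time...
    { std::unique_lock<std::mutex> lk(m); cv.wait(lk, [&] { return almost_ready; }); }
    // ...then a short spin-with-pause catches the final signal with low latency.
    while (!done.load(std::memory_order_acquire)) {
        YIELD();
    }
    gpu.join();
}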
@@ -24,6 +24,28 @@
 #include <future>
 #include <thread>
 
+#if defined(_MSC_VER)
+# define NOMINMAX 1
+# include <windows.h>
+# define YIELD() YieldProcessor()
+#elif defined(__clang__) || defined(__GNUC__)
+# if defined(__x86_64__) || defined(__i386__)
+#  include <immintrin.h>
+#  define YIELD() _mm_pause()
+# elif defined(__arm__) || defined(__aarch64__)
+#  if defined(__clang__)
+#   include <arm_acle.h>
+#   define YIELD() __yield()
+#  else
+#   define YIELD() asm volatile("yield")
+#  endif
+# endif
+#endif
+
+#if !defined(YIELD)
+#define YIELD()
+#endif
+
 #include "ggml-impl.h"
 #include "ggml-backend-impl.h"
 
@@ -787,7 +809,8 @@ struct ggml_backend_vk_context {
     ggml_vk_garbage_collector gc;
     size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k;
     vk_buffer prealloc_x, prealloc_y, prealloc_split_k;
-    vk::Fence fence;
+    vk::Fence fence, almost_ready_fence;
+    bool almost_ready_fence_pending {};
 
     vk_buffer buffer_pool[MAX_VK_BUFFERS];
 
@@ -878,6 +901,39 @@ typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx
 
 static void ggml_backend_vk_free(ggml_backend_t backend);
 
+// Wait for ctx->fence to be signaled.
+static void ggml_vk_wait_for_fence(ggml_backend_vk_context * ctx) {
+    // Use waitForFences while most of the graph executes. Hopefully the CPU can sleep
+    // during this wait.
+    if (ctx->almost_ready_fence_pending) {
+        VK_CHECK(ctx->device->device.waitForFences({ ctx->almost_ready_fence }, true, UINT64_MAX), "almost_ready_fence");
+        ctx->device->device.resetFences({ ctx->almost_ready_fence });
+        ctx->almost_ready_fence_pending = false;
+    }
+
+    // Spin (w/pause) waiting for the graph to finish executing.
+    vk::Result result;
+    while ((result = ctx->device->device.getFenceStatus(ctx->fence)) != vk::Result::eSuccess) {
+        if (result != vk::Result::eNotReady) {
+            fprintf(stderr, "ggml_vulkan: error %s at %s:%d\n", to_string(result).c_str(), __FILE__, __LINE__);
+            exit(1);
+        }
+        for (uint32_t i = 0; i < 100; ++i) {
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+            YIELD();
+        }
+    }
+    ctx->device->device.resetFences({ ctx->fence });
+}
+
 // variables to track number of compiles in progress
 static uint32_t compile_count = 0;
 static std::mutex compile_count_mutex;
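A side note on the loop above (an editor's observation, not the patch author's): each getFenceStatus call crosses into the driver, so the code executes roughly a thousand pause instructions per status query. The same shape, generalized over a hypothetical poll() functor, might look like this sketch:

#include <atomic>
#include <cstdint>
#include <thread>

#ifndef YIELD
#define YIELD() // fallback; see the architecture-specific macro block above
#endif

// Hypothetical helper (not in the commit): the getFenceStatus/YIELD loop
// generalized over any poll() functor that returns true once work completes.
template <typename Poll>
static void spin_wait(Poll && poll) {
    while (!poll()) {
        // Batch ~1000 pauses per poll so the comparatively expensive
        // status query stays off the hot path.
        for (uint32_t i = 0; i < 1000; ++i) {
            YIELD();
        }
    }
}

int main() {
    std::atomic<bool> done{false};
    std::thread t([&] { done.store(true); });
    spin_wait([&] { return done.load(); });
    t.join();
}

ggml_vk_wait_for_fence is essentially spin_wait over getFenceStatus, with the VK_CHECK-style error handling folded in.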
@@ -3355,6 +3411,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
     ctx->prealloc_size_split_k = 0;
 
     ctx->fence = ctx->device->device.createFence({});
+    ctx->almost_ready_fence = ctx->device->device.createFence({});
 
 #ifdef GGML_VULKAN_CHECK_RESULTS
     const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS");
@@ -7959,11 +8016,11 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
     }
 }
 
-static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence);
+static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence, bool almost_ready);
 
 // Returns true if node has enqueued work into the queue, false otherwise
 // If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution.
-static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){
+static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool almost_ready, bool submit){
     if (ggml_is_empty(node) || !node->buffer) {
         return false;
     }
@@ -8335,7 +8392,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 
         ctx->compute_ctx.reset();
 
-        bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false);
+        bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false, almost_ready);
         if (!ok) {
             if (node->op == GGML_OP_UNARY) {
                 std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
@@ -8349,7 +8406,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     return true;
 }
 
-static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true){
+static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true, bool almost_ready = false) {
     ggml_backend_buffer * buf = nullptr;
 
     switch (tensor->op) {
@@ -8452,12 +8509,15 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
             memcpy(cpy.dst, cpy.src, cpy.n);
         }
 
-        ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
+        if (almost_ready && !ctx->almost_ready_fence_pending && !use_fence) {
+            ggml_vk_submit(subctx, ctx->almost_ready_fence);
+            ctx->almost_ready_fence_pending = true;
+        } else {
+            ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
+        }
 
         if (use_fence) {
-            VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
-
-            ctx->device->device.resetFences({ ctx->fence });
+            ggml_vk_wait_for_fence(ctx);
         }
 #ifdef GGML_VULKAN_CHECK_RESULTS
         ggml_vk_check_results_1(tensor);
@@ -8543,6 +8603,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
     ctx->gc.events.clear();
 
     ctx->device->device.destroyFence(ctx->fence);
+    ctx->device->device.destroyFence(ctx->almost_ready_fence);
 }
 
 static int ggml_vk_get_device_count() {
@@ -8889,8 +8950,7 @@ static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
     }
 
     ggml_vk_submit(transfer_ctx, ctx->fence);
-    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_backend_vk_synchronize waitForFences");
-    ctx->device->device.resetFences({ ctx->fence });
+    ggml_vk_wait_for_fence(ctx);
 
     for (auto& cpy : transfer_ctx->out_memcpys) {
         memcpy(cpy.dst, cpy.src, cpy.n);
@@ -8909,7 +8969,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
 
     uint64_t total_mat_mul_bytes = 0;
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
+        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false, false);
         if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) {
             total_mat_mul_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
         }
@@ -8951,11 +9011,14 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
             mul_mat_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]);
         }
 
+        // Signal the almost_ready fence when the graph is mostly complete (< 20% remaining)
+        bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5;
         bool submit = (submitted_nodes >= nodes_per_submit) ||
                       (mul_mat_bytes >= mul_mat_bytes_per_submit) ||
-                      (i == last_node);
+                      (i == last_node) ||
+                      (almost_ready && !ctx->almost_ready_fence_pending);
 
-        bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
+        bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, almost_ready, submit);
 
         if (enqueued) {
             ++submitted_nodes;
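To see where the cutoff lands (an illustration with an assumed graph size, not taken from the commit): for n_nodes = 100, (n_nodes - i) < n_nodes / 5 first holds at i = 81, so roughly the last fifth of the nodes execute behind the almost_ready fence.

#include <cstdio>

// Illustration only: find the first node index at which almost_ready
// becomes true, per the expression used in ggml_backend_vk_graph_compute.
int main() {
    const int n_nodes = 100; // assumed graph size for this example
    for (int i = 0; i < n_nodes; i++) {
        if ((n_nodes - i) < n_nodes / 5) {
            printf("almost_ready first true at node %d of %d\n", i, n_nodes);
            return 0;
        }
    }
}

This prints "almost_ready first true at node 81 of 100", matching the "about 80% complete" figure in the commit message.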
@@ -8967,7 +9030,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
 #endif
         }
 
-        if (submit) {
+        if (submit && enqueued) {
             first_node_in_batch = true;
             submitted_nodes = 0;
             mul_mat_bytes = 0;
Jeff Bolz