From 3f62ee8bee97deae630c2e0b58b550bcf7fa69cf Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 9 Sep 2025 17:06:46 +0300 Subject: [PATCH] metal : back to a single queue per device ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 51 ++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 4a67046c97..5cd8893eed 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -48,6 +48,8 @@ static struct ggml_backend_metal_device_context { int mtl_device_ref_count; id mtl_library; + id mtl_queue; + NSLock * mtl_lock; bool has_simdgroup_reduction; @@ -69,6 +71,7 @@ static struct ggml_backend_metal_device_context { /*.mtl_device =*/ nil, /*.mtl_device_ref_count =*/ 0, /*.mtl_library =*/ nil, + /*.mtl_queue =*/ nil, /*.mtl_lock =*/ nil, /*.has_simdgroup_reduction =*/ false, /*.has_simdgroup_mm =*/ false, @@ -94,6 +97,8 @@ static id ggml_backend_metal_device_acq(struct ggml_backend_metal_dev ctx->mtl_device = MTLCreateSystemDefaultDevice(); if (ctx->mtl_device) { + ctx->mtl_queue = [ctx->mtl_device newCommandQueue]; + ctx->has_simdgroup_reduction = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7]; ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML]; @@ -161,6 +166,11 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte ctx->mtl_library = nil; } + if (ctx->mtl_queue) { + [ctx->mtl_queue release]; + ctx->mtl_queue = nil; + } + if (ctx->mtl_device) { [ctx->mtl_device release]; ctx->mtl_device = nil; @@ -1003,7 +1013,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); ctx->device = device; - ctx->queue = [device newCommandQueue]; + ctx->queue = ctx_dev->mtl_queue; if (ctx->queue == nil) { GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__); return NULL; @@ -1670,8 +1680,6 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { Block_release(ctx->encode_async); - [ctx->queue release]; - for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { if (ctx->cmd_bufs[i].obj) { [ctx->cmd_bufs[i].obj release]; @@ -1709,6 +1717,7 @@ struct ggml_backend_metal_buffer_context { id rset; id device; + id queue; }; // rset init @@ -5776,6 +5785,8 @@ static enum ggml_status ggml_metal_graph_compute( dispatch_apply(n_cb, ctx->d_queue, ctx->encode_async); + [ctx->cmd_buf_last waitUntilScheduled]; + // for debugging: block until graph is computed //[ctx->cmd_buf_last waitUntilCompleted]; @@ -5871,7 +5882,7 @@ static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; @autoreleasepool { - id queue = [ctx->device newCommandQueue]; + id queue = ctx->queue; id cmd_buf = [queue commandBuffer]; id encoder = [cmd_buf blitCommandEncoder]; [cmd_buf enqueue]; @@ -5888,11 +5899,7 @@ static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer [encoder endEncoding]; [cmd_buf commit]; - [cmd_buf waitUntilCompleted]; - - // note: not sure why this release is necessary as we are inside an autoreleasepool block - // but without it, we get "Context leak detected" warnings - [queue release]; + [cmd_buf waitUntilScheduled]; } #else memset((char *)tensor->data + offset, value, size); @@ -5906,15 +5913,16 @@ static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, s struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; @autoreleasepool { - id queue = [ctx->device newCommandQueue]; + id queue = ctx->queue; id cmd_buf = [queue commandBuffer]; id encoder = [cmd_buf blitCommandEncoder]; [cmd_buf enqueue]; // TODO: is this an extra copy? can we avoid it? - id buf_src = [ctx->device newBufferWithBytes:data - length:size - options:MTLResourceStorageModeShared]; + id buf_src = [ctx->device newBufferWithBytesNoCopy:data + length:size + options:MTLResourceStorageModeShared + deallocator:nil]; size_t buf_dst_offset = 0; id buf_dst = ggml_metal_get_buffer(tensor, &buf_dst_offset); @@ -5929,10 +5937,9 @@ static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, s [encoder endEncoding]; + // note: no need to wait for completion here [cmd_buf commit]; - [cmd_buf waitUntilCompleted]; - - [queue release]; + [cmd_buf waitUntilScheduled]; } #else memcpy((char *)tensor->data + offset, data, size); @@ -5946,7 +5953,7 @@ static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, c struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; @autoreleasepool { - id queue = [ctx->device newCommandQueue]; + id queue = ctx->queue; id cmd_buf = [queue commandBuffer]; id encoder = [cmd_buf blitCommandEncoder]; [cmd_buf enqueue]; @@ -5971,8 +5978,6 @@ static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, c [cmd_buf commit]; [cmd_buf waitUntilCompleted]; - - [queue release]; } #else memcpy(data, (const char *)tensor->data + offset, size); @@ -5997,7 +6002,7 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_ struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; @autoreleasepool { - id queue = [ctx->device newCommandQueue]; + id queue = ctx->queue; id cmd_buf = [queue commandBuffer]; id encoder = [cmd_buf blitCommandEncoder]; [cmd_buf enqueue]; @@ -6009,9 +6014,7 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_ [encoder endEncoding]; [cmd_buf commit]; - [cmd_buf waitUntilCompleted]; - - [queue release]; + [cmd_buf waitUntilScheduled]; } #else memset(ctx->all_data, value, ctx->all_size); @@ -6088,6 +6091,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba ctx->all_size = size_aligned; ctx->device = device; + ctx->queue = ctx_dev->mtl_queue; ctx->n_buffers = 1; @@ -6606,6 +6610,7 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back id device = ctx_dev->mtl_device; ctx->device = device; + ctx->queue = ctx_dev->mtl_queue; // the buffer fits into the max buffer size allowed by the device if (size_aligned <= device.maxBufferLength) {