From 85aaf52b7ebf4e1e4cbe6ba33793a222b89356c7 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 9 Sep 2025 11:45:09 +0300
Subject: [PATCH] metal : create only metal buffers, no wrapping of host
 memory

ggml-ci
---
 ggml/src/ggml-metal/ggml-metal.m | 318 ++++++++++++++++++++-----------
 1 file changed, 213 insertions(+), 105 deletions(-)

diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 75b26820f7..9dab47a987 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -44,9 +44,10 @@ static struct ggml_backend_device g_ggml_backend_metal_device;
 // note: assumes single GPU device - the default one
 // TODO: support multiple GPU devices
 static struct ggml_backend_metal_device_context {
-    id<MTLDevice>  mtl_device;
-    int            mtl_device_ref_count;
-    id<MTLLibrary> mtl_library;
+    id<MTLDevice>       mtl_device;
+    int                 mtl_device_ref_count;
+    id<MTLCommandQueue> mtl_queue;
+    id<MTLLibrary>      mtl_library;
 
     NSLock * mtl_lock;
 
@@ -68,6 +69,7 @@ static struct ggml_backend_metal_device_context {
 } g_ggml_ctx_dev_main = {
     /*.mtl_device           =*/ nil,
     /*.mtl_device_ref_count =*/ 0,
+    /*.mtl_queue            =*/ nil,
     /*.mtl_library          =*/ nil,
     /*.mtl_lock             =*/ nil,
     /*.has_simdgroup_reduction =*/ false,
@@ -94,6 +96,9 @@ static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_dev
         ctx->mtl_device = MTLCreateSystemDefaultDevice();
 
         if (ctx->mtl_device) {
+            ctx->mtl_queue = [ctx->mtl_device newCommandQueue];
+            [ctx->mtl_queue retain];
+
             ctx->has_simdgroup_reduction  = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
             ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
 
@@ -161,6 +166,11 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
         ctx->mtl_library = nil;
     }
 
+    if (ctx->mtl_queue) {
+        [ctx->mtl_queue release];
+        ctx->mtl_queue = nil;
+    }
+
     if (ctx->mtl_device) {
         [ctx->mtl_device release];
         ctx->mtl_device = nil;
@@ -1005,7 +1015,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
     GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
 
     ctx->device = device;
-    ctx->queue  = [device newCommandQueue];
+    ctx->queue  = ctx_dev->mtl_queue;
     if (ctx->queue == nil) {
         GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
         return NULL;
@@ -1704,7 +1714,6 @@ struct ggml_backend_metal_buffer {
 struct ggml_backend_metal_buffer_context {
     void * all_data;
     size_t all_size;
-    bool   owned;
 
     // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
     int n_buffers;
@@ -1712,6 +1721,9 @@ struct ggml_backend_metal_buffer_context {
 
     // optional MTLResidencySet
     id rset;
+
+    id<MTLDevice>       device;
+    id<MTLCommandQueue> queue;
 };
 
 // rset init
@@ -1777,7 +1789,7 @@ static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer
 // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
 // Metal buffer based on the host memory pointer
 //
-static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs) {
+static id<MTLBuffer> ggml_metal_get_buffer(const struct ggml_tensor * t, size_t * offs) {
     //GGML_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
 
     const int64_t tsize = ggml_nbytes(t);
@@ -5932,14 +5944,6 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
 
     ggml_backend_metal_buffer_rset_free(ctx);
 
-    if (ctx->owned) {
-#if TARGET_OS_OSX
-        vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ctx->all_data, ctx->all_size);
-#else
-        free(ctx->all_data);
-#endif
-    }
-
     free(ctx);
 }
 
@@ -5950,25 +5954,118 @@ static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
 }
 
 static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+#if 1
+    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
+
+    @autoreleasepool {
+        id<MTLCommandBuffer>      cmd_buf = [ctx->queue commandBuffer];
+        id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
+        [cmd_buf enqueue];
+
+        size_t buf_dst_offset = 0;
+        id<MTLBuffer> buf_dst = ggml_metal_get_buffer(tensor, &buf_dst_offset);
+
+        buf_dst_offset += offset;
+
+        // NSMakeRange takes (location, length) - the length is `size`, not an end offset
+        [encoder fillBuffer:buf_dst
+                      range:NSMakeRange(buf_dst_offset, size)
+                      value:value];
+
+        [encoder endEncoding];
+
+        [cmd_buf commit];
+        [cmd_buf waitUntilCompleted];
+    }
+#else
     memset((char *)tensor->data + offset, value, size);
+#endif
 
     GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+#if 1
+    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
+
+    @autoreleasepool {
+        id<MTLCommandBuffer>      cmd_buf = [ctx->queue commandBuffer];
+        id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
+        [cmd_buf enqueue];
+
+        // TODO: is this an extra copy? can we avoid it? (see the note after this hunk)
+        id<MTLBuffer> buf_src = [ctx->device newBufferWithBytes:data
+                                                         length:size
+                                                        options:MTLResourceStorageModeShared];
+
+        size_t buf_dst_offset = 0;
+        id<MTLBuffer> buf_dst = ggml_metal_get_buffer(tensor, &buf_dst_offset);
+
+        buf_dst_offset += offset;
+
+        [encoder copyFromBuffer:buf_src
+                   sourceOffset:0
+                       toBuffer:buf_dst
+              destinationOffset:buf_dst_offset
+                           size:size];
+
+        [encoder endEncoding];
+
+        [cmd_buf commit];
+        [cmd_buf waitUntilCompleted];
+
+        // this file is built without ARC - release the owned staging buffer
+        [buf_src release];
+    }
+#else
     memcpy((char *)tensor->data + offset, data, size);
+#endif
 
     GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+#if 1
+    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
+
+    @autoreleasepool {
+        id<MTLCommandBuffer>      cmd_buf = [ctx->queue commandBuffer];
+        id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
+        [cmd_buf enqueue];
+
+        size_t buf_src_offset = 0;
+        id<MTLBuffer> buf_src = ggml_metal_get_buffer(tensor, &buf_src_offset);
+
+        buf_src_offset += offset;
+
+        id<MTLBuffer> buf_dst = [ctx->device newBufferWithBytesNoCopy:data
+                                                               length:size
+                                                              options:MTLResourceStorageModeShared
+                                                          deallocator:nil];
+
+        [encoder copyFromBuffer:buf_src
+                   sourceOffset:buf_src_offset
+                       toBuffer:buf_dst
+              destinationOffset:0
+                           size:size];
+
+        [encoder endEncoding];
+
+        [cmd_buf commit];
+        [cmd_buf waitUntilCompleted];
+
+        // release the no-copy wrapper (the memory itself belongs to the caller)
+        [buf_dst release];
+    }
+#else
     memcpy(data, (const char *)tensor->data + offset, size);
+#endif
 
     GGML_UNUSED(buffer);
 }
 
 static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
     if (ggml_backend_buffer_is_host(src->buffer)) {
+        GGML_ASSERT(false && "TODO");
         memcpy(dst->data, src->data, ggml_nbytes(src));
         return true;
     }
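
Note on the two staging paths in the hunk above: `newBufferWithBytes:` (set_tensor) makes an extra host-to-staging copy, while `newBufferWithBytesNoCopy:` (get_tensor) avoids one but only accepts a page-aligned pointer and a length that is a multiple of the page size - with an arbitrary `data`/`size` it can fail and return nil. One way to resolve the TODO would be to branch on alignment; a minimal sketch, not part of the patch (the helper name is hypothetical):

    // wrap the host pointer when Metal's no-copy constraints are met,
    // otherwise fall back to a shared staging copy;
    // caller releases the returned buffer (the file is built without ARC)
    static id<MTLBuffer> ggml_metal_host_buffer(id<MTLDevice> device, const void * data, size_t size) {
        const size_t size_page = sysconf(_SC_PAGESIZE);

        if (((uintptr_t) data % size_page) == 0 && (size % size_page) == 0) {
            // zero-copy: requires page-aligned address and page-multiple length
            return [device newBufferWithBytesNoCopy:(void *) data
                                             length:size
                                            options:MTLResourceStorageModeShared
                                        deallocator:nil];
        }

        // extra copy: host memory -> shared staging buffer
        return [device newBufferWithBytes:data
                                   length:size
                                  options:MTLResourceStorageModeShared];
    }
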
@@ -5980,7 +6071,22 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c
 static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
-    memset(ctx->all_data, value, ctx->all_size);
+    @autoreleasepool {
+        id<MTLCommandBuffer>      cmd_buf = [ctx->queue commandBuffer];
+        id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
+        [cmd_buf enqueue];
+
+        [encoder fillBuffer:ctx->buffers[0].metal
+                      range:NSMakeRange(0, ctx->buffers[0].size)
+                      value:value];
+
+        [encoder endEncoding];
+
+        [cmd_buf commit];
+        [cmd_buf waitUntilCompleted];
+    }
+
+    //memset(ctx->all_data, value, ctx->all_size);
 }
 
 static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
@@ -6044,9 +6150,21 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
 
     id<MTLDevice> device = ctx_dev->mtl_device;
 
+#if 1
+    // TODO: tmp hack
+    static void * p_base = (void *) 0x000000400ULL;
+
+    ctx->all_data = p_base;
+
+    p_base = (void *) ((uintptr_t) p_base + size_aligned);
+#else
     ctx->all_data = ggml_metal_host_malloc(size_aligned);
+#endif
     ctx->all_size = size_aligned;
-    ctx->owned = true;
+
+    ctx->device = device;
+    ctx->queue  = ctx_dev->mtl_queue;
+
     ctx->n_buffers = 1;
 
     if (ctx->all_data != NULL) {
@@ -6055,10 +6173,14 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
         ctx->buffers[0].metal = nil;
 
         if (size_aligned > 0) {
+#if 1
+            ctx->buffers[0].metal = [device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate];
+#else
             ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
                                                               length:size_aligned
                                                              options:MTLResourceStorageModeShared
                                                          deallocator:nil];
+#endif
         }
     }
 
@@ -6092,13 +6214,7 @@ static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_ty
 }
 
 static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
-    // TODO: not sure why, but without setting this to `false`, op offloading does not work correctly
-    // to reproduce, do the following:
-    //
-    //   build with: cmake -DGGML_BLAS=OFF -DGGML_METAL=ON
-    //   run:        ./bin/llama-cli -m ggml-model-mxfp4.gguf -p "$(printf 'hello %.0s' {1..100})" --n-cpu-moe 10
-    //
-    return false;
+    return true;
 
     GGML_UNUSED(buft);
 }
@@ -6111,7 +6227,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
         /* .get_alignment   = */ ggml_backend_metal_buffer_type_get_alignment,
         /* .get_max_size    = */ ggml_backend_metal_buffer_type_get_max_size,
         /* .get_alloc_size  = */ NULL, // defaults to ggml_nbytes
-        /* .is_host         = */ ggml_backend_metal_buffer_type_is_host,
+        /* .is_host         = */ NULL,
     },
     /* .device  = */ &g_ggml_backend_metal_device,
     /* .context = */ NULL,
@@ -6130,7 +6246,7 @@ static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void)
     static struct ggml_backend_buffer_type ggml_backend_buffer_from_ptr_type_metal = {
         /* .iface = */ {
             /* .get_name      = */ ggml_backend_metal_buffer_from_ptr_type_get_name,
-            /* .alloc_buffer  = */ ggml_backend_metal_buffer_type_alloc_buffer,
+            /* .alloc_buffer  = */ NULL,
             /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
             /* .get_max_size  = */ ggml_backend_metal_buffer_type_get_max_size,
             /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
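
The `#if 1` block in the alloc_buffer hunk above replaces the host allocation with a synthetic base pointer: `ctx->all_data` no longer points at memory, it is a monotonically increasing token, so `tensor->data` inside such a buffer must never be dereferenced - it only serves as a key that `ggml_metal_get_buffer` maps back to a `MTLBuffer` plus offset. An illustration of the lookup this relies on (it mirrors the existing 1-to-1 mapping logic and also covers the from_ptr path, where `data` is a real host pointer; the function name is hypothetical):

    // resolve a tensor's (possibly synthetic) address to (MTLBuffer, offset)
    static id<MTLBuffer> example_lookup(struct ggml_backend_metal_buffer_context * ctx,
                                        const struct ggml_tensor * t, size_t * offs) {
        const uintptr_t addr = (uintptr_t) t->data; // a key, never dereferenced

        for (int i = 0; i < ctx->n_buffers; ++i) {
            const uintptr_t base = (uintptr_t) ctx->buffers[i].data;

            if (addr >= base && addr < base + ctx->buffers[i].size) {
                *offs = (size_t) (addr - base); // offset into the Metal buffer
                return ctx->buffers[i].metal;
            }
        }

        return nil; // tensor does not live in a Metal buffer
    }
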
@@ -6199,10 +6315,6 @@ static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend,       st
     struct ggml_backend_metal_context        * ctx     = backend->context;
     struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
 
-    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
-
-    struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *)buf->context;
-
     @autoreleasepool {
         id<MTLDevice> device = ctx_dev->mtl_device;
 
@@ -6211,46 +6323,39 @@ static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend,       st
                                                     length:size
                                                    options:MTLResourceStorageModeShared];
 
-        size_t tensor_offset = (uintptr_t)tensor->data + offset;
+        size_t buf_dst_offset = 0;
+        id<MTLBuffer> buf_dst = ggml_metal_get_buffer(tensor, &buf_dst_offset);
 
-        // find which Metal buffer contains this tensor - we will copy into that buffer
-        for (int i = 0; i < buf_ctx->n_buffers; i++) {
-            if (tensor_offset >= (uintptr_t) buf_ctx->buffers[i].data &&
-                tensor_offset <  (uintptr_t) buf_ctx->buffers[i].data + buf_ctx->buffers[i].size) {
-
-                const size_t buf_dst_offset = tensor_offset - (uintptr_t) buf_ctx->buffers[i].data;
-
-                id<MTLBuffer> buf_dst = buf_ctx->buffers[i].metal;
-
-                // queue the copy operation into the queue of the Metal context
-                // this will be queued at the end, after any currently ongoing GPU operations
-                id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
-                [cmd_buf enqueue];
-
-                id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
-
-                [encoder copyFromBuffer:buf_src
-                           sourceOffset:0
-                               toBuffer:buf_dst
-                      destinationOffset:buf_dst_offset
-                                   size:size];
-
-                [encoder endEncoding];
-                [cmd_buf commit];
-
-                // do not wait here for completion
-                //[cmd_buf waitUntilCompleted];
-
-                // instead, remember a reference to the command buffer and wait for it later if needed
-                [ctx->cmd_bufs_ext addObject:cmd_buf];
-                ctx->cmd_buf_ext_last = cmd_buf;
-
-                [cmd_buf retain];
-                return;
-            }
+        if (buf_dst == nil) {
+            GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name);
         }
 
-        GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name);
+        buf_dst_offset += offset;
+
+        // queue the copy operation into the queue of the Metal context
+        // this will be queued at the end, after any currently ongoing GPU operations
+        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
+        [cmd_buf enqueue];
+
+        id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
+
+        [encoder copyFromBuffer:buf_src
+                   sourceOffset:0
+                       toBuffer:buf_dst
+              destinationOffset:buf_dst_offset
+                           size:size];
+
+        [encoder endEncoding];
+        [cmd_buf commit];
+
+        // do not wait here for completion
+        //[cmd_buf waitUntilCompleted];
+
+        // instead, remember a reference to the command buffer and wait for it later if needed
+        [ctx->cmd_bufs_ext addObject:cmd_buf];
+        ctx->cmd_buf_ext_last = cmd_buf;
+
+        [cmd_buf retain];
     }
 }
 
@@ -6258,10 +6363,6 @@ static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const st
     struct ggml_backend_metal_context        * ctx     = backend->context;
     struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
 
-    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
-
-    struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *)buf->context;
-
     @autoreleasepool {
         id<MTLDevice> device = ctx_dev->mtl_device;
 
@@ -6270,41 +6371,39 @@ static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const st
                                                          options:MTLResourceStorageModeShared
                                                      deallocator:nil];
 
-        const size_t tensor_offset = (uintptr_t)tensor->data + offset;
+        size_t buf_src_offset = 0;
+        id<MTLBuffer> buf_src = ggml_metal_get_buffer(tensor, &buf_src_offset);
 
-        // find which buffer contains this tensor data
-        for (int i = 0; i < buf_ctx->n_buffers; i++) {
-            if (tensor_offset >= (uintptr_t) buf_ctx->buffers[i].data &&
-                tensor_offset <  (uintptr_t) buf_ctx->buffers[i].data + buf_ctx->buffers[i].size) {
-
-                const size_t buf_src_offset = tensor_offset - (uintptr_t) buf_ctx->buffers[i].data;
-
-                id<MTLBuffer> buf_src = buf_ctx->buffers[i].metal;
-
-                id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
-                [cmd_buf enqueue];
-
-                id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
-
-                [encoder copyFromBuffer:buf_src
-                           sourceOffset:buf_src_offset
-                               toBuffer:buf_dst
-                      destinationOffset:0
-                                   size:size];
-
-                [encoder endEncoding];
-                [cmd_buf commit];
-                //[cmd_buf waitUntilCompleted];
-
-                [ctx->cmd_bufs_ext addObject:cmd_buf];
-                ctx->cmd_buf_ext_last = cmd_buf;
-
-                [cmd_buf retain];
-                return;
-            }
+        if (buf_src == nil) {
+            GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name);
         }
 
-        GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name);
+        buf_src_offset += offset;
+
+        // queue the copy operation into the queue of the Metal context
+        // this will be queued at the end, after any currently ongoing GPU operations
+        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
+        [cmd_buf enqueue];
+
+        id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
+
+        [encoder copyFromBuffer:buf_src
+                   sourceOffset:buf_src_offset
+                       toBuffer:buf_dst
+              destinationOffset:0
+                           size:size];
+
+        [encoder endEncoding];
+        [cmd_buf commit];
+
+        // do not wait here for completion
+        //[cmd_buf waitUntilCompleted];
+
+        // instead, remember a reference to the command buffer and wait for it later if needed
+        [ctx->cmd_bufs_ext addObject:cmd_buf];
+        ctx->cmd_buf_ext_last = cmd_buf;
+
+        [cmd_buf retain];
     }
 }
 
@@ -6513,8 +6612,8 @@ static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct g
     props->type = ggml_backend_metal_device_get_type(dev);
     ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
     props->caps = (struct ggml_backend_dev_caps) {
-        /* .async                 = */ false,
-        /* .host_buffer           = */ false,
+        /* .async                 = */ true,
+        /* .host_buffer           = */ true,
         /* .buffer_from_host_ptr  = */ true,
         /* .events                = */ false,
     };
@@ -6554,7 +6653,7 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
 
     ctx->all_data = ptr;
     ctx->all_size = size;
-    ctx->owned = false;
+
     ctx->n_buffers = 0;
 
     const size_t size_page = sysconf(_SC_PAGESIZE);
@@ -6577,6 +6676,9 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
 
     id<MTLDevice> device = ctx_dev->mtl_device;
 
+    ctx->device = device;
+    ctx->queue  = ctx_dev->mtl_queue;
+
     // the buffer fits into the max buffer size allowed by the device
     if (size_aligned <= device.maxBufferLength) {
         ctx->buffers[ctx->n_buffers].data  = ptr;
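
With `.async` now advertised as true, the transfer paths above become reachable through the generic ggml-backend API. A minimal usage sketch under the patch's semantics (the calls are existing public ggml-backend/ggml-metal entry points; the surrounding setup is illustrative): the async setter returns as soon as the blit is committed, and completion is only guaranteed after synchronizing the backend, which is expected to wait on the retained command buffers tracked in `cmd_bufs_ext`/`cmd_buf_ext_last`.

    #include "ggml.h"
    #include "ggml-backend.h"
    #include "ggml-metal.h"

    void example_upload(struct ggml_tensor * tensor, const void * host_data) {
        // assumes `tensor` was allocated in a Metal buffer of this backend
        ggml_backend_t backend = ggml_backend_metal_init();

        // committed on the device context's shared command queue; does not wait
        ggml_backend_tensor_set_async(backend, tensor, host_data, 0, ggml_nbytes(tensor));

        // ... queue more work here ...

        // blocks until all command buffers queued above have completed
        ggml_backend_synchronize(backend);

        ggml_backend_free(backend);
    }
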