metal : create only metal buffers, no wrapping of host memory
ggml-ci
@@ -44,9 +44,10 @@ static struct ggml_backend_device g_ggml_backend_metal_device;
 // note: assumes single GPU device - the default one
 // TODO: support multiple GPU devices
 static struct ggml_backend_metal_device_context {
     id<MTLDevice> mtl_device;
     int           mtl_device_ref_count;
+    id<MTLCommandQueue> mtl_queue;
     id<MTLLibrary> mtl_library;
 
     NSLock * mtl_lock;
 
@@ -68,6 +69,7 @@ static struct ggml_backend_metal_device_context {
 } g_ggml_ctx_dev_main = {
     /*.mtl_device =*/ nil,
     /*.mtl_device_ref_count =*/ 0,
+    /*.mtl_queue =*/ nil,
     /*.mtl_library =*/ nil,
     /*.mtl_lock =*/ nil,
     /*.has_simdgroup_reduction =*/ false,
@@ -94,6 +96,9 @@ static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_dev
         ctx->mtl_device = MTLCreateSystemDefaultDevice();
 
         if (ctx->mtl_device) {
+            ctx->mtl_queue = [ctx->mtl_device newCommandQueue];
+            [ctx->mtl_queue retain];
+
             ctx->has_simdgroup_reduction  = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
             ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
 
@@ -161,6 +166,11 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
             ctx->mtl_library = nil;
         }
 
+        if (ctx->mtl_queue) {
+            [ctx->mtl_queue release];
+            ctx->mtl_queue = nil;
+        }
+
         if (ctx->mtl_device) {
             [ctx->mtl_device release];
             ctx->mtl_device = nil;
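The two hunks above move command-queue ownership into the shared device context: the queue is created once in ggml_backend_metal_device_acq and torn down in ggml_backend_metal_device_rel. A minimal sketch of this acquire/release pattern, with illustrative names and manual retain/release (the file uses explicit retain/release rather than ARC):

    #import <Metal/Metal.h>

    // sketch: a ref-counted device context that owns one shared command queue
    typedef struct {
        id<MTLDevice>       device;
        id<MTLCommandQueue> queue;
        int                 ref_count;
    } dev_ctx_sketch;

    static id<MTLDevice> dev_ctx_acq(dev_ctx_sketch * ctx) {
        if (ctx->ref_count == 0) {
            ctx->device = MTLCreateSystemDefaultDevice(); // +1 reference (Create rule)
            if (ctx->device) {
                ctx->queue = [ctx->device newCommandQueue]; // +1 reference ("new" prefix)
            }
        }
        ctx->ref_count++;
        return ctx->device;
    }

    static void dev_ctx_rel(dev_ctx_sketch * ctx) {
        if (--ctx->ref_count > 0) {
            return;
        }
        [ctx->queue release];  ctx->queue  = nil;
        [ctx->device release]; ctx->device = nil;
    }

Command buffers committed to a single queue start executing in the order they were enqueued, which is presumably why the blit uploads/downloads added below share one queue with the compute work.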
@@ -1005,7 +1015,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
     GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
 
     ctx->device = device;
-    ctx->queue = [device newCommandQueue];
+    ctx->queue = ctx_dev->mtl_queue;
     if (ctx->queue == nil) {
         GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
         return NULL;
@@ -1704,7 +1714,6 @@ struct ggml_backend_metal_buffer {
 struct ggml_backend_metal_buffer_context {
     void * all_data;
     size_t all_size;
-    bool owned;
 
     // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
     int n_buffers;
@@ -1712,6 +1721,9 @@ struct ggml_backend_metal_buffer_context {
 
     // optional MTLResidencySet
     id rset;
+
+    id device;
+    id queue;
 };
 
 // rset init
@@ -1777,7 +1789,7 @@ static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer
 // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
 // Metal buffer based on the host memory pointer
 //
-static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs) {
+static id<MTLBuffer> ggml_metal_get_buffer(const struct ggml_tensor * t, size_t * offs) {
     //GGML_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
 
     const int64_t tsize = ggml_nbytes(t);
@@ -5932,14 +5944,6 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
 
     ggml_backend_metal_buffer_rset_free(ctx);
 
-    if (ctx->owned) {
-#if TARGET_OS_OSX
-        vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ctx->all_data, ctx->all_size);
-#else
-        free(ctx->all_data);
-#endif
-    }
-
     free(ctx);
 }
 
@@ -5950,25 +5954,112 @@ static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
 }
 
 static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+#if 1
+    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
+
+    @autoreleasepool {
+        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
+        id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
+        [cmd_buf enqueue];
+
+        size_t buf_dst_offset = 0;
+        id<MTLBuffer> buf_dst = ggml_metal_get_buffer(tensor, &buf_dst_offset);
+
+        buf_dst_offset += offset;
+
+        [encoder fillBuffer:buf_dst
+                      range:NSMakeRange(buf_dst_offset, size)
+                      value:value];
+
+        [encoder endEncoding];
+
+        [cmd_buf commit];
+        [cmd_buf waitUntilCompleted];
+    }
+#else
     memset((char *)tensor->data + offset, value, size);
+#endif
 
     GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+#if 1
+    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
+
+    @autoreleasepool {
+        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
+        id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
+        [cmd_buf enqueue];
+
+        // TODO: is this an extra copy? can we avoid it?
+        id<MTLBuffer> buf_src = [ctx->device newBufferWithBytes:data
+                                                         length:size
+                                                        options:MTLResourceStorageModeShared];
+
+        size_t buf_dst_offset = 0;
+        id<MTLBuffer> buf_dst = ggml_metal_get_buffer(tensor, &buf_dst_offset);
+
+        buf_dst_offset += offset;
+
+        [encoder copyFromBuffer:buf_src
+                   sourceOffset:0
+                       toBuffer:buf_dst
+              destinationOffset:buf_dst_offset
+                           size:size];
+
+        [encoder endEncoding];
+
+        [cmd_buf commit];
+        [cmd_buf waitUntilCompleted];
+    }
+#else
     memcpy((char *)tensor->data + offset, data, size);
+#endif
 
     GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+#if 1
+    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
+
+    @autoreleasepool {
+        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
+        id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
+        [cmd_buf enqueue];
+
+        size_t buf_src_offset = 0;
+        id<MTLBuffer> buf_src = ggml_metal_get_buffer(tensor, &buf_src_offset);
+
+        buf_src_offset += offset;
+
+        id<MTLBuffer> buf_dst = [ctx->device newBufferWithBytesNoCopy:data
+                                                               length:size
+                                                              options:MTLResourceStorageModeShared
+                                                          deallocator:nil];
+
+        [encoder copyFromBuffer:buf_src
+                   sourceOffset:buf_src_offset
+                       toBuffer:buf_dst
+              destinationOffset:0
+                           size:size];
+
+        [encoder endEncoding];
+
+        [cmd_buf commit];
+        [cmd_buf waitUntilCompleted];
+    }
+#else
     memcpy(data, (const char *)tensor->data + offset, size);
+#endif
 
     GGML_UNUSED(buffer);
 }
 
 static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
     if (ggml_backend_buffer_is_host(src->buffer)) {
+        GGML_ASSERT(false && "TODO");
         memcpy(dst->data, src->data, ggml_nbytes(src));
         return true;
     }
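Since the tensor storage now lives in MTLResourceStorageModePrivate buffers (see the alloc_buffer hunk further down), the host can no longer memset/memcpy it directly; each hook above records a blit command instead. A self-contained sketch of the synchronous upload path used by set_tensor, with hypothetical helper names:

    #import <Metal/Metal.h>

    // sketch: copy host bytes into a GPU-private MTLBuffer through a shared staging buffer
    static void upload_to_private_sketch(id<MTLDevice> device, id<MTLCommandQueue> queue,
                                         id<MTLBuffer> buf_dst, size_t dst_offset,
                                         const void * data, size_t size) {
        @autoreleasepool {
            // staging copy in CPU-visible memory - this is the "extra copy" the TODO above asks about
            id<MTLBuffer> buf_src = [device newBufferWithBytes:data
                                                        length:size
                                                       options:MTLResourceStorageModeShared];

            id<MTLCommandBuffer>      cmd_buf = [queue commandBuffer];
            id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

            [encoder copyFromBuffer:buf_src
                       sourceOffset:0
                           toBuffer:buf_dst
                  destinationOffset:dst_offset
                               size:size];
            [encoder endEncoding];

            [cmd_buf commit];
            [cmd_buf waitUntilCompleted]; // the synchronous hooks block; the async paths below defer this

            [buf_src release]; // newBufferWithBytes follows the "new" ownership rule
        }
    }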
@@ -5980,7 +6071,22 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c
 static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
-    memset(ctx->all_data, value, ctx->all_size);
+    @autoreleasepool {
+        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
+        id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
+        [cmd_buf enqueue];
+
+        [encoder fillBuffer:ctx->buffers[0].metal
+                      range:NSMakeRange(0, ctx->buffers[0].size)
+                      value:value];
+
+        [encoder endEncoding];
+
+        [cmd_buf commit];
+        [cmd_buf waitUntilCompleted];
+    }
+
+    //memset(ctx->all_data, value, ctx->all_size);
 }
 
 static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
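The download direction in get_tensor above wraps the caller's destination pointer with newBufferWithBytesNoCopy and blits into it. A companion sketch of that path; note that newBufferWithBytesNoCopy requires a page-aligned pointer and length, so this variant assumes a suitably aligned destination (a general implementation would fall back to a shared staging buffer):

    #import <Metal/Metal.h>

    // sketch: read bytes back from a GPU-private MTLBuffer into host memory
    // assumes `data` and `size` satisfy the page-alignment requirements of newBufferWithBytesNoCopy
    static void download_from_private_sketch(id<MTLDevice> device, id<MTLCommandQueue> queue,
                                             id<MTLBuffer> buf_src, size_t src_offset,
                                             void * data, size_t size) {
        @autoreleasepool {
            id<MTLBuffer> buf_dst = [device newBufferWithBytesNoCopy:data
                                                              length:size
                                                             options:MTLResourceStorageModeShared
                                                         deallocator:nil];

            id<MTLCommandBuffer>      cmd_buf = [queue commandBuffer];
            id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

            [encoder copyFromBuffer:buf_src
                       sourceOffset:src_offset
                           toBuffer:buf_dst
                  destinationOffset:0
                               size:size];
            [encoder endEncoding];

            [cmd_buf commit];
            [cmd_buf waitUntilCompleted]; // must finish before the caller reads `data`

            [buf_dst release];
        }
    }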
@@ -6044,9 +6150,21 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
 
     id<MTLDevice> device = ctx_dev->mtl_device;
 
+#if 1
+    // TODO: tmp hack
+    static void * p_base = (void *) 0x000000400ULL;
+
+    ctx->all_data = p_base;
+
+    p_base = (void *) ((uintptr_t) p_base + size_aligned);
+#else
     ctx->all_data = ggml_metal_host_malloc(size_aligned);
+#endif
     ctx->all_size = size_aligned;
-    ctx->owned = true;
+
+    ctx->device = device;
+    ctx->queue  = ctx_dev->mtl_queue;
+
     ctx->n_buffers = 1;
 
     if (ctx->all_data != NULL) {
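The `// TODO: tmp hack` above replaces a real host allocation with a synthetic address: each buffer claims a unique, non-overlapping range of a fake address space, so ggml can keep doing pointer arithmetic on tensor->data while the actual bytes live only in the private MTLBuffer. A sketch of the bookkeeping this enables (illustrative, not the actual allocator):

    #include <stdint.h>
    #include <stddef.h>

    // sketch: hand out fake, never-dereferenced base addresses for device-only buffers
    static void * g_p_base = (void *) 0x000000400ULL;

    static void * claim_fake_range(size_t size_aligned) {
        void * base = g_p_base;
        g_p_base = (void *) ((uintptr_t) g_p_base + size_aligned);
        return base;
    }

    // later, a tensor address inside [base, base + size) is mapped back to
    // (metal buffer, offset into it), which is what ggml_metal_get_buffer relies on
    static size_t fake_addr_to_offset(const void * tensor_data, const void * base) {
        return (uintptr_t) tensor_data - (uintptr_t) base;
    }

A static, monotonically increasing base works here only because ranges are never handed back; the commit itself marks this as a temporary hack.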
@@ -6055,10 +6173,14 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
         ctx->buffers[0].metal = nil;
 
         if (size_aligned > 0) {
+#if 1
+            ctx->buffers[0].metal = [device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate];
+#else
             ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
                                                               length:size_aligned
                                                              options:MTLResourceStorageModeShared
                                                          deallocator:nil];
+#endif
         }
     }
 
@@ -6092,13 +6214,7 @@ static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_ty
 }
 
 static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
-    // TODO: not sure why, but without setting this to `false`, op offloading does not work correctly
-    // to reproduce, do the following:
-    //
-    //   build with: cmake -DGGML_BLAS=OFF -DGGML_METAL=ON
-    //   run:        ./bin/llama-cli -m ggml-model-mxfp4.gguf -p "$(printf 'hello %.0s' {1..100})" --n-cpu-moe 10
-    //
-    return false;
+    return true;
 
     GGML_UNUSED(buft);
 }
@@ -6111,7 +6227,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
         /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
         /* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size,
         /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
-        /* .is_host = */ ggml_backend_metal_buffer_type_is_host,
+        /* .is_host = */ NULL,
     },
     /* .device = */ &g_ggml_backend_metal_device,
     /* .context = */ NULL,
@@ -6130,7 +6246,7 @@ static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void)
 static struct ggml_backend_buffer_type ggml_backend_buffer_from_ptr_type_metal = {
     /* .iface = */ {
         /* .get_name = */ ggml_backend_metal_buffer_from_ptr_type_get_name,
-        /* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer,
+        /* .alloc_buffer = */ NULL,
         /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
         /* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size,
         /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
@@ -6199,10 +6315,6 @@ static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, st
     struct ggml_backend_metal_context * ctx = backend->context;
     struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
 
-    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
-
-    struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *)buf->context;
-
     @autoreleasepool {
         id<MTLDevice> device = ctx_dev->mtl_device;
 
@@ -6211,46 +6323,39 @@ static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, st
                                                        length:size
                                                       options:MTLResourceStorageModeShared];
 
-        size_t tensor_offset = (uintptr_t)tensor->data + offset;
+        size_t buf_dst_offset = 0;
+        id<MTLBuffer> buf_dst = ggml_metal_get_buffer(tensor, &buf_dst_offset);
 
-        // find which Metal buffer contains this tensor - we will copy into that buffer
-        for (int i = 0; i < buf_ctx->n_buffers; i++) {
-            if (tensor_offset >= (uintptr_t) buf_ctx->buffers[i].data &&
-                tensor_offset <  (uintptr_t) buf_ctx->buffers[i].data + buf_ctx->buffers[i].size) {
-
-                const size_t buf_dst_offset = tensor_offset - (uintptr_t) buf_ctx->buffers[i].data;
-
-                id<MTLBuffer> buf_dst = buf_ctx->buffers[i].metal;
-
-                // queue the copy operation into the queue of the Metal context
-                // this will be queued at the end, after any currently ongoing GPU operations
-                id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
-                [cmd_buf enqueue];
-
-                id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
-
-                [encoder copyFromBuffer:buf_src
-                           sourceOffset:0
-                               toBuffer:buf_dst
-                      destinationOffset:buf_dst_offset
-                                   size:size];
-
-                [encoder endEncoding];
-                [cmd_buf commit];
-
-                // do not wait here for completion
-                //[cmd_buf waitUntilCompleted];
-
-                // instead, remember a reference to the command buffer and wait for it later if needed
-                [ctx->cmd_bufs_ext addObject:cmd_buf];
-                ctx->cmd_buf_ext_last = cmd_buf;
-
-                [cmd_buf retain];
-                return;
-            }
+        if (buf_dst == nil) {
+            GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name);
         }
 
-        GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name);
+        buf_dst_offset += offset;
+
+        // queue the copy operation into the queue of the Metal context
+        // this will be queued at the end, after any currently ongoing GPU operations
+        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
+        [cmd_buf enqueue];
+
+        id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
+
+        [encoder copyFromBuffer:buf_src
+                   sourceOffset:0
+                       toBuffer:buf_dst
+              destinationOffset:buf_dst_offset
+                           size:size];
+
+        [encoder endEncoding];
+        [cmd_buf commit];
+
+        // do not wait here for completion
+        //[cmd_buf waitUntilCompleted];
+
+        // instead, remember a reference to the command buffer and wait for it later if needed
+        [ctx->cmd_bufs_ext addObject:cmd_buf];
+        ctx->cmd_buf_ext_last = cmd_buf;
+
+        [cmd_buf retain];
     }
 }
 
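Unlike the synchronous buffer hooks, set_tensor_async above commits the blit without waiting, so the command buffer must stay alive until the backend synchronizes. A reduced sketch of that bookkeeping, with hypothetical names:

    #import <Metal/Metal.h>

    // sketch: remember in-flight command buffers so a later synchronize can wait on them
    typedef struct {
        NSMutableArray     * cmd_bufs_ext;     // holds (and retains) committed command buffers
        id<MTLCommandBuffer> cmd_buf_ext_last; // most recently committed one
    } metal_ctx_sketch;

    static void commit_async_sketch(metal_ctx_sketch * ctx, id<MTLCommandBuffer> cmd_buf) {
        [cmd_buf commit];                      // no waitUntilCompleted here
        [ctx->cmd_bufs_ext addObject:cmd_buf]; // NSMutableArray retains the buffer
        ctx->cmd_buf_ext_last = cmd_buf;
    }

    static void synchronize_sketch(metal_ctx_sketch * ctx) {
        for (id<MTLCommandBuffer> cmd_buf in ctx->cmd_bufs_ext) {
            [cmd_buf waitUntilCompleted];      // waiting on an already-completed buffer is a no-op
        }
        [ctx->cmd_bufs_ext removeAllObjects];  // drops the retained references
        ctx->cmd_buf_ext_last = nil;
    }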
@@ -6258,10 +6363,6 @@ static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const st
     struct ggml_backend_metal_context * ctx = backend->context;
     struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
 
-    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
-
-    struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *)buf->context;
-
     @autoreleasepool {
         id<MTLDevice> device = ctx_dev->mtl_device;
 
@@ -6270,41 +6371,39 @@ static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const st
                                                       options:MTLResourceStorageModeShared
                                                   deallocator:nil];
 
-        const size_t tensor_offset = (uintptr_t)tensor->data + offset;
+        size_t buf_src_offset = 0;
+        id<MTLBuffer> buf_src = ggml_metal_get_buffer(tensor, &buf_src_offset);
 
-        // find which buffer contains this tensor data
-        for (int i = 0; i < buf_ctx->n_buffers; i++) {
-            if (tensor_offset >= (uintptr_t) buf_ctx->buffers[i].data &&
-                tensor_offset <  (uintptr_t) buf_ctx->buffers[i].data + buf_ctx->buffers[i].size) {
-
-                const size_t buf_src_offset = tensor_offset - (uintptr_t) buf_ctx->buffers[i].data;
-
-                id<MTLBuffer> buf_src = buf_ctx->buffers[i].metal;
-
-                id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
-                [cmd_buf enqueue];
-
-                id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
-
-                [encoder copyFromBuffer:buf_src
-                           sourceOffset:buf_src_offset
-                               toBuffer:buf_dst
-                      destinationOffset:0
-                                   size:size];
-
-                [encoder endEncoding];
-                [cmd_buf commit];
-                //[cmd_buf waitUntilCompleted];
-
-                [ctx->cmd_bufs_ext addObject:cmd_buf];
-                ctx->cmd_buf_ext_last = cmd_buf;
-
-                [cmd_buf retain];
-                return;
-            }
+        if (buf_src == nil) {
+            GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name);
         }
 
-        GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name);
+        buf_src_offset += offset;
+
+        // queue the copy operation into the queue of the Metal context
+        // this will be queued at the end, after any currently ongoing GPU operations
+        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
+        [cmd_buf enqueue];
+
+        id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
+
+        [encoder copyFromBuffer:buf_src
+                   sourceOffset:buf_src_offset
+                       toBuffer:buf_dst
+              destinationOffset:0
+                           size:size];
+
+        [encoder endEncoding];
+        [cmd_buf commit];
+
+        // do not wait here for completion
+        //[cmd_buf waitUntilCompleted];
+
+        // instead, remember a reference to the command buffer and wait for it later if needed
+        [ctx->cmd_bufs_ext addObject:cmd_buf];
+        ctx->cmd_buf_ext_last = cmd_buf;
+
+        [cmd_buf retain];
    }
 }
 
@@ -6513,8 +6612,8 @@ static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct g
     props->type = ggml_backend_metal_device_get_type(dev);
     ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
     props->caps = (struct ggml_backend_dev_caps) {
-        /* .async = */ false,
-        /* .host_buffer = */ false,
+        /* .async = */ true,
+        /* .host_buffer = */ true,
         /* .buffer_from_host_ptr = */ true,
         /* .events = */ false,
     };
@@ -6554,7 +6653,6 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
 
     ctx->all_data = ptr;
     ctx->all_size = size;
-    ctx->owned = false;
     ctx->n_buffers = 0;
 
     const size_t size_page = sysconf(_SC_PAGESIZE);
@@ -6577,6 +6676,9 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
 
     id<MTLDevice> device = ctx_dev->mtl_device;
 
+    ctx->device = device;
+    ctx->queue  = ctx_dev->mtl_queue;
+
     // the buffer fits into the max buffer size allowed by the device
     if (size_aligned <= device.maxBufferLength) {
         ctx->buffers[ctx->n_buffers].data = ptr;