metal : try to utilize more of the shared memory using smaller views

2025-10-29 08:41:22 +00:00 · 2023-06-26 22:23:04 +03:00
parent c824d2e368
commit 5cc672a9a5
2 changed files with 6 additions and 4 deletions
--- a/ggml-metal.h
+++ b/ggml-metal.h
@@ -23,7 +23,7 @@
 #include <stdbool.h>
 // max memory buffers that can be mapped to the device
-#define GGML_METAL_MAX_BUFFERS 16
+#define GGML_METAL_MAX_BUFFERS 256
 struct ggml_tensor;
 struct ggml_cgraph;
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -262,8 +262,10 @@ bool ggml_metal_add_buffer(
            size_aligned += (size_page - (size_aligned % size_page));
        }
+        const size_t max_buffer_length = ctx->device.maxBufferLength/4;
        // the buffer fits into the max buffer size allowed by the device
-        if (size_aligned <= ctx->device.maxBufferLength) {
+        if (size_aligned <= max_buffer_length) {
            ctx->buffers[ctx->n_buffers].name = name;
            ctx->buffers[ctx->n_buffers].data = data;
            ctx->buffers[ctx->n_buffers].size = size;
@@ -282,8 +284,8 @@ bool ggml_metal_add_buffer(
            // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
            // one of the views
            const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
-            const size_t size_step = ctx->device.maxBufferLength - size_ovlp;
+            const size_t size_step = max_buffer_length - size_ovlp;
-            const size_t size_view = ctx->device.maxBufferLength;
+            const size_t size_view = max_buffer_length;
            for (size_t i = 0; i < size; i += size_step) {
                const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);