metal : create only metal buffers, no wrapping of host memory
ggml-ci
@@ -44,9 +44,10 @@ static struct ggml_backend_device g_ggml_backend_metal_device;
 // note: assumes single GPU device - the default one
 // TODO: support multiple GPU devices
 static struct ggml_backend_metal_device_context {
     id<MTLDevice> mtl_device;
     int           mtl_device_ref_count;
+    id<MTLCommandQueue> mtl_queue;
     id<MTLLibrary> mtl_library;
 
     NSLock * mtl_lock;
 
@@ -68,6 +69,7 @@ static struct ggml_backend_metal_device_context {
 } g_ggml_ctx_dev_main = {
     /*.mtl_device =*/ nil,
     /*.mtl_device_ref_count =*/ 0,
+    /*.mtl_queue =*/ nil,
     /*.mtl_library =*/ nil,
     /*.mtl_lock =*/ nil,
     /*.has_simdgroup_reduction =*/ false,
@@ -94,6 +96,9 @@ static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_dev
         ctx->mtl_device = MTLCreateSystemDefaultDevice();
 
         if (ctx->mtl_device) {
+            ctx->mtl_queue = [ctx->mtl_device newCommandQueue];
+            [ctx->mtl_queue retain];
+
             ctx->has_simdgroup_reduction  = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
             ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
 
@@ -161,6 +166,11 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
             ctx->mtl_library = nil;
         }
 
+        if (ctx->mtl_queue) {
+            [ctx->mtl_queue release];
+            ctx->mtl_queue = nil;
+        }
+
         if (ctx->mtl_device) {
             [ctx->mtl_device release];
             ctx->mtl_device = nil;
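The two hunks above move command-queue ownership into the shared device context: the queue is created once in ggml_backend_metal_device_acq and torn down in ggml_backend_metal_device_rel. A minimal sketch of this acquire/release pattern, with illustrative names and manual retain/release (the file uses explicit retain/release rather than ARC):

    #import <Metal/Metal.h>

    // sketch: a ref-counted device context that owns one shared command queue
    typedef struct {
        id<MTLDevice>       device;
        id<MTLCommandQueue> queue;
        int                 ref_count;
    } dev_ctx_sketch;

    static id<MTLDevice> dev_ctx_acq(dev_ctx_sketch * ctx) {
        if (ctx->ref_count == 0) {
            ctx->device = MTLCreateSystemDefaultDevice(); // +1 reference (Create rule)
            if (ctx->device) {
                ctx->queue = [ctx->device newCommandQueue]; // +1 reference ("new" prefix)
            }
        }
        ctx->ref_count++;
        return ctx->device;
    }

    static void dev_ctx_rel(dev_ctx_sketch * ctx) {
        if (--ctx->ref_count > 0) {
            return;
        }
        [ctx->queue release];  ctx->queue  = nil;
        [ctx->device release]; ctx->device = nil;
    }

Command buffers committed to a single queue start executing in the order they were enqueued, which is presumably why the blit uploads/downloads added below share one queue with the compute work.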
@@ -1005,7 +1015,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
     GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
 
     ctx->device = device;
-    ctx->queue = [device newCommandQueue];
+    ctx->queue = ctx_dev->mtl_queue;
     if (ctx->queue == nil) {
         GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
         return NULL;
@@ -1704,7 +1714,6 @@ struct ggml_backend_metal_buffer {
 struct ggml_backend_metal_buffer_context {
     void * all_data;
     size_t all_size;
-    bool owned;
 
     // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
     int n_buffers;
@@ -1712,6 +1721,9 @@ struct ggml_backend_metal_buffer_context {
 
     // optional MTLResidencySet
     id rset;
+
+    id device;
+    id queue;
 };
 
 // rset init
@@ -1777,7 +1789,7 @@ static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer
 // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
 // Metal buffer based on the host memory pointer
 //
-static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs) {
+static id<MTLBuffer> ggml_metal_get_buffer(const struct ggml_tensor * t, size_t * offs) {
     //GGML_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
 
     const int64_t tsize = ggml_nbytes(t);
@@ -5932,14 +5944,6 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
 
     ggml_backend_metal_buffer_rset_free(ctx);
 
-    if (ctx->owned) {
-#if TARGET_OS_OSX
-        vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ctx->all_data, ctx->all_size);
-#else
-        free(ctx->all_data);
-#endif
-    }
-
     free(ctx);
 }
 
@@ -5950,25 +5954,112 @@ static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
 }
 
 static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
+#if 1
+    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
+
+    @autoreleasepool {
+        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
+        id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
+        [cmd_buf enqueue];
+
+        size_t buf_dst_offset = 0;
+        id<MTLBuffer> buf_dst = ggml_metal_get_buffer(tensor, &buf_dst_offset);
+
+        buf_dst_offset += offset;
+
+        [encoder fillBuffer:buf_dst
+                      range:NSMakeRange(buf_dst_offset, size)
+                      value:value];
+
+        [encoder endEncoding];
+
+        [cmd_buf commit];
+        [cmd_buf waitUntilCompleted];
+    }
+#else
     memset((char *)tensor->data + offset, value, size);
+#endif
 
     GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+#if 1
+    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
+
+    @autoreleasepool {
+        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
+        id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
+        [cmd_buf enqueue];
+
+        // TODO: is this an extra copy? can we avoid it?
+        id<MTLBuffer> buf_src = [ctx->device newBufferWithBytes:data
+                                                         length:size
+                                                        options:MTLResourceStorageModeShared];
+
+        size_t buf_dst_offset = 0;
+        id<MTLBuffer> buf_dst = ggml_metal_get_buffer(tensor, &buf_dst_offset);
+
+        buf_dst_offset += offset;
+
+        [encoder copyFromBuffer:buf_src
+                   sourceOffset:0
+                       toBuffer:buf_dst
+              destinationOffset:buf_dst_offset
+                           size:size];
+
+        [encoder endEncoding];
+
+        [cmd_buf commit];
+        [cmd_buf waitUntilCompleted];
+    }
+#else
     memcpy((char *)tensor->data + offset, data, size);
+#endif
 
     GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+#if 1
+    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
+
+    @autoreleasepool {
+        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
+        id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
+        [cmd_buf enqueue];
+
+        size_t buf_src_offset = 0;
+        id<MTLBuffer> buf_src = ggml_metal_get_buffer(tensor, &buf_src_offset);
+
+        buf_src_offset += offset;
+
+        id<MTLBuffer> buf_dst = [ctx->device newBufferWithBytesNoCopy:data
+                                                               length:size
+                                                              options:MTLResourceStorageModeShared
+                                                          deallocator:nil];
+
+        [encoder copyFromBuffer:buf_src
+                   sourceOffset:buf_src_offset
+                       toBuffer:buf_dst
+              destinationOffset:0
+                           size:size];
+
+        [encoder endEncoding];
+
+        [cmd_buf commit];
+        [cmd_buf waitUntilCompleted];
+    }
+#else
     memcpy(data, (const char *)tensor->data + offset, size);
+#endif
 
     GGML_UNUSED(buffer);
 }
 
 static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
     if (ggml_backend_buffer_is_host(src->buffer)) {
+        GGML_ASSERT(false && "TODO");
         memcpy(dst->data, src->data, ggml_nbytes(src));
         return true;
     }
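Since the tensor storage now lives in MTLResourceStorageModePrivate buffers (see the alloc_buffer hunk further down), the host can no longer memset/memcpy it directly; each hook above records a blit command instead. A self-contained sketch of the synchronous upload path used by set_tensor, with hypothetical helper names:

    #import <Metal/Metal.h>

    // sketch: copy host bytes into a GPU-private MTLBuffer through a shared staging buffer
    static void upload_to_private_sketch(id<MTLDevice> device, id<MTLCommandQueue> queue,
                                         id<MTLBuffer> buf_dst, size_t dst_offset,
                                         const void * data, size_t size) {
        @autoreleasepool {
            // staging copy in CPU-visible memory - this is the "extra copy" the TODO above asks about
            id<MTLBuffer> buf_src = [device newBufferWithBytes:data
                                                        length:size
                                                       options:MTLResourceStorageModeShared];

            id<MTLCommandBuffer>      cmd_buf = [queue commandBuffer];
            id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

            [encoder copyFromBuffer:buf_src
                       sourceOffset:0
                           toBuffer:buf_dst
                  destinationOffset:dst_offset
                               size:size];
            [encoder endEncoding];

            [cmd_buf commit];
            [cmd_buf waitUntilCompleted]; // the synchronous hooks block; the async paths below defer this

            [buf_src release]; // newBufferWithBytes follows the "new" ownership rule
        }
    }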
@@ -5980,7 +6071,22 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c
 static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
-    memset(ctx->all_data, value, ctx->all_size);
+    @autoreleasepool {
+        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
+        id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
+        [cmd_buf enqueue];
+
+        [encoder fillBuffer:ctx->buffers[0].metal
+                      range:NSMakeRange(0, ctx->buffers[0].size)
+                      value:value];
+
+        [encoder endEncoding];
+
+        [cmd_buf commit];
+        [cmd_buf waitUntilCompleted];
+    }
+
+    //memset(ctx->all_data, value, ctx->all_size);
 }
 
 static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
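The download direction in get_tensor above wraps the caller's destination pointer with newBufferWithBytesNoCopy and blits into it. A companion sketch of that path; note that newBufferWithBytesNoCopy requires a page-aligned pointer and length, so this variant assumes a suitably aligned destination (a general implementation would fall back to a shared staging buffer):

    #import <Metal/Metal.h>

    // sketch: read bytes back from a GPU-private MTLBuffer into host memory
    // assumes `data` and `size` satisfy the page-alignment requirements of newBufferWithBytesNoCopy
    static void download_from_private_sketch(id<MTLDevice> device, id<MTLCommandQueue> queue,
                                             id<MTLBuffer> buf_src, size_t src_offset,
                                             void * data, size_t size) {
        @autoreleasepool {
            id<MTLBuffer> buf_dst = [device newBufferWithBytesNoCopy:data
                                                              length:size
                                                             options:MTLResourceStorageModeShared
                                                         deallocator:nil];

            id<MTLCommandBuffer>      cmd_buf = [queue commandBuffer];
            id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

            [encoder copyFromBuffer:buf_src
                       sourceOffset:src_offset
                           toBuffer:buf_dst
                  destinationOffset:0
                               size:size];
            [encoder endEncoding];

            [cmd_buf commit];
            [cmd_buf waitUntilCompleted]; // must finish before the caller reads `data`

            [buf_dst release];
        }
    }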
@@ -6044,9 +6150,21 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
 
     id<MTLDevice> device = ctx_dev->mtl_device;
 
+#if 1
+    // TODO: tmp hack
+    static void * p_base = (void *) 0x000000400ULL;
+
+    ctx->all_data = p_base;
+
+    p_base = (void *) ((uintptr_t) p_base + size_aligned);
+#else
     ctx->all_data = ggml_metal_host_malloc(size_aligned);
+#endif
     ctx->all_size = size_aligned;
-    ctx->owned = true;
+
+    ctx->device = device;
+    ctx->queue  = ctx_dev->mtl_queue;
+
     ctx->n_buffers = 1;
 
     if (ctx->all_data != NULL) {
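The `// TODO: tmp hack` above replaces a real host allocation with a synthetic address: each buffer claims a unique, non-overlapping range of a fake address space, so ggml can keep doing pointer arithmetic on tensor->data while the actual bytes live only in the private MTLBuffer. A sketch of the bookkeeping this enables (illustrative, not the actual allocator):

    #include <stdint.h>
    #include <stddef.h>

    // sketch: hand out fake, never-dereferenced base addresses for device-only buffers
    static void * g_p_base = (void *) 0x000000400ULL;

    static void * claim_fake_range(size_t size_aligned) {
        void * base = g_p_base;
        g_p_base = (void *) ((uintptr_t) g_p_base + size_aligned);
        return base;
    }

    // later, a tensor address inside [base, base + size) is mapped back to
    // (metal buffer, offset into it), which is what ggml_metal_get_buffer relies on
    static size_t fake_addr_to_offset(const void * tensor_data, const void * base) {
        return (uintptr_t) tensor_data - (uintptr_t) base;
    }

A static, monotonically increasing base works here only because ranges are never handed back; the commit itself marks this as a temporary hack.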
@@ -6055,10 +6173,14 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
         ctx->buffers[0].metal = nil;
 
         if (size_aligned > 0) {
+#if 1
+            ctx->buffers[0].metal = [device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate];
+#else
             ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
                                                               length:size_aligned
                                                              options:MTLResourceStorageModeShared
                                                          deallocator:nil];
+#endif
         }
     }
 
@@ -6092,13 +6214,7 @@ static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_ty
 }
 
 static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
-    // TODO: not sure why, but without setting this to `false`, op offloading does not work correctly
-    // to reproduce, do the following:
-    //
-    //   build with: cmake -DGGML_BLAS=OFF -DGGML_METAL=ON
-    //   run:        ./bin/llama-cli -m ggml-model-mxfp4.gguf -p "$(printf 'hello %.0s' {1..100})" --n-cpu-moe 10
-    //
-    return false;
+    return true;
 
     GGML_UNUSED(buft);
 }
@@ -6111,7 +6227,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
         /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
         /* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size,
         /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
-        /* .is_host = */ ggml_backend_metal_buffer_type_is_host,
+        /* .is_host = */ NULL,
     },
     /* .device = */ &g_ggml_backend_metal_device,
     /* .context = */ NULL,
@@ -6130,7 +6246,7 @@ static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void)
 static struct ggml_backend_buffer_type ggml_backend_buffer_from_ptr_type_metal = {
     /* .iface = */ {
         /* .get_name = */ ggml_backend_metal_buffer_from_ptr_type_get_name,
-        /* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer,
+        /* .alloc_buffer = */ NULL,
         /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
         /* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size,
         /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
@@ -6199,10 +6315,6 @@ static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, st
     struct ggml_backend_metal_context * ctx = backend->context;
     struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
 
-    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
-
-    struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *)buf->context;
-
     @autoreleasepool {
         id<MTLDevice> device = ctx_dev->mtl_device;
 
@@ -6211,46 +6323,39 @@ static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, st
                                                        length:size
                                                       options:MTLResourceStorageModeShared];
 
-        size_t tensor_offset = (uintptr_t)tensor->data + offset;
+        size_t buf_dst_offset = 0;
+        id<MTLBuffer> buf_dst = ggml_metal_get_buffer(tensor, &buf_dst_offset);
 
-        // find which Metal buffer contains this tensor - we will copy into that buffer
-        for (int i = 0; i < buf_ctx->n_buffers; i++) {
-            if (tensor_offset >= (uintptr_t) buf_ctx->buffers[i].data &&
-                tensor_offset <  (uintptr_t) buf_ctx->buffers[i].data + buf_ctx->buffers[i].size) {
-
-                const size_t buf_dst_offset = tensor_offset - (uintptr_t) buf_ctx->buffers[i].data;
-
-                id<MTLBuffer> buf_dst = buf_ctx->buffers[i].metal;
-
-                // queue the copy operation into the queue of the Metal context
-                // this will be queued at the end, after any currently ongoing GPU operations
-                id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
-                [cmd_buf enqueue];
-
-                id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
-
-                [encoder copyFromBuffer:buf_src
-                           sourceOffset:0
-                               toBuffer:buf_dst
-                      destinationOffset:buf_dst_offset
-                                   size:size];
-
-                [encoder endEncoding];
-                [cmd_buf commit];
-
-                // do not wait here for completion
-                //[cmd_buf waitUntilCompleted];
-
-                // instead, remember a reference to the command buffer and wait for it later if needed
-                [ctx->cmd_bufs_ext addObject:cmd_buf];
-                ctx->cmd_buf_ext_last = cmd_buf;
-
-                [cmd_buf retain];
-                return;
-            }
+        if (buf_dst == nil) {
+            GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name);
         }
 
-        GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name);
+        buf_dst_offset += offset;
+
+        // queue the copy operation into the queue of the Metal context
+        // this will be queued at the end, after any currently ongoing GPU operations
+        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
+        [cmd_buf enqueue];
+
+        id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
+
+        [encoder copyFromBuffer:buf_src
+                   sourceOffset:0
+                       toBuffer:buf_dst
+              destinationOffset:buf_dst_offset
+                           size:size];
+
+        [encoder endEncoding];
+        [cmd_buf commit];
+
+        // do not wait here for completion
+        //[cmd_buf waitUntilCompleted];
+
+        // instead, remember a reference to the command buffer and wait for it later if needed
+        [ctx->cmd_bufs_ext addObject:cmd_buf];
+        ctx->cmd_buf_ext_last = cmd_buf;
+
+        [cmd_buf retain];
     }
 }
 
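Unlike the synchronous buffer hooks, set_tensor_async above commits the blit without waiting, so the command buffer must stay alive until the backend synchronizes. A reduced sketch of that bookkeeping, with hypothetical names:

    #import <Metal/Metal.h>

    // sketch: remember in-flight command buffers so a later synchronize can wait on them
    typedef struct {
        NSMutableArray     * cmd_bufs_ext;     // holds (and retains) committed command buffers
        id<MTLCommandBuffer> cmd_buf_ext_last; // most recently committed one
    } metal_ctx_sketch;

    static void commit_async_sketch(metal_ctx_sketch * ctx, id<MTLCommandBuffer> cmd_buf) {
        [cmd_buf commit];                      // no waitUntilCompleted here
        [ctx->cmd_bufs_ext addObject:cmd_buf]; // NSMutableArray retains the buffer
        ctx->cmd_buf_ext_last = cmd_buf;
    }

    static void synchronize_sketch(metal_ctx_sketch * ctx) {
        for (id<MTLCommandBuffer> cmd_buf in ctx->cmd_bufs_ext) {
            [cmd_buf waitUntilCompleted];      // waiting on an already-completed buffer is a no-op
        }
        [ctx->cmd_bufs_ext removeAllObjects];  // drops the retained references
        ctx->cmd_buf_ext_last = nil;
    }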
@@ -6258,10 +6363,6 @@ static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const st
     struct ggml_backend_metal_context * ctx = backend->context;
     struct ggml_backend_metal_device_context * ctx_dev = backend->device->context;
 
-    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
-
-    struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *)buf->context;
-
     @autoreleasepool {
         id<MTLDevice> device = ctx_dev->mtl_device;
 
@@ -6270,41 +6371,39 @@ static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const st
                                                       options:MTLResourceStorageModeShared
                                                   deallocator:nil];
 
-        const size_t tensor_offset = (uintptr_t)tensor->data + offset;
+        size_t buf_src_offset = 0;
+        id<MTLBuffer> buf_src = ggml_metal_get_buffer(tensor, &buf_src_offset);
 
-        // find which buffer contains this tensor data
-        for (int i = 0; i < buf_ctx->n_buffers; i++) {
-            if (tensor_offset >= (uintptr_t) buf_ctx->buffers[i].data &&
-                tensor_offset <  (uintptr_t) buf_ctx->buffers[i].data + buf_ctx->buffers[i].size) {
-
-                const size_t buf_src_offset = tensor_offset - (uintptr_t) buf_ctx->buffers[i].data;
-
-                id<MTLBuffer> buf_src = buf_ctx->buffers[i].metal;
-
-                id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
-                [cmd_buf enqueue];
-
-                id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
-
-                [encoder copyFromBuffer:buf_src
-                           sourceOffset:buf_src_offset
-                               toBuffer:buf_dst
-                      destinationOffset:0
-                                   size:size];
-
-                [encoder endEncoding];
-                [cmd_buf commit];
-                //[cmd_buf waitUntilCompleted];
-
-                [ctx->cmd_bufs_ext addObject:cmd_buf];
-                ctx->cmd_buf_ext_last = cmd_buf;
-
-                [cmd_buf retain];
-                return;
-            }
+        if (buf_src == nil) {
+            GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name);
         }
 
-        GGML_ABORT("%s: failed to find buffer for tensor '%s'\n", __func__, tensor->name);
+        buf_src_offset += offset;
+
+        // queue the copy operation into the queue of the Metal context
+        // this will be queued at the end, after any currently ongoing GPU operations
+        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
+        [cmd_buf enqueue];
+
+        id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
+
+        [encoder copyFromBuffer:buf_src
+                   sourceOffset:buf_src_offset
+                       toBuffer:buf_dst
+              destinationOffset:0
+                           size:size];
+
+        [encoder endEncoding];
+        [cmd_buf commit];
+
+        // do not wait here for completion
+        //[cmd_buf waitUntilCompleted];
+
+        // instead, remember a reference to the command buffer and wait for it later if needed
+        [ctx->cmd_bufs_ext addObject:cmd_buf];
+        ctx->cmd_buf_ext_last = cmd_buf;
+
+        [cmd_buf retain];
    }
 }
 
@@ -6513,8 +6612,8 @@ static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct g
     props->type = ggml_backend_metal_device_get_type(dev);
     ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
     props->caps = (struct ggml_backend_dev_caps) {
-        /* .async = */ false,
-        /* .host_buffer = */ false,
+        /* .async = */ true,
+        /* .host_buffer = */ true,
         /* .buffer_from_host_ptr = */ true,
         /* .events = */ false,
     };
@@ -6554,7 +6653,6 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
 
     ctx->all_data = ptr;
     ctx->all_size = size;
-    ctx->owned = false;
     ctx->n_buffers = 0;
 
     const size_t size_page = sysconf(_SC_PAGESIZE);
@@ -6577,6 +6676,9 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
 
     id<MTLDevice> device = ctx_dev->mtl_device;
 
+    ctx->device = device;
+    ctx->queue  = ctx_dev->mtl_queue;
+
     // the buffer fits into the max buffer size allowed by the device
     if (size_aligned <= device.maxBufferLength) {
         ctx->buffers[ctx->n_buffers].data = ptr;