mirror of https://github.com/ggml-org/llama.cpp.git
cont : put all buffers in the same virtual address space
ggml-ci
@@ -43,6 +43,9 @@ static const NSInteger MTLGPUFamilyMetal3_GGML = 5001;
 static struct ggml_backend_reg    g_ggml_backend_metal_reg;
 static struct ggml_backend_device g_ggml_backend_metal_device;
 
+// virtual address for GPU memory allocations
+static atomic_uintptr_t g_addr_device = 0x000000400ULL;
+
 // information about a Metal device
 // note: assumes single GPU device - the default one
 // TODO: support multiple GPU devices
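
For readers following along, here is a tiny standalone sketch of the scheme this hunk introduces: a single atomic counter hands every buffer its own, never-overlapping slice of a fake "device" address space. The counter name matches the diff; everything else (the helper, the main) is illustrative only, not backend code.

#include <stdatomic.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

// starts at a small non-NULL value, like g_addr_device in the diff
static atomic_uintptr_t g_addr_device = 0x000000400ULL;

// reserve `size` bytes of address space; returned ranges never overlap
// because the fetch-add is atomic across threads
static void * reserve_range(size_t size) {
    return (void *) atomic_fetch_add_explicit(&g_addr_device, size, memory_order_relaxed);
}

int main(void) {
    void * a = reserve_range(1024);
    void * b = reserve_range(4096);

    printf("a = %p, b = %p\n", a, b); // b begins exactly 1024 bytes after a
    return 0;
}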
@@ -1787,9 +1790,11 @@ struct ggml_backend_metal_buffer {
 };
 
 struct ggml_backend_metal_buffer_context {
-    void * all_data;
+    void * all_data; // for shared buffers
     size_t all_size;
 
+    void * base_addr;
+
     // if false, the Metal buffer data is allocated in private GPU memory and is not shared with the host
     bool is_shared;
 
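
The context now carries two distinct pointers: base_addr is what ggml sees (tensor->data points into it), while all_data is the host allocation that actually backs shared buffers. Below is a trimmed-down, hypothetical helper that makes the implied translation explicit; the struct is reduced and shared_host_ptr is a made-up name, not backend API.

#include <stdbool.h>
#include <stddef.h>

// trimmed-down version of the buffer context from the diff
struct metal_buffer_ctx {
    void * all_data;  // host allocation - only meaningful for shared buffers
    size_t all_size;  // size of the reserved range
    void * base_addr; // this buffer's slice of the shared virtual address space
    bool   is_shared; // false -> data lives in private GPU memory
};

// map a pointer from the virtual range (e.g. tensor->data) to the host memory
// that actually backs it - valid only when ctx->is_shared is true
static void * shared_host_ptr(const struct metal_buffer_ctx * ctx, const void * virt) {
    const ptrdiff_t base_offset = (const char *) virt - (const char *) ctx->base_addr;
    return (char *) ctx->all_data + base_offset;
}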
@@ -6035,33 +6040,42 @@ static void ggml_backend_metal_buffer_shared_free_buffer(ggml_backend_buffer_t b
 }
 
 static void * ggml_backend_metal_buffer_shared_get_base(ggml_backend_buffer_t buffer) {
-    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
+    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *) buffer->context;
 
-    return ctx->all_data;
+    return ctx->base_addr;
 }
 
 static void ggml_backend_metal_buffer_shared_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
-    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
+    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *) tensor->buffer->context;
 
     GGML_ASSERT(ctx->is_shared);
 
-    memset((char *)tensor->data + offset, value, size);
+    const ptrdiff_t base_offset = (char *)tensor->data - (char *)ctx->base_addr;
+    memset((char *) ctx->all_data + base_offset + offset, value, size);
+
+    GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_metal_buffer_shared_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
+    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *) tensor->buffer->context;
 
     GGML_ASSERT(ctx->is_shared);
 
-    memcpy((char *)tensor->data + offset, data, size);
+    const ptrdiff_t base_offset = (char *)tensor->data - (char *)ctx->base_addr;
+    memcpy((char *) ctx->all_data + base_offset + offset, data, size);
+
+    GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_metal_buffer_shared_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
+    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *) tensor->buffer->context;
 
     GGML_ASSERT(ctx->is_shared);
 
-    memcpy(data, (const char *)tensor->data + offset, size);
+    const ptrdiff_t base_offset = (char *)tensor->data - (char *)ctx->base_addr;
+    memcpy(data, (const char *) ctx->all_data + base_offset + offset, size);
+
+    GGML_UNUSED(buffer);
 }
 
 static bool ggml_backend_metal_buffer_shared_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
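
A self-contained toy that mirrors what the shared memset/set/get paths now do: reserve a virtual range, back it with an ordinary host allocation, and route every access through the base_addr offset. All names and sizes are illustrative; as in the backend, the virtual pointers are only used for arithmetic and never dereferenced.

#include <assert.h>
#include <stdatomic.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

static atomic_uintptr_t g_addr_device = 0x000000400ULL; // same starting value as in the diff

int main(void) {
    const size_t size = 1024;

    // host memory that really stores the bytes (plays the role of ctx->all_data)
    char * all_data = malloc(size);

    // this buffer's slice of the shared virtual address space (plays the role of ctx->base_addr)
    char * base_addr = (char *) atomic_fetch_add_explicit(&g_addr_device, size, memory_order_relaxed);

    // a fake tensor whose data pointer lives in the virtual range, 128 bytes in
    char * tensor_data = base_addr + 128;

    // set_tensor path: translate tensor_data back into host memory before writing
    const char src[] = "hello";
    const ptrdiff_t base_offset = tensor_data - base_addr;
    memcpy(all_data + base_offset, src, sizeof(src));

    // get_tensor path: same translation on the read side
    char dst[sizeof(src)];
    memcpy(dst, all_data + base_offset, sizeof(dst));

    assert(memcmp(src, dst, sizeof(dst)) == 0);

    free(all_data);
    return 0;
}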
@@ -6111,7 +6125,7 @@ static void ggml_backend_metal_buffer_private_free_buffer(ggml_backend_buffer_t
 static void * ggml_backend_metal_buffer_private_get_base(ggml_backend_buffer_t buffer) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
-    return ctx->all_data;
+    return ctx->base_addr;
 }
 
 static void ggml_backend_metal_buffer_private_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@@ -6329,14 +6343,14 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
         ctx->all_data = ggml_metal_host_malloc(size_aligned);
         ctx->is_shared = true;
     } else {
-        // virtual address for GPU memory allocations
-        static atomic_uintptr_t addr_device = 0x000000400ULL;
-
-        ctx->all_data = (void *) atomic_fetch_add_explicit(&addr_device, size_aligned, memory_order_relaxed);
+        // dummy, non-NULL value - not used
+        ctx->all_data = (void *) 0x000000400ULL;
         ctx->is_shared = false;
     }
     ctx->all_size = size_aligned;
 
+    ctx->base_addr = (void *) atomic_fetch_add_explicit(&g_addr_device, size_aligned, memory_order_relaxed);
+
     ctx->device = device;
     ctx->queue  = ctx_dev->mtl_queue;
 
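
A condensed, hypothetical view of the allocation logic this hunk arrives at: shared buffers get real host memory, private buffers get a dummy non-NULL all_data, and both reserve base_addr from the same global counter. Names are simplified from the diff and error handling is omitted.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

static atomic_uintptr_t g_addr_device = 0x000000400ULL;

struct metal_buffer_ctx {
    void * all_data;
    size_t all_size;
    void * base_addr;
    bool   is_shared;
};

static void buffer_ctx_init(struct metal_buffer_ctx * ctx, size_t size_aligned, bool shared) {
    if (shared) {
        // shared buffer: host-visible memory backs the data
        ctx->all_data  = malloc(size_aligned); // stand-in for ggml_metal_host_malloc
        ctx->is_shared = true;
    } else {
        // private buffer: the data lives in GPU memory, all_data is just a non-NULL dummy
        ctx->all_data  = (void *) 0x000000400ULL;
        ctx->is_shared = false;
    }
    ctx->all_size = size_aligned;

    // shared or private, every buffer reserves its own slice of the same virtual address space
    ctx->base_addr = (void *) atomic_fetch_add_explicit(&g_addr_device, size_aligned, memory_order_relaxed);
}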
@@ -6347,7 +6361,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
     ctx->buffers[0].metal = nil;
 
     if (size_aligned > 0) {
-        if (ctx_dev->use_shared_buffers) {
+        if (ctx_dev->use_shared_buffers && shared) {
             ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
                                                               length:size_aligned
                                                              options:MTLResourceStorageModeShared
@@ -6362,7 +6376,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
             }
         }
 
-        ctx->buffers[0].data = ctx->all_data;
+        ctx->buffers[0].data = ctx->base_addr;
     }
 
     if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
@@ -6963,6 +6977,13 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_mapped(ggml_backen
         size_aligned += (size_page - (size_aligned % size_page));
     }
 
+    // note: I think this is preferred because we want to have both the mapped and non-mapped buffers in the same
+    //       address space. not sure if there are any side-effects from this though.
+    //ctx->base_addr = (void *) atomic_fetch_add_explicit(&g_addr_device, size_aligned, memory_order_relaxed);
+    //
+    // note2: the above does not actually work
+    ctx->base_addr = ptr;
+
     struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)dev->context;
 
     GGML_ASSERT(ctx_dev->mtl_device != nil);
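
To contrast with the allocation path: a mapped buffer wraps host memory the caller already owns, so (per the notes in the hunk above) it keeps that pointer as its base address instead of reserving a fresh range. A hedged one-function sketch of the two cases; the function name is made up, and the reason the reserved-range variant fails is only what the commit comment states.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static atomic_uintptr_t g_addr_device = 0x000000400ULL;

// allocated buffers carve a fresh range out of the shared virtual address space;
// mapped buffers keep the caller's host pointer as-is, since the commit notes that
// reserving a separate range for them "does not actually work"
static void * pick_base_addr(bool mapped, void * ptr, size_t size_aligned) {
    if (mapped) {
        return ptr;
    }
    return (void *) atomic_fetch_add_explicit(&g_addr_device, size_aligned, memory_order_relaxed);
}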
@@ -6974,7 +6995,7 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_mapped(ggml_backen
 
     // the buffer fits into the max buffer size allowed by the device
     if (size_aligned <= device.maxBufferLength) {
-        ctx->buffers[ctx->n_buffers].data = ptr;
+        ctx->buffers[ctx->n_buffers].data = ctx->base_addr;
         ctx->buffers[ctx->n_buffers].size = size;
         ctx->buffers[ctx->n_buffers].metal = nil;
 
@@ -7000,7 +7021,7 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_mapped(ggml_backen
         for (size_t i = 0; i < size; i += size_step) {
             const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
 
-            ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) ptr + i);
+            ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) ctx->base_addr + i);
             ctx->buffers[ctx->n_buffers].size = size_step_aligned;
             ctx->buffers[ctx->n_buffers].metal = nil;
 