Mirror of https://github.com/ggml-org/llama.cpp.git
fix: use vm_allocate to allocate CPU backend buffer on macOS (#9875)

* fix: use `vm_allocate` to allocate CPU backend buffer on macOS
* fix: switch to `posix_memalign` to keep existing `free()` usages working
* feat: move `GGML_ALIGNED_MALLOC` to `ggml-backend-impl.h`, add support for `vm_allocate` on macOS
* style: formatting
* fix: move const outside of `#ifndef`
* style: formatting
* fix: unused var
* fix: transform `GGML_ALIGNED_MALLOC` and `GGML_ALIGNED_FREE` into functions and add them to `ggml-impl.h`
* fix: unused var
* fix: page align to `GGUF_DEFAULT_ALIGNMENT`
* fix: page align to `TENSOR_ALIGNMENT`
* fix: convert `TENSOR_ALIGNMENT` to a macro
* fix: increase page size to `32` on iOS
* fix: iOS page size
* fix: `hbw_posix_memalign` alignment
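To see why `vm_allocate` satisfies the alignment requirement, here is a minimal standalone sketch (macOS-only, illustration rather than code from this commit): `vm_allocate` hands back whole zero-filled pages, so the returned address is page-aligned and therefore comfortably exceeds the 32-byte `TENSOR_ALIGNMENT` that gguf mmap requires.

```c
// Standalone illustration (macOS only); compile with: cc check_align.c
#include <assert.h>
#include <stdio.h>
#include <unistd.h>
#include <mach/mach.h>

int main(void) {
    const vm_size_t size = 1 << 20; // request 1 MiB
    vm_address_t addr = 0;

    // vm_allocate returns fresh zero-filled pages at a kernel-chosen address
    kern_return_t kr = vm_allocate(mach_task_self(), &addr, size, VM_FLAGS_ANYWHERE);
    if (kr != KERN_SUCCESS) {
        fprintf(stderr, "vm_allocate failed: %d\n", (int) kr);
        return 1;
    }

    // the region is page-aligned, which implies 32-byte tensor alignment
    const long page_size = sysconf(_SC_PAGESIZE);
    assert(addr % (vm_address_t) page_size == 0);
    printf("got %lu bytes at %p (page size %ld)\n",
           (unsigned long) size, (void *) addr, page_size);

    // unlike free(), the deallocation call must be told the region's size
    vm_deallocate(mach_task_self(), addr, size);
    return 0;
}
```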
ggml/src/ggml-backend.cpp
@@ -682,8 +682,6 @@ ggml_backend_t ggml_backend_init_best(void) {
 
 // backend CPU
 
-static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
-
 static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
     return "CPU";
 
@@ -702,7 +700,7 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
 }
 
 static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    free(buffer->context);
+    ggml_aligned_free(buffer->context, buffer->size);
 }
 
 static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
@@ -770,8 +768,8 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
 }
 
 static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    size += TENSOR_ALIGNMENT;   // malloc may return an address that is not aligned
-    void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
+    void * data = ggml_aligned_malloc(size);
     if (data == NULL) {
         GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
         return NULL;
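For context on the removed lines: `malloc` guarantees nothing about 32-byte alignment, so the old code over-allocated by `TENSOR_ALIGNMENT` and relied on the base pointer being rounded up later (in `ggml_backend_cpu_buffer_get_base`), keeping the raw pointer in `buffer->context` so a plain `free()` still worked. The general idiom looks roughly like this sketch (simplified, not code from this repo):

```c
#include <stdint.h>
#include <stdlib.h>

// Round a pointer up to the next multiple of `align` (a power of two).
static void * base_rounded_up(void * raw, size_t align) {
    uintptr_t p = (uintptr_t) raw;
    return (void *) ((p + align - 1) & ~(uintptr_t)(align - 1));
}

// Over-allocate by `align` so the rounded-up pointer still has `size`
// usable bytes behind it. The raw pointer must be kept somewhere,
// since only it may be passed to free() later.
static void * alloc_overaligned(size_t size, size_t align, void ** raw_out) {
    void * raw = malloc(size + align);
    if (raw == NULL) {
        return NULL;
    }
    *raw_out = raw;
    return base_rounded_up(raw, align);
}
```

The commit drops this dance entirely: `ggml_aligned_malloc` returns an already-aligned pointer, and the matching `ggml_aligned_free` knows how to release it on every platform.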
ggml/src/ggml-impl.h
@@ -19,6 +19,9 @@ extern "C" {
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+// required for mmap as gguf only guarantees 32-byte alignment
+#define TENSOR_ALIGNMENT 32
+
 // static_assert should be a #define, but if it's not,
 // fall back to the _Static_assert C11 keyword.
 // if C99 - static_assert is noop
@@ -196,6 +199,11 @@ struct ggml_cgraph {
 
 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
 
+// Memory allocation
+
+void * ggml_aligned_malloc(size_t size);
+void ggml_aligned_free(void * ptr, size_t size);
+
 #ifdef __cplusplus
 }
 #endif
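Usage of the new pair is straightforward; the one subtlety is that the size travels with the free call, because the macOS `vm_deallocate` path needs the region's extent, not just its address. An illustrative, hypothetical call site (assuming `ggml-impl.h` is on the include path):

```c
#include <string.h>
#include "ggml-impl.h"

// Hypothetical helper: round-trip an aligned scratch buffer.
void scratch_roundtrip(void) {
    const size_t size = 4096;
    void * buf = ggml_aligned_malloc(size);
    if (buf == NULL) {
        return; // failure is already logged by ggml_aligned_malloc
    }
    memset(buf, 0, size);
    ggml_aligned_free(buf, size); // must pass the original size back
}
```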
ggml/src/ggml.c
@@ -35,10 +35,6 @@
 #include <omp.h>
 #endif
 
-#ifdef GGML_USE_METAL
-#include <unistd.h>
-#endif
-
 #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
 #undef GGML_USE_LLAMAFILE
 #endif
@@ -189,6 +185,8 @@ typedef pthread_t ggml_thread_t;
 #endif
 
 #if defined(__APPLE__)
+#include <unistd.h>
+#include <mach/mach.h>
 #include <TargetConditionals.h>
 #endif
 
@@ -386,22 +384,40 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi
 //#define GGML_SOFT_MAX_ACCELERATE
 #endif
 
+
+void * ggml_aligned_malloc(size_t size) {
 #if defined(_MSC_VER) || defined(__MINGW32__)
-#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
-#define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
+    return _aligned_malloc(size, TENSOR_ALIGNMENT);
 #else
-inline static void * ggml_aligned_malloc(size_t size) {
     if (size == 0) {
         GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
         return NULL;
     }
     void * aligned_memory = NULL;
 #ifdef GGML_USE_CPU_HBM
-    int result = hbw_posix_memalign(&aligned_memory, 16, size);
+    int result = hbw_posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
+#elif TARGET_OS_OSX
+    kern_return_t alloc_status = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t *) &aligned_memory, size, VM_FLAGS_ANYWHERE);
+    int result = EFAULT;
+    switch (alloc_status) {
+        case KERN_SUCCESS:
+            result = 0;
+            break;
+        case KERN_INVALID_ADDRESS:
+            result = EINVAL;
+            break;
+        case KERN_NO_SPACE:
+            result = ENOMEM;
+            break;
+        default:
+            result = EFAULT;
+            break;
+    }
 #elif GGML_USE_METAL
-    int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
+    const long page_size = sysconf(_SC_PAGESIZE);
+    int result = posix_memalign(&aligned_memory, MAX(TENSOR_ALIGNMENT, page_size), size);
 #else
-    int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
+    int result = posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
 #endif
     if (result != 0) {
         // Handle allocation failure
@@ -419,14 +435,26 @@ inline static void * ggml_aligned_malloc(size_t size) {
         return NULL;
     }
     return aligned_memory;
+#endif
 }
-#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
-#ifdef GGML_USE_CPU_HBM
-#define GGML_ALIGNED_FREE(ptr)    if(NULL != ptr) hbw_free(ptr)
-#else
-#define GGML_ALIGNED_FREE(ptr)    free(ptr)
-#endif
-#endif
+
+void ggml_aligned_free(void * ptr, size_t size) {
+    GGML_UNUSED(size);
+#if defined(_MSC_VER) || defined(__MINGW32__)
+    _aligned_free(ptr);
+#elif GGML_USE_CPU_HBM
+    if (ptr != NULL) {
+        hbw_free(ptr);
+    }
+#elif TARGET_OS_OSX
+    if (ptr != NULL) {
+        vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ptr, size);
+    }
+#else
+    free(ptr);
+#endif
+}
+
 
 inline static void * ggml_malloc(size_t size) {
     if (size == 0) {
@@ -3869,7 +3897,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
     *ctx = (struct ggml_context) {
         /*.mem_size           =*/ mem_size,
-        /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
+        /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size),
         /*.mem_buffer_owned   =*/ params.mem_buffer ? false : true,
         /*.no_alloc           =*/ params.no_alloc,
         /*.no_alloc_save      =*/ params.no_alloc,
@@ -3909,7 +3937,7 @@ void ggml_free(struct ggml_context * ctx) {
                     __func__, i, ggml_used_mem(ctx));
 
             if (ctx->mem_buffer_owned) {
-                GGML_ALIGNED_FREE(ctx->mem_buffer);
+                ggml_aligned_free(ctx->mem_buffer, ctx->mem_size);
             }
 
             found = true;
@@ -19608,9 +19636,10 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask
 void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
     if (!threadpool) return;
 
+    const int n_threads = threadpool->n_threads_max;
+
 #ifndef GGML_USE_OPENMP
     struct ggml_compute_state* workers = threadpool->workers;
-    const int n_threads = threadpool->n_threads_max;
 
     ggml_mutex_lock(&threadpool->mutex);
 
@@ -19630,8 +19659,9 @@ void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
     ggml_cond_destroy(&threadpool->cond);
 #endif // GGML_USE_OPENMP
 
-    GGML_ALIGNED_FREE(threadpool->workers);
-    GGML_ALIGNED_FREE(threadpool);
+    const size_t workers_size = sizeof(struct ggml_compute_state) * n_threads;
+    ggml_aligned_free(threadpool->workers, workers_size);
+    ggml_aligned_free(threadpool, sizeof(struct ggml_threadpool));
 }
 
 #ifndef GGML_USE_OPENMP
@@ -20063,7 +20093,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
                 struct ggml_cplan * cplan) {
 
     struct ggml_threadpool * threadpool =
-        GGML_ALIGNED_MALLOC(sizeof(struct ggml_threadpool));
+        ggml_aligned_malloc(sizeof(struct ggml_threadpool));
     {
         threadpool->cgraph           = cgraph;
         threadpool->cplan            = cplan;
@@ -20084,7 +20114,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
 
     // Allocate and init workers state
     const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads;
-    struct ggml_compute_state * workers = GGML_ALIGNED_MALLOC(workers_size);
+    struct ggml_compute_state * workers = ggml_aligned_malloc(workers_size);
 
     memset(workers, 0, workers_size);
     for (int j = 0; j < tpp->n_threads; j++) {
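The threadpool hunks show the knock-on effect of the sized free: `n_threads` is now read before the `#ifndef GGML_USE_OPENMP` block, because the free path must recompute `workers_size` under either threading backend. Reduced to its essentials (stand-in types and names, for illustration only):

```c
#include <stddef.h>

// Stand-ins for the real ggml declarations, for illustration only.
typedef struct { int dummy; } worker_state;
void * ggml_aligned_malloc(size_t size);
void   ggml_aligned_free(void * ptr, size_t size);

typedef struct {
    worker_state * workers;
    int            n_threads_max;
} pool_t;

void pool_free(pool_t * p) {
    if (!p) return;
    // Capture the element count first: the sized free has to reconstruct
    // the exact byte count that was originally allocated.
    const int n_threads = p->n_threads_max;
    ggml_aligned_free(p->workers, sizeof(worker_state) * n_threads);
    ggml_aligned_free(p, sizeof(pool_t));
}
```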
Author: Gilad S.