mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)
	metal : enable ggml-alloc (#2627)
* metal : enable ggml-alloc

  Make ggml-alloc work with concurrent dispatch.

* style-fix

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
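In short, the Metal backend now exposes the concurrency-friendly node order it discovers, and ggml-alloc can follow that order when planning tensor memory. Below is a minimal sketch of the resulting call sequence, condensed from the llama.cpp hunks further down; the wrapper function and its parameters are hypothetical, while the ggml-alloc/ggml-metal calls are the ones this commit adds or changes.

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-metal.h"

// sketch: measure how much memory a graph needs when its tensors are
// allocated in the concurrency-friendly order found by the Metal backend
static size_t measure_graph_with_concurrency(struct ggml_metal_context * ctx_metal,
                                             struct ggml_cgraph * gf,
                                             size_t alignment) {
    // find ops that can be dispatched concurrently; check_mem = false because
    // tensor data pointers are not assigned yet during the measure pass
    ggml_metal_graph_find_concurrency(ctx_metal, gf, false);

    // hand the discovered order to a measuring allocator, then measure
    struct ggml_allocr * measure = ggml_allocr_new_measure(alignment);
    ggml_allocr_set_parse_seq(measure,
                              ggml_metal_get_concur_list(ctx_metal),
                              ggml_metal_if_optimized(ctx_metal));

    size_t needed = ggml_allocr_alloc_graph(measure, gf);
    ggml_allocr_free(measure);
    return needed;
}

The real allocator created over the compute buffer with ggml_allocr_new is then given the same parse sequence, which is exactly what the second llama.cpp hunk below does.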
ggml-alloc.c (25 changed lines)

@@ -67,6 +67,8 @@ struct ggml_allocr {
     struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
     size_t max_size;
     bool measure;
+    int parse_seq[GGML_MAX_NODES];
+    bool has_parse_seq;
 
 #ifdef GGML_ALLOCATOR_DEBUG
     struct ggml_tensor * allocated_tensors[1024];
@@ -229,6 +231,17 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
     alloc->n_free_blocks++;
 }
 
+void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) {
+    int pos = 0;
+    for (int i = 0; i < n; i++) {
+        if (list[i] != -1) {
+            alloc->parse_seq[pos] = list[i];
+            pos++;
+        }
+    }
+    alloc->has_parse_seq = true;
+}
+
 void ggml_allocr_reset(struct ggml_allocr * alloc) {
     alloc->n_free_blocks = 1;
     size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
@@ -248,6 +261,8 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
         /*.hash_table    = */ {{0}},
         /*.max_size      = */ 0,
         /*.measure       = */ false,
+        /*.parse_seq     = */ {0},
+        /*.has_parse_seq = */ false,
 #ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ = {0},
 #endif
@@ -275,6 +290,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
         /*.hash_table    = */ {{0}},
         /*.max_size      = */ 0,
         /*.measure       = */ true,
+        /*.parse_seq     = */ {0},
+        /*.has_parse_seq = */ false,
 #ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ = {0},
 #endif
@@ -473,7 +490,13 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
                 allocate_node(alloc, input);
             }
         }
-        for (int i = 0; i < gf->n_nodes; i++) {
+        for (int ind = 0; ind < gf->n_nodes; ind++) {
+            int i;
+            if (alloc->has_parse_seq) {
+                i = alloc->parse_seq[ind];
+            } else {
+                i = ind;
+            }
             struct ggml_tensor * node = gf->nodes[i];
 
             // allocate parents (leafs)
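The new setter copies only the non-negative entries of the caller's list, so a list that uses -1 as a separator (as the Metal concur_list passed in by llama.cpp below appears to) still yields a plain sequence of node indices for ggml_allocr_alloc_graph to follow. A small illustrative sketch with made-up indices for a 4-node graph:

#include "ggml.h"
#include "ggml-alloc.h"

// illustrative only: reorder allocation for a 4-node graph
static void set_custom_order(struct ggml_allocr * alloc) {
    // the -1 entry is dropped by ggml_allocr_set_parse_seq,
    // leaving parse_seq = {2, 0, 1, 3}
    int order[] = { 2, 0, -1, 1, 3 };
    ggml_allocr_set_parse_seq(alloc, order, (int) (sizeof(order) / sizeof(order[0])));
    // a later ggml_allocr_alloc_graph(alloc, gf) now visits gf->nodes[2],
    // gf->nodes[0], gf->nodes[1], gf->nodes[3] in that order
}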
ggml-alloc.h

@@ -10,6 +10,10 @@ extern "C" {
 GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
 GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
 
+// tell the allocator to parse nodes following the order described in the list
+// you should call this if your graph is optimized to execute out-of-order
+GGML_API void   ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
+
 GGML_API void   ggml_allocr_free(struct ggml_allocr * alloc);
 GGML_API bool   ggml_allocr_is_measure(struct ggml_allocr * alloc);
 GGML_API void   ggml_allocr_reset(struct ggml_allocr * alloc);
ggml-metal.h

@@ -63,10 +63,13 @@ void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *
 
 // try to find operations that can be run concurrently in the graph
 // you should run it again if the topology of your graph changes
-void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
+void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem);
 
-// if the graph has been optimized for concurrently dispatch
-bool ggml_metal_if_optimized(struct ggml_metal_context * ctx);
+// if the graph has been optimized for concurrent dispatch, returns the length of the concur_list (0 if not)
+int ggml_metal_if_optimized(struct ggml_metal_context * ctx);
+
+// output the concur_list for ggml_alloc
+int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
 
 // same as ggml_graph_compute but uses Metal
 // creates gf->n_threads command buffers in parallel

ggml-metal.m (15 changed lines)

@@ -236,11 +236,12 @@ void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
     ctx->n_cb = n_cb;
 }
 
-bool ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
-    if (ctx->concur_list_len) {
-        return true;
-    }
-    return false;
+int ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
+    return ctx->concur_list_len;
 }
 
+int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
+    return ctx->concur_list;
+}
+
 // finds the Metal buffer that contains the tensor data on the GPU device
@@ -383,7 +384,7 @@ void ggml_metal_get_tensor(
 
 void ggml_metal_graph_find_concurrency(
         struct ggml_metal_context * ctx,
-        struct ggml_cgraph * gf) {
+        struct ggml_cgraph * gf, bool check_mem) {
     int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
     int nodes_unused[GGML_MAX_CONCUR];
 
@@ -430,7 +431,7 @@ void ggml_metal_graph_find_concurrency(
                         }
                     }
                 }
-                if (exe_flag) {
+                if (exe_flag && check_mem) {
                     // check if nodes[i]'s data will be overwritten by a node before nodes[i].
                     // if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
                     int64_t data_start = (int64_t) gf->nodes[i]->data;
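ggml_metal_if_optimized now doubles as the length of concur_list, so the pair of getters can feed ggml_allocr_set_parse_seq directly; the new check_mem flag only gates the final overwrite check. The sketch below is an assumption about how a caller outside llama.cpp might wire this up; llama.cpp itself passes check_mem = false because it runs the search before tensor data addresses are final (see the next file).

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-metal.h"

// hypothetical helper: apply the Metal concurrency order to an allocator.
// `have_final_addresses` is a caller-side flag, not part of the ggml API.
static void apply_metal_concurrency(struct ggml_metal_context * ctx_metal,
                                    struct ggml_allocr * alloc,
                                    struct ggml_cgraph * gf,
                                    bool have_final_addresses) {
    // only run the memory-overlap check once tensor data pointers are real
    ggml_metal_graph_find_concurrency(ctx_metal, gf, /*check_mem =*/ have_final_addresses);

    int n = ggml_metal_if_optimized(ctx_metal); // length of concur_list, 0 if not optimized
    if (n > 0) {
        ggml_allocr_set_parse_seq(alloc, ggml_metal_get_concur_list(ctx_metal), n);
    }
}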
							
								
								
									
llama.cpp (34 changed lines)

@@ -63,7 +63,7 @@ static void llama_log_callback_default(llama_log_level level, const char * text,
 #define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
 
 
-#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
+#if !defined(GGML_USE_CUBLAS)
 #include "ggml-alloc.h"
 #define LLAMA_USE_ALLOCATOR
 #else
@@ -1846,10 +1846,6 @@ static bool llama_eval_internal(
 
 #ifdef GGML_USE_METAL
     if (lctx.ctx_metal) {
-        // TODO: disabled until #2413 is resolved
-        //if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
-        //    ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
-        //}
         ggml_metal_set_n_cb     (lctx.ctx_metal, n_threads);
         ggml_metal_graph_compute(lctx.ctx_metal, gf);
         ggml_metal_get_tensor   (lctx.ctx_metal, res);
@@ -3287,7 +3283,18 @@ struct llama_context * llama_new_context_with_model(
             int n_past = hparams.n_ctx - n_tokens;
             llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
             ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
-
+#ifdef GGML_USE_METAL
+            if (params.n_gpu_layers > 0) {
+                ctx->ctx_metal = ggml_metal_init(1);
+                if (!ctx->ctx_metal) {
+                    LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
+                    llama_free(ctx);
+                    return NULL;
+                }
+                ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
+                ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
+            }
+#endif
             // measure memory requirements for the graph
             size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
 
@@ -3305,6 +3312,11 @@ struct llama_context * llama_new_context_with_model(
 
             ctx->buf_alloc.resize(alloc_size);
             ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
+#ifdef GGML_USE_METAL
+            if (ctx->ctx_metal) {
+                ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
+            }
+#endif
         }
 #else
         ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
@@ -3319,13 +3331,6 @@ struct llama_context * llama_new_context_with_model(
 #ifdef GGML_USE_METAL
     if (params.n_gpu_layers > 0) {
         // this allocates all Metal resources and memory buffers
-        ctx->ctx_metal = ggml_metal_init(1);
-
-        if (!ctx->ctx_metal) {
-            LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
-            llama_free(ctx);
-            return NULL;
-        }
 
         void * data_ptr  = NULL;
         size_t data_size = 0;
@@ -3354,8 +3359,7 @@ struct llama_context * llama_new_context_with_model(
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.addr, ctx->buf_alloc.size, 0));
 #undef LLAMA_METAL_CHECK_BUF
     }
 #endif
Author: Shouzheng Liu