mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	move BLAS to a separate backend (#6210)
* move BLAS to a separate backend * rename GGML_USE_OPENBLAS to GGML_USE_BLAS * alloc : reuse same buffer when the same buffer type is used multiple times * set number of threads automatically for openblas and blis * sched : print assignments when GGML_SCHED_DEBUG env variable is set * sched : allow ops with weights on an incompatible buffer type This will cause the weight to be copied to a backend that supports the op, which is very costly. The weight should have been stored in a buffer of a backend that can run the op, but llama.cpp cannot do this automatically at the moment. --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
		
							
								
								
									
										98
									
								
								ggml-alloc.c
									
									
									
									
									
								
							
							
						
						
									
										98
									
								
								ggml-alloc.c
									
									
									
									
									
								
							| @@ -339,6 +339,7 @@ struct hash_node { | ||||
| }; | ||||
|  | ||||
| struct tensor_alloc { | ||||
|     int buffer_id; | ||||
|     size_t offset; | ||||
|     size_t size_max; // 0 = pre-allocated, unused, or view | ||||
| }; | ||||
| @@ -349,7 +350,6 @@ struct leaf_alloc { | ||||
| }; | ||||
|  | ||||
| struct node_alloc { | ||||
|     int buffer_id; | ||||
|     struct tensor_alloc dst; | ||||
|     struct tensor_alloc src[GGML_MAX_SRC]; | ||||
| }; | ||||
| @@ -386,8 +386,19 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs | ||||
|     for (int i = 0; i < n_bufs; i++) { | ||||
|         galloc->bufts[i] = bufts[i]; | ||||
|         galloc->buffers[i] = NULL; | ||||
|         size_t alignment = ggml_backend_buft_get_alignment(bufts[i]); | ||||
|         galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment); | ||||
|  | ||||
|         // check if the same buffer type is used multiple times and reuse the same allocator | ||||
|         for (int j = 0; j < i; j++) { | ||||
|             if (bufts[i] == bufts[j]) { | ||||
|                 galloc->buf_tallocs[i] = galloc->buf_tallocs[j]; | ||||
|                 break; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         if (galloc->buf_tallocs[i] == NULL) { | ||||
|             size_t alignment = ggml_backend_buft_get_alignment(bufts[i]); | ||||
|             galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment); | ||||
|         } | ||||
|     } | ||||
|     galloc->n_buffers = n_bufs; | ||||
|  | ||||
| @@ -405,10 +416,30 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) { | ||||
|  | ||||
|     for (int i = 0; i < galloc->n_buffers; i++) { | ||||
|         if (galloc->buffers != NULL) { | ||||
|             ggml_backend_buffer_free(galloc->buffers[i]); | ||||
|             // skip if already freed | ||||
|             bool freed = false; | ||||
|             for (int j = 0; j < i; j++) { | ||||
|                 if (galloc->buffers[j] == galloc->buffers[i]) { | ||||
|                     freed = true; | ||||
|                     break; | ||||
|                 } | ||||
|             } | ||||
|             if (!freed) { | ||||
|                 ggml_backend_buffer_free(galloc->buffers[i]); | ||||
|             } | ||||
|         } | ||||
|         if (galloc->buf_tallocs != NULL) { | ||||
|             ggml_dyn_tallocr_free(galloc->buf_tallocs[i]); | ||||
|             // skip if already freed | ||||
|             bool freed = false; | ||||
|             for (int j = 0; j < i; j++) { | ||||
|                 if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) { | ||||
|                     freed = true; | ||||
|                     break; | ||||
|                 } | ||||
|             } | ||||
|             if (!freed) { | ||||
|                 ggml_dyn_tallocr_free(galloc->buf_tallocs[i]); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
| @@ -511,17 +542,18 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor | ||||
|     } | ||||
| } | ||||
|  | ||||
| static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) { | ||||
| static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) { | ||||
|     // graph outputs are never freed | ||||
|     if (node->flags & GGML_TENSOR_FLAG_OUTPUT) { | ||||
|         AT_PRINTF("not freeing output %s\n", node->name); | ||||
|         return; | ||||
|     } | ||||
|  | ||||
|     struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id]; | ||||
|     ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id]; | ||||
|     struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); | ||||
|     size_t offset = hn->offset; | ||||
|     int buffer_id = hn->buffer_id; | ||||
|     struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id]; | ||||
|     ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id]; | ||||
|     size_t size = ggml_backend_buft_get_alloc_size(buft, node); | ||||
|     ggml_dyn_tallocr_free_tensor(alloc, offset, size, node); | ||||
|     hn->allocated = false; | ||||
| @@ -626,11 +658,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr | ||||
|                     AT_PRINTF("view_src %s: %d children, %d views\n", | ||||
|                         view_src->name, view_src_hn->n_children, view_src_hn->n_views); | ||||
|                     if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) { | ||||
|                         ggml_gallocr_free_node(galloc, view_src, buffer_id); | ||||
|                         ggml_gallocr_free_node(galloc, view_src); | ||||
|                     } | ||||
|                 } | ||||
|                 else if (p_hn->allocated) { | ||||
|                     ggml_gallocr_free_node(galloc, parent, buffer_id); | ||||
|                     ggml_gallocr_free_node(galloc, parent); | ||||
|                 } | ||||
|             } | ||||
|             AT_PRINTF("\n"); | ||||
| @@ -674,22 +706,25 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c | ||||
|     for (int i = 0; i < graph->n_nodes; i++) { | ||||
|         struct ggml_tensor * node = graph->nodes[i]; | ||||
|         struct node_alloc * node_alloc = &galloc->node_allocs[i]; | ||||
|         node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i); | ||||
|         if (node->view_src || node->data) { | ||||
|             node_alloc->dst.buffer_id = -1; | ||||
|             node_alloc->dst.offset = SIZE_MAX; | ||||
|             node_alloc->dst.size_max = 0; | ||||
|         } else { | ||||
|             struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); | ||||
|             node_alloc->dst.offset   = hn->offset; | ||||
|             node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node); | ||||
|             node_alloc->dst.buffer_id = hn->buffer_id; | ||||
|             node_alloc->dst.offset    = hn->offset; | ||||
|             node_alloc->dst.size_max  = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node); | ||||
|         } | ||||
|         for (int j = 0; j < GGML_MAX_SRC; j++) { | ||||
|             struct ggml_tensor * src = node->src[j]; | ||||
|             if (!src || src->view_src || src->data) { | ||||
|                 node_alloc->src[j].buffer_id = -1; | ||||
|                 node_alloc->src[j].offset = SIZE_MAX; | ||||
|                 node_alloc->src[j].size_max = 0; | ||||
|             } else { | ||||
|                 struct hash_node * hn = ggml_gallocr_hash_get(galloc, src); | ||||
|                 node_alloc->src[j].buffer_id = hn->buffer_id; | ||||
|                 node_alloc->src[j].offset   = hn->offset; | ||||
|                 node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src); | ||||
|             } | ||||
| @@ -706,9 +741,11 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c | ||||
|         struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf); | ||||
|         galloc->leaf_allocs[i].buffer_id = hn->buffer_id; | ||||
|         if (leaf->view_src || leaf->data) { | ||||
|             galloc->leaf_allocs[i].leaf.buffer_id = -1; | ||||
|             galloc->leaf_allocs[i].leaf.offset = SIZE_MAX; | ||||
|             galloc->leaf_allocs[i].leaf.size_max = 0; | ||||
|         } else { | ||||
|             galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id; | ||||
|             galloc->leaf_allocs[i].leaf.offset = hn->offset; | ||||
|             galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf); | ||||
|         } | ||||
| @@ -716,6 +753,14 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c | ||||
|  | ||||
|     // reallocate buffers if needed | ||||
|     for (int i = 0; i < galloc->n_buffers; i++) { | ||||
|         // if the buffer type is used multiple times, we reuse the same buffer | ||||
|         for (int j = 0; j < i; j++) { | ||||
|             if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) { | ||||
|                 galloc->buffers[i] = galloc->buffers[j]; | ||||
|                 break; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0; | ||||
|         size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]); | ||||
|  | ||||
| @@ -724,6 +769,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c | ||||
| #ifndef NDEBUG | ||||
|             fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); | ||||
| #endif | ||||
|  | ||||
|             ggml_backend_buffer_free(galloc->buffers[i]); | ||||
|             galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size); | ||||
|             if (galloc->buffers[i] == NULL) { | ||||
| @@ -740,7 +786,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) { | ||||
|     return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL); | ||||
| } | ||||
|  | ||||
| static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) { | ||||
| static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) { | ||||
|     int buffer_id = tensor_alloc->buffer_id; | ||||
|     assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max); | ||||
|  | ||||
|     if (tensor->view_src != NULL) { | ||||
| @@ -768,8 +815,8 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * | ||||
|     } | ||||
| } | ||||
|  | ||||
| static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) { | ||||
|     ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id]; | ||||
| static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) { | ||||
|     ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL; | ||||
|     size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node); | ||||
|     return talloc->size_max >= node_size; | ||||
| } | ||||
| @@ -793,7 +840,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph | ||||
|         struct ggml_tensor * node = graph->nodes[i]; | ||||
|         struct node_alloc * node_alloc = &galloc->node_allocs[i]; | ||||
|  | ||||
|         if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) { | ||||
|         if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) { | ||||
| #ifndef NDEBUG | ||||
|             fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name); | ||||
| #endif | ||||
| @@ -805,7 +852,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph | ||||
|             if (src == NULL) { | ||||
|                 continue; | ||||
|             } | ||||
|             if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) { | ||||
|             if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) { | ||||
| #ifndef NDEBUG | ||||
|                 fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name); | ||||
| #endif | ||||
| @@ -846,7 +893,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) | ||||
|     for (int i = 0; i < graph->n_leafs; i++) { | ||||
|         struct ggml_tensor * leaf = graph->leafs[i]; | ||||
|         struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i]; | ||||
|         ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf); | ||||
|         ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf); | ||||
|     } | ||||
|     // nodes | ||||
|     for (int i = 0; i < graph->n_nodes; i++) { | ||||
| @@ -857,9 +904,9 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) | ||||
|             if (src == NULL) { | ||||
|                 continue; | ||||
|             } | ||||
|             ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]); | ||||
|             ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]); | ||||
|         } | ||||
|         ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst); | ||||
|         ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst); | ||||
|     } | ||||
|  | ||||
|     return true; | ||||
| @@ -871,6 +918,15 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) { | ||||
|     if (galloc->buffers[buffer_id] == NULL) { | ||||
|         return 0; | ||||
|     } | ||||
|  | ||||
|     for (int i = 0; i < buffer_id; i++) { | ||||
|         if (galloc->buffers[i] == galloc->buffers[buffer_id]) { | ||||
|             // this buffer is the same as a previous one due to the same buffer type being used multiple times | ||||
|             // only return the buffer size the first time it appears to avoid double counting | ||||
|             return 0; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]); | ||||
| } | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 slaren
					slaren