sync : ggml (backend v2) (#3912)
* sync : ggml (backend v2) (wip)
* sync : migrate examples and llama.cpp to dynamic graphs (wip)
* sync : update tests + fix max op params to 64

ggml-ci

* sync : ggml-cuda

ggml-ci

* llama : fix save/load state context size

ggml-ci

* sync : try to fix build on tvOS
* sync : pass custom graph sizes in training examples
* sync : update graph copies to new ggml API
* sync : update sync-ggml.sh with new files
* scripts : fix header in sync script
* train : fix context size calculations
* llama : increase inference graph size up to 4096 nodes
* train : allocate grads for backward graphs
* train : allocate grads for gb_tmp
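The dynamic-graph migration noted above replaces fixed-size graph structs with graphs allocated inside a ggml_context with an explicit node capacity, which is what makes the larger 4096-node inference graphs possible. A minimal caller-side sketch, using the ggml_new_graph_custom API that this sync relies on; LLAMA_MAX_NODES is a hypothetical constant here, mirroring the 4096 figure above:

// minimal sketch: build an inference graph with an explicit node capacity
// LLAMA_MAX_NODES is hypothetical; it mirrors the 4096-node size above
#include "ggml.h"

#define LLAMA_MAX_NODES 4096

static struct ggml_cgraph * build_graph(struct ggml_context * ctx) {
    // grads == false: inference graph, no gradient storage is reserved
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, LLAMA_MAX_NODES, false);

    // placeholder ops; a real graph is built from the model's tensors
    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);

    ggml_build_forward_expand(gf, ggml_add(ctx, a, b));

    return gf;
}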
1 changed file: ggml-backend.c (591 changed lines)
@@ -1,7 +1,9 @@
-#include "ggml-backend.h"
+#include "ggml-backend-impl.h"
 #include "ggml-alloc.h"
+#include "ggml-impl.h"
 
 #include <assert.h>
+#include <limits.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -33,6 +35,10 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
 }
 
 void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
+    if (buffer == NULL) {
+        return;
+    }
+
     if (buffer->iface.free_buffer != NULL) {
         buffer->iface.free_buffer(buffer);
     }
@@ -43,15 +49,20 @@ size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
     return ggml_backend_get_alignment(buffer->backend);
 }
 
-void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
-    return buffer->iface.get_base(buffer);
-}
-
 size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
     return buffer->size;
 }
 
+void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
+    void * base = buffer->iface.get_base(buffer);
+
+    GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
+
+    return base;
+}
+
 size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    // get_alloc_size is optional, defaults to ggml_nbytes
     if (buffer->iface.get_alloc_size) {
         return buffer->iface.get_alloc_size(buffer, tensor);
     }
@@ -59,12 +70,14 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g
 }
 
 void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    // init_tensor is optional
     if (buffer->iface.init_tensor) {
         buffer->iface.init_tensor(buffer, tensor);
     }
 }
 
 void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    // free_tensor is optional
     if (buffer->iface.free_tensor) {
         buffer->iface.free_tensor(buffer, tensor);
     }
@@ -73,14 +86,21 @@ void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_t
 // backend
 
 ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor) {
-    return tensor->buffer->backend;
+    return tensor->buffer ? tensor->buffer->backend : NULL;
 }
 
 const char * ggml_backend_name(ggml_backend_t backend) {
+    if (backend == NULL) {
+        return "NULL";
+    }
     return backend->iface.get_name(backend);
 }
 
 void ggml_backend_free(ggml_backend_t backend) {
+    if (backend == NULL) {
+        return;
+    }
+
     backend->iface.free(backend);
 }
 
@@ -101,13 +121,23 @@ void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * dat
 }
 
 void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    ggml_get_backend(tensor)->iface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
-    ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor));
+    ggml_backend_t backend = ggml_get_backend(tensor);
+
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(backend != NULL && "tensor backend not set");
+
+    backend->iface.set_tensor_async(backend, tensor, data, offset, size);
+    backend->iface.synchronize(backend);
 }
 
 void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    ggml_get_backend(tensor)->iface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
-    ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor));
+    ggml_backend_t backend = ggml_get_backend(tensor);
+
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(backend != NULL && "tensor backend not set");
+
+    backend->iface.get_tensor_async(backend, tensor, data, offset, size);
+    backend->iface.synchronize(backend);
 }
 
 void ggml_backend_synchronize(ggml_backend_t backend) {
@@ -156,7 +186,7 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
     //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]);
     GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
 
-    // printf("cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src));
+    // fprintf(stderr, "cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src));
 
     if (src == dst) {
         return;
@@ -234,6 +264,8 @@ static ggml_backend_buffer_t ggml_backend_cpu_alloc_buffer(ggml_backend_t backen
     size += TENSOR_ALIGNMENT;   // malloc may return an address that is not aligned
     void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
 
+    GGML_ASSERT(data != NULL && "failed to allocate buffer");
+
     return ggml_backend_buffer_init(backend, cpu_backend_buffer_i, data, size);
 }
 
@@ -271,8 +303,7 @@ static void ggml_backend_cpu_cpy_tensor_from(ggml_backend_t backend, struct ggml
 }
 
 static void ggml_backend_cpu_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
-    // for a backend such as CUDA that can queue async calls, it is ok to do this asynchronously, but it may not be the case for other backends
-    ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src));
+    ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
 
     UNUSED(backend);
 }
@@ -383,3 +414,537 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
 ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size) {
     return ggml_backend_buffer_init(backend_cpu, cpu_backend_buffer_i_from_ptr, ptr, size);
 }
+
+// scheduler
+
+#define GGML_MAX_BACKENDS 4
+#define GGML_MAX_SPLITS 256
+#define GGML_MAX_SPLIT_INPUTS 16
+
+struct ggml_backend_sched_split {
+    ggml_tallocr_t tallocr;
+    int i_start;
+    int i_end;
+    struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
+    int n_inputs;
+    struct ggml_cgraph * graph;
+};
+
+struct ggml_backend_sched {
+    int n_backends;
+    ggml_backend_t backends[GGML_MAX_BACKENDS];
+    ggml_tallocr_t  tallocs[GGML_MAX_BACKENDS];
+
+    ggml_gallocr_t galloc;
+
+    struct ggml_hash_set    hash_set;
+    ggml_tallocr_t *        node_talloc;                     // [hash_set.size]
+    struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // [hash_set.size][GGML_MAX_BACKENDS]
+
+    struct ggml_cgraph * graph;
+    struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
+    int n_splits;
+
+    struct ggml_context * ctx;
+
+    // align context_buffer to GGML_MEM_ALIGN
+    #ifdef _MSC_VER
+    __declspec(align(GGML_MEM_ALIGN))
+    #else
+    __attribute__((aligned(GGML_MEM_ALIGN)))
+    #endif
+    char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + GGML_MAX_SPLITS*sizeof(struct ggml_cgraph)];
+};
+
+#define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
+#define node_allocr(node) sched->node_talloc[hash_id(node)]
+
+static bool ggml_is_view_op(enum ggml_op op) {
+    return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
+}
+
+// returns the priority of the backend, lower is better
+static int sched_backend_prio(ggml_backend_sched_t sched, ggml_backend_t backend) {
+    for (int i = 0; i < sched->n_backends; i++) {
+        if (sched->backends[i] == backend) {
+            return i;
+        }
+    }
+    return INT_MAX;
+}
+
+static int sched_allocr_prio(ggml_backend_sched_t sched, ggml_tallocr_t allocr) {
+    for (int i = 0; i < sched->n_backends; i++) {
+        if (sched->tallocs[i] == allocr) {
+            return i;
+        }
+    }
+    return INT_MAX;
+}
+
+// returns the backend that should be used for the node based on the current locations
+char causes[GGML_DEFAULT_GRAPH_SIZE*4 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug, remove
+static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) {
+    // if the dst tensor is already allocated in a buffer, we must assume that it is critical to keep it there
+    // ie. kv cache updates
+    // note that this doesn't allow fallback to CPU. need to add output tensors to the splits to copy the data back to the original backend.
+    // dst
+    ggml_backend_t cur_backend = ggml_get_backend(node);
+    if (cur_backend != NULL) {
+        sprintf(causes[hash_id(node)], "1.dst");
+        return cur_backend;
+    }
+
+    // view_src
+    if (node->view_src != NULL && ggml_get_backend(node->view_src) != NULL) {
+        sprintf(causes[hash_id(node)], "1.vsrc");
+        return ggml_get_backend(node->view_src);
+    }
+
+    // src
+    int cur_prio = INT_MAX;
+    size_t cur_size = 0;
+
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        const struct ggml_tensor * src = node->src[i];
+        if (src == NULL) {
+            break;
+        }
+        ggml_backend_t src_backend = ggml_get_backend(src);
+        if (src_backend != NULL) {
+            int src_prio = sched_backend_prio(sched, src_backend);
+            size_t src_size = ggml_nbytes(src);
+            if (src_prio < cur_prio && src_size >= cur_size) {
+                cur_prio = src_prio;
+                cur_size = src_size;
+                cur_backend = src_backend;
+                sprintf(causes[hash_id(node)], "1.src%d", i);
+            }
+        }
+    }
+    return cur_backend;
+}
+
+static char * fmt_size(size_t size) {
+    static char buffer[128];
+    if (size >= 1024*1024) {
+        sprintf(buffer, "%zuM", size/1024/1024);
+    } else {
+        sprintf(buffer, "%zuK", size/1024);
+    }
+    return buffer;
+}
+
+static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    int cur_split = 0;
+    for (int i = 0; i < graph->n_nodes; i++) {
+        if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
+            ggml_backend_t split_backend = ggml_tallocr_get_buffer(sched->splits[cur_split].tallocr)->backend;
+            fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend), sched->splits[cur_split].n_inputs);
+            for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
+                fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name, fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
+            }
+            fprintf(stderr, "\n");
+            cur_split++;
+        }
+        struct ggml_tensor * node = graph->nodes[i];
+        if (ggml_is_view_op(node->op)) {
+            continue;
+        }
+        ggml_tallocr_t node_allocr = node_allocr(node);
+        ggml_backend_t node_backend = node_allocr ? ggml_tallocr_get_buffer(node_allocr)->backend : NULL;
+        fprintf(stderr, "node #%3d (%10.10s): %20.20s (%4.4s) [%4.4s %8.8s]:", i, ggml_op_name(node->op), node->name, fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", causes[hash_id(node)]);
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * src = node->src[j];
+            if (src == NULL) {
+                break;
+            }
+            ggml_tallocr_t src_allocr = node_allocr(src);
+            ggml_backend_t src_backend = src_allocr ? ggml_tallocr_get_buffer(src_allocr)->backend : NULL;
+            fprintf(stderr, " %20.20s (%4.4s) [%4.4s %8.8s]", src->name, fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", causes[hash_id(src)]);
+        }
+        fprintf(stderr, "\n");
+    }
+}
+
+// creates a copy of the tensor with the same memory layout
+static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
+    struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        dup->nb[i] = tensor->nb[i];
+    }
+    return dup;
+}
+
+// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
+// TODO: merge passes
+static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    // reset state
+    size_t hash_size = sched->hash_set.size;
+    memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
+    memset(sched->node_talloc,   0, sizeof(sched->node_talloc[0])   * hash_size);
+    memset(sched->node_copies,   0, sizeof(sched->node_copies[0])   * hash_size);
+    sched->n_splits = 0;
+
+    struct ggml_init_params params = {
+        /*.mem_size =   */ sizeof(sched->context_buffer),
+        /*.mem_buffer = */ sched->context_buffer,
+        /*.no_alloc =   */ true
+    };
+
+    if (sched->ctx != NULL) {
+        ggml_free(sched->ctx);
+    }
+
+    sched->ctx = ggml_init(params);
+
+    // pass 1: assign backends to ops with allocated inputs
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        if (node_allocr(leaf) != NULL) {
+            // do not overwrite user assignments
+            continue;
+        }
+        ggml_backend_t leaf_backend = ggml_get_backend(leaf);
+        if (leaf_backend == NULL && leaf->view_src != NULL) {
+            leaf_backend = ggml_get_backend(leaf->view_src);
+        }
+        if (leaf_backend != NULL) {
+            node_allocr(leaf) = ggml_backend_sched_get_tallocr(sched, leaf_backend);
+        }
+    }
+
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        if (node_allocr(node) != NULL) {
+            // do not overwrite user assignments
+            continue;
+        }
+        ggml_backend_t node_backend = sched_backend_from_cur(sched, node);
+        if (node_backend != NULL) {
+            node_allocr(node) = ggml_backend_sched_get_tallocr(sched, node_backend);
+        }
+    }
+    //printf("PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+
+    // pass 2: assign backends to ops from current assignments
+    // TODO:
+    //  - reuse sched_backend_from_cur
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        ggml_tallocr_t node_allocr = node_allocr(node);
+        if (node_allocr == NULL) {
+            int    cur_prio = INT_MAX;
+            size_t cur_size = 0;
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * src = node->src[j];
+                if (src == NULL) {
+                    break;
+                }
+                ggml_tallocr_t src_allocr = node_allocr(src);
+                if (src_allocr != NULL) {
+                    int    src_prio = sched_allocr_prio(sched, src_allocr);
+                    size_t src_size = ggml_nbytes(src);
+                    if (src_prio < cur_prio && src_size >= cur_size) {
+                        cur_prio = src_prio;
+                        cur_size = src_size;
+                        node_allocr = src_allocr;
+                        sprintf(causes[hash_id(node)], "2.src%d", j);
+                    }
+                }
+            }
+            if (node_allocr != NULL) {
+                node_allocr(node) = node_allocr;
+            }
+        }
+    }
+    //printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+
+    // pass 3: assign backends to remaining src from dst (should only be leafs)
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        ggml_tallocr_t node_allocr = node_allocr(node);
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * src = node->src[j];
+            if (src == NULL) {
+                break;
+            }
+            ggml_tallocr_t src_allocr = node_allocr(src);
+            if (src_allocr == NULL) {
+                node_allocr(src) = node_allocr;
+            }
+        }
+    }
+    //printf("PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+
+    // pass 4: split graph, find tensors that need to be copied
+    // TODO:
+    //  - when switching from a less preferred backend to a more preferred backend, check if it is possible to move the switch to an earlier point for the same cost
+    // find first backend
+    int cur_split = 0;
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        if (node->view_src == NULL) {
+            sched->splits[0].tallocr = node_allocr(node);
+            break;
+        }
+    }
+    sched->splits[0].i_start = 0;
+    sched->splits[0].n_inputs = 0;
+    memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
+    ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
+    size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+
+        if (ggml_is_view_op(node->op)) {
+            continue;
+        }
+
+        ggml_tallocr_t node_allocr = node_allocr(node);
+
+        if (node_allocr != cur_allocr) {
+            sched->splits[cur_split].i_end = i;
+            cur_split++;
+            GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
+            sched->splits[cur_split].tallocr = node_allocr;
+            sched->splits[cur_split].i_start = i;
+            sched->splits[cur_split].n_inputs = 0;
+            memset(sched->splits[cur_split].inputs, 0, sizeof(sched->splits[cur_split].inputs)); //HACK
+            cur_allocr = node_allocr;
+            cur_backend_id = sched_allocr_prio(sched, cur_allocr);
+        }
+
+        // find inputs that are not on the same backend
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * src = node->src[j];
+            if (src == NULL) {
+                break;
+            }
+            ggml_tallocr_t src_allocr = node_allocr(src);
+            if (src_allocr != node_allocr) {
+                int n_inputs = sched->splits[cur_split].n_inputs++;
+                GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
+                sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src;
+
+                // create copies
+                size_t id = hash_id(src);
+                if (sched->node_copies[id][cur_backend_id] == NULL) {
+                    struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                    sched->node_copies[id][cur_backend_id] = tensor_copy;
+                    node_allocr(tensor_copy) = cur_allocr;
+                    ggml_backend_t backend = ggml_tallocr_get_buffer(cur_allocr)->backend;
+                    ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
+                }
+                node->src[j] = sched->node_copies[id][cur_backend_id];
+            }
+        }
+    }
+    sched->splits[cur_split].i_end = graph->n_nodes;
+    sched->n_splits = cur_split + 1;
+
+    //fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fflush(stdout);
+
+#if 1
+    // sanity check: all sources should have the same backend as the node
+    for (int i = 0; i < graph->n_nodes; i++) {
+        struct ggml_tensor * node = graph->nodes[i];
+        ggml_tallocr_t node_allocr = node_allocr(node);
+        if (node_allocr == NULL) {
+            fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
+        }
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            struct ggml_tensor * src = node->src[j];
+            if (src == NULL) {
+                break;
+            }
+            ggml_tallocr_t src_allocr = node_allocr(src);
+            if (src_allocr != node_allocr /* && src_backend != NULL */) { // ignore nulls for now
+                fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
+                    node->name, node_allocr ? ggml_backend_name(ggml_tallocr_get_buffer(node_allocr)->backend) : "NULL",
+                    j, src->name, src_allocr ? ggml_backend_name(ggml_tallocr_get_buffer(src_allocr)->backend) : "NULL");
+            }
+        }
+    }
+#endif
+
+    // create copies of the graph for each split
+    // FIXME: avoid this copy, pass split inputs to ggml_gallocr_alloc_graph_n in some other way
+    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_MAX_SPLIT_INPUTS, false);
+    for (int i = 0; i < sched->n_splits; i++) {
+        struct ggml_backend_sched_split * split = &sched->splits[i];
+        split->graph = ggml_graph_view(sched->ctx, graph, split->i_start, split->i_end);
+
+        // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
+        for (int j = 0; j < split->n_inputs; j++) {
+            struct ggml_tensor * input = split->inputs[j];
+            struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
+            input_cpy->src[0] = input;
+            graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
+        }
+
+        for (int j = split->i_start; j < split->i_end; j++) {
+            graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
+        }
+    }
+    sched->graph = graph_copy;
+}
+
+static void sched_alloc_splits(ggml_backend_sched_t sched) {
+    ggml_gallocr_alloc_graph_n(
+        sched->galloc,
+        sched->graph,
+        sched->hash_set,
+        sched->node_talloc);
+}
+
+static void sched_compute_splits(ggml_backend_sched_t sched) {
+    uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
+    uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
+
+    struct ggml_backend_sched_split * splits = sched->splits;
+
+    for (int i = 0; i < sched->n_splits; i++) {
+        struct ggml_backend_sched_split * split = &splits[i];
+        ggml_backend_t split_backend = ggml_tallocr_get_buffer(split->tallocr)->backend;
+        int split_backend_id = sched_backend_prio(sched, split_backend);
+
+        // copy the input tensors to the split backend
+        uint64_t copy_start_us = ggml_time_us();
+        for (int j = 0; j < split->n_inputs; j++) {
+            struct ggml_tensor * input_cpy = sched->node_copies[hash_id(split->inputs[j])][sched_backend_prio(sched, split_backend)];
+            if (split->inputs[j]->buffer == NULL) {
+                if (split->inputs[j]->view_src == NULL) {
+                    fprintf(stderr, "input %s has no buffer and no view_src\n", split->inputs[j]->name);
+                    exit(1);
+                }
+                struct ggml_tensor * view = split->inputs[j];
+                view->backend = view->view_src->backend;
+                view->buffer  = view->view_src->buffer;
+                view->data    = (char *)view->view_src->data + view->view_offs;
+                ggml_backend_buffer_init_tensor(ggml_backend_sched_get_buffer(sched, view->buffer->backend), view);
+            }
+            if (input_cpy->buffer == NULL) {
+                fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name);
+                exit(1);
+            }
+            GGML_ASSERT(split->inputs[j]->buffer->backend != input_cpy->buffer->backend);
+            GGML_ASSERT(input_cpy->buffer->backend == split_backend);
+            ggml_backend_tensor_copy(split->inputs[j], input_cpy);
+        }
+        // ggml_backend_synchronize(split_backend);
+        int64_t copy_end_us = ggml_time_us();
+        copy_us[split_backend_id] += copy_end_us - copy_start_us;
+
+#if 0
+        char split_filename[GGML_MAX_NAME];
+        snprintf(split_filename, GGML_MAX_NAME, "split_%i_%s.dot", i, ggml_backend_name(split_backend));
+        ggml_graph_dump_dot(split->graph, NULL, split_filename);
+#endif
+
+        uint64_t compute_start_us = ggml_time_us();
+        ggml_backend_graph_compute(split_backend, split->graph);
+        // ggml_backend_synchronize(split_backend);
+        uint64_t compute_end_us = ggml_time_us();
+        compute_us[split_backend_id] += compute_end_us - compute_start_us;
+    }
+
+#if 0
+    // per-backend timings
+    fprintf(stderr, "sched_compute_splits times (%d splits):\n", sched->n_splits);
+    for (int i = 0; i < sched->n_backends; i++) {
+        if (copy_us[i] > 0 || compute_us[i] > 0) {
+            fprintf(stderr, "\t%5.5s: %lu us copy, %lu us compute\n", ggml_backend_name(sched->backends[i]), copy_us[i], compute_us[i]);
+        }
+    }
+#endif
+}
+
+static void sched_reset(ggml_backend_sched_t sched) {
+    for (int i = 0; i < sched->n_backends; i++) {
+        ggml_tallocr_reset(sched->tallocs[i]);
+    }
+}
+
+ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends) {
+    GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);
+
+    struct ggml_backend_sched * sched = malloc(sizeof(struct ggml_backend_sched));
+    memset(sched, 0, sizeof(struct ggml_backend_sched));
+
+    fprintf(stderr, "ggml_backend_sched size: %lu KB\n", sizeof(struct ggml_backend_sched)/1024);
+
+    sched->n_backends = n_backends;
+    for (int i = 0; i < n_backends; i++) {
+        sched->backends[i] = backends[i];
+    }
+
+    sched->galloc = ggml_gallocr_new();
+
+    // init measure allocs for each backend
+    for (int i = 0; i < n_backends; i++) {
+        sched->tallocs[i] = ggml_tallocr_new_measure_from_backend(backends[i]);
+    }
+
+    return sched;
+}
+
+void ggml_backend_sched_free(ggml_backend_sched_t sched) {
+    if (sched == NULL) {
+        return;
+    }
+    for (int i = 0; i < sched->n_backends; i++) {
+        ggml_tallocr_free(sched->tallocs[i]);
+    }
+    ggml_gallocr_free(sched->galloc);
+    free(sched->hash_set.keys);
+    free(sched->node_talloc);
+    free(sched->node_copies);
+    free(sched);
+}
+
+void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
+    // initialize hash tables
+    size_t hash_size = measure_graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS;
+    sched->hash_set.size = hash_size;
+    sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size);
+    sched->node_talloc   = malloc(sizeof(sched->node_talloc[0])   * hash_size);
+    sched->node_copies   = malloc(sizeof(sched->node_copies[0])   * hash_size);
+
+    sched_split_graph(sched, measure_graph);
+    sched_alloc_splits(sched);
+
+    // allocate buffers and reset allocators
+    for (int i = 0; i < sched->n_backends; i++) {
+        size_t size = ggml_tallocr_max_size(sched->tallocs[i]);
+        ggml_tallocr_free(sched->tallocs[i]);
+        sched->tallocs[i] = ggml_tallocr_new_from_backend(sched->backends[i], size);
+    }
+
+    sched_reset(sched);
+}
+
+void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    GGML_ASSERT(sched->hash_set.size >= graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
+
+    sched_split_graph(sched, graph);
+    sched_alloc_splits(sched);
+    sched_compute_splits(sched);
+    sched_reset(sched);
+}
+
+ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) {
+    int backend_index = sched_backend_prio(sched, backend);
+    return sched->tallocs[backend_index];
+}
+
+ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) {
+    int backend_index = sched_backend_prio(sched, backend);
+    return ggml_tallocr_get_buffer(sched->tallocs[backend_index]);
+}
+
+void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
+    int backend_index = sched_backend_prio(sched, backend);
+    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+    node_allocr(node) = sched->tallocs[backend_index];
+}