Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-30 08:42:00 +00:00)
			
		
		
		
	cuda : fix defrag with quantized KV (#9319)
This commit is contained in:
		| @@ -1165,6 +1165,11 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st | ||||
|         } | ||||
|     } | ||||
|  | ||||
|+    if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) { | ||||
|+        // since the tensor is pre-allocated, it cannot be moved to another backend | ||||
|+        GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation"); | ||||
|+    } | ||||
|+ | ||||
|     // graph input | ||||
|     if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { | ||||
|         cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU) | ||||
| @@ -1644,7 +1649,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg | ||||
|         sched->prev_leaf_backend_ids = tmp; | ||||
|     } | ||||
|  | ||||
|-    int graph_size = graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2; | ||||
|+    int graph_size = MAX(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies; | ||||
|     if (sched->graph.size < graph_size) { | ||||
|         sched->graph.size = graph_size; | ||||
|         sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *)); | ||||
| @@ -1696,6 +1701,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg | ||||
|             for (int c = 0; c < sched->n_copies; c++) { | ||||
|                 struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c); | ||||
|                 sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id; | ||||
|+                assert(graph_copy->size > graph_copy->n_leafs); | ||||
|                 graph_copy->leafs[graph_copy->n_leafs++] = input_cpy; | ||||
|             } | ||||
|         } | ||||
| @@ -1709,6 +1715,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg | ||||
|                 for (int c = 0; c < sched->n_copies; c++) { | ||||
|                     struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c); | ||||
|                     sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id; | ||||
|+                    assert(graph_copy->size > graph_copy->n_leafs); | ||||
|                     graph_copy->leafs[graph_copy->n_leafs++] = input_cpy; | ||||
|                 } | ||||
|             } | ||||
| @@ -1719,6 +1726,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg | ||||
|     for (int i = 0; i < graph->n_leafs; i++) { | ||||
|         struct ggml_tensor * leaf = graph->leafs[i]; | ||||
|         sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf); | ||||
|+        assert(graph_copy->size > graph_copy->n_leafs); | ||||
|         graph_copy->leafs[graph_copy->n_leafs++] = leaf; | ||||
|     } | ||||
| } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 slaren
					slaren