Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)
	sched : fix multiple evaluations of the same graph with pipeline parallelism (#14855)
ggml-ci
@@ -647,6 +647,7 @@ struct ggml_backend_sched {
     // pipeline parallelism support
     int n_copies;
     int cur_copy;
+    int next_copy;
     ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
     struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
     int n_graph_inputs;
@@ -1433,8 +1434,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
         }
     }
 
-    sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
-
     return GGML_STATUS_SUCCESS;
 }
 
@@ -1535,10 +1534,10 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
 bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
     GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
 
-    ggml_backend_sched_split_graph(sched, measure_graph);
-
     ggml_backend_sched_synchronize(sched);
 
+    ggml_backend_sched_split_graph(sched, measure_graph);
+
     if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
         return false;
     }
@@ -1550,6 +1549,10 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
 
 bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
     GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
+    GGML_ASSERT(!sched->is_alloc);
+
+    sched->cur_copy = sched->next_copy;
+    sched->next_copy = (sched->next_copy + 1) % sched->n_copies;
 
     ggml_backend_sched_split_graph(sched, graph);
 
@@ -1590,7 +1593,7 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
         // if the graph is not already allocated, always use copy 0 after a synchronization
         // this ensures that during generation the same copy is used every time,
        // which avoids changes in the graph that could cause CUDA or other graphs to be disabled
-        sched->cur_copy = 0;
+        sched->next_copy = 0;
     }
 }
 
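The practical effect is easiest to see from the caller's side: the copy index now advances only when a graph is allocated, so repeated evaluations of the same allocated graph all run on the same cur_copy. Below is a minimal sketch of that usage pattern (not code from the commit); it assumes a ggml_backend_sched_t created with pipeline parallelism enabled (n_copies > 1) and an already-built graph supplied by the caller.

#include "ggml.h"
#include "ggml-backend.h"

// Sketch only: `sched` and `graph` come from the caller's own setup.
static enum ggml_status evaluate_graph_twice(ggml_backend_sched_t sched,
                                             struct ggml_cgraph * graph) {
    // Allocation is now the only place the copy index advances:
    //   cur_copy  = next_copy;
    //   next_copy = (next_copy + 1) % n_copies;
    if (!ggml_backend_sched_alloc_graph(sched, graph)) {
        return GGML_STATUS_ALLOC_FAILED;
    }

    // Both evaluations of the already-allocated graph use the same cur_copy.
    // Before this change, compute_splits() advanced cur_copy itself, so a
    // second evaluation could run on a different copy than the one the graph
    // was allocated with.
    enum ggml_status status = ggml_backend_sched_graph_compute_async(sched, graph);
    if (status != GGML_STATUS_SUCCESS) {
        return status;
    }
    status = ggml_backend_sched_graph_compute_async(sched, graph);

    // Wait for all backends to finish; the next_copy = 0 reset in the last
    // hunk only applies when no graph is currently allocated (e.g. after a
    // reset), keeping generation on copy 0 between decodes.
    ggml_backend_sched_synchronize(sched);

    return status;
}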
Diego Devesa