	Added more comprehensive graph node checking
Author: Alan Gray

Changed file: ggml-cuda.cu (68 lines changed: 59 additions, 9 deletions)
@@ -2409,6 +2409,14 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
 #define USE_CUDA_GRAPH
 #endif
 
+struct ggml_graph_node_properties {
+    void * node_address;
+    int node_op;
+    int64_t ne[GGML_MAX_DIMS];
+    size_t nb[GGML_MAX_DIMS];
+    void * src_address[GGML_MAX_SRC];
+};
+
 #ifdef USE_CUDA_GRAPH
 #define MAX_NODES_IN_CUDA_GRAPH 10000
 struct ggml_cuda_graph {
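Note on the struct added above: it caches, per graph node, the node's address, its op, the tensor extents (ne) and strides (nb), and the addresses of its source tensors. As a rough sketch of what keeping one such entry per possible CUDA-graph node costs, the standalone snippet below uses stand-in constants rather than the real macros and assumes GGML_MAX_DIMS == 4 and GGML_MAX_SRC == 10 (the ggml.h values around the time of this change, taken here as an assumption):

#include <cstdint>
#include <cstddef>
#include <cstdio>

constexpr int kMaxDims  = 4;      // stand-in for GGML_MAX_DIMS (assumed value)
constexpr int kMaxSrc   = 10;     // stand-in for GGML_MAX_SRC (assumed value)
constexpr int kMaxNodes = 10000;  // stand-in for MAX_NODES_IN_CUDA_GRAPH

struct node_properties {          // mirrors ggml_graph_node_properties above
    void *   node_address;
    int      node_op;
    int64_t  ne[kMaxDims];
    size_t   nb[kMaxDims];
    void *   src_address[kMaxSrc];
};

int main() {
    // Per-node size and total size of the per-graph cache added to ggml_cuda_graph below.
    std::printf("per node: %zu bytes, whole cache: %.2f MiB\n",
                sizeof(node_properties),
                kMaxNodes * sizeof(node_properties) / (1024.0 * 1024.0));
    return 0;
}

On a typical 64-bit build this works out to roughly 160 bytes per node, i.e. on the order of 1.5 MiB for the 10000-entry cache.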
@@ -2416,15 +2424,42 @@ struct ggml_cuda_graph {
     cudaGraph_t graph = nullptr;
     cudaGraphExec_t instance = nullptr;
     size_t num_nodes = 0;
-    int softmax_ne0 = 0;
     cudaGraphNode_t nodes[MAX_NODES_IN_CUDA_GRAPH];
     cudaKernelNodeParams params[MAX_NODES_IN_CUDA_GRAPH];
     bool disable_due_to_gpu_arch = false;
+    bool disable_due_to_too_many_updates = false;
+    int number_consecutive_updates = 0;
+    ggml_graph_node_properties ggml_graph_properties[MAX_NODES_IN_CUDA_GRAPH];
 };
 #endif
 
 const bool disable_cuda_graphs = (getenv("LLAMACPP_DISABLE_CUDA_GRAPHS") != nullptr);
 
+GGML_CALL static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
+    graph_node_properties->node_address = node;
+    graph_node_properties->node_op = node->op;
+    for(int i=0; i<GGML_MAX_DIMS; i++) {
+        graph_node_properties->ne[i] = node->ne[i];
+        graph_node_properties->nb[i] = node->nb[i];
+    }
+    for(int i=0; i<GGML_MAX_SRC; i++) {
+        graph_node_properties->src_address[i] = node->src[i];
+    }
+}
+
+GGML_CALL static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
+    if(node != graph_node_properties->node_address) return false;
+    if(node->op != graph_node_properties->node_op) return false;
+    for(int i=0; i<GGML_MAX_DIMS; i++) {
+        if(node->ne[i] != graph_node_properties->ne[i]) return false;
+        if(node->nb[i] != graph_node_properties->nb[i]) return false;
+    }
+    for(int i=0; i<GGML_MAX_SRC; i++) {
+        if(node->src[i] != graph_node_properties->src_address[i]) return false;
+    }
+    return true;
+}
+
 GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
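The two helpers added here implement a per-node fingerprint: set_ggml_graph_node_properties snapshots a node's address, op, extents, strides and source addresses, and ggml_graph_node_has_matching_properties compares the current node against the snapshot taken on the previous token. The following is a minimal, self-contained sketch of that store-then-compare pattern, using simplified stand-in types (Node, NodeProperties) instead of the real ggml_tensor:

#include <cstdint>
#include <cstddef>
#include <cstdio>

constexpr int kMaxDims = 4;   // stand-in for GGML_MAX_DIMS
constexpr int kMaxSrc  = 2;   // reduced stand-in for GGML_MAX_SRC

struct Node {                 // simplified stand-in for ggml_tensor
    int      op;
    int64_t  ne[kMaxDims];
    size_t   nb[kMaxDims];
    Node *   src[kMaxSrc];
};

struct NodeProperties {       // stand-in for ggml_graph_node_properties
    const Node * node_address;
    int          node_op;
    int64_t      ne[kMaxDims];
    size_t       nb[kMaxDims];
    const Node * src_address[kMaxSrc];
};

// Snapshot the node's identifying properties (mirrors set_ggml_graph_node_properties).
static void store(const Node & n, NodeProperties & p) {
    p.node_address = &n;
    p.node_op      = n.op;
    for (int i = 0; i < kMaxDims; ++i) { p.ne[i] = n.ne[i]; p.nb[i] = n.nb[i]; }
    for (int i = 0; i < kMaxSrc;  ++i) { p.src_address[i] = n.src[i]; }
}

// Compare against the snapshot from the previous evaluation
// (mirrors ggml_graph_node_has_matching_properties).
static bool matches(const Node & n, const NodeProperties & p) {
    if (&n != p.node_address || n.op != p.node_op) return false;
    for (int i = 0; i < kMaxDims; ++i) {
        if (n.ne[i] != p.ne[i] || n.nb[i] != p.nb[i]) return false;
    }
    for (int i = 0; i < kMaxSrc; ++i) {
        if (n.src[i] != p.src_address[i]) return false;
    }
    return true;
}

int main() {
    Node n{ /*op=*/1, {2, 3, 1, 1}, {4, 8, 24, 24}, {nullptr, nullptr} };
    NodeProperties cached{};
    store(n, cached);                                                         // token 1: record the fingerprint
    std::printf("update required: %s\n", matches(n, cached) ? "no" : "yes");  // "no"
    n.ne[1] = 7;                                                              // shape changed before token 2
    std::printf("update required: %s\n", matches(n, cached) ? "no" : "yes");  // "yes"
    return 0;
}

Checking the tensor addresses as well as the shapes also catches the case where the graph was rebuilt with different tensor objects, not just different sizes.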
@@ -2446,9 +2481,10 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         }
     }
 
-    // Disable CUDA graphs in presence of env var or old GPU.
+    // Disable CUDA graphs in presence of env var, old GPU or use-case which is changing too rapidly.
     // Also disable for multi-gpu for now. TO DO investigate
-    if(disable_cuda_graphs || cuda_graph.disable_due_to_gpu_arch || ggml_backend_cuda_get_device_count() > 1){
+    if(disable_cuda_graphs || cuda_graph.disable_due_to_gpu_arch || cuda_graph.disable_due_to_too_many_updates ||
+       ggml_backend_cuda_get_device_count() > 1){
         use_cuda_graph = false;
     }
 
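For reference, the disable_cuda_graphs flag tested above comes from the LLAMACPP_DISABLE_CUDA_GRAPHS environment variable that predates this change. A minimal standalone sketch of that kill-switch pattern (the variable name disable_cuda_graphs_example is hypothetical, not a llama.cpp identifier):

#include <cstdlib>
#include <cstdio>

// Setting LLAMACPP_DISABLE_CUDA_GRAPHS to any value (even an empty one) makes
// getenv return non-null, which is all the check above looks for.
static const bool disable_cuda_graphs_example = (std::getenv("LLAMACPP_DISABLE_CUDA_GRAPHS") != nullptr);

int main() {
    std::printf("CUDA graphs %s\n", disable_cuda_graphs_example ? "disabled via env var" : "enabled");
    return 0;
}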
@@ -2456,20 +2492,25 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
 
         if(cuda_graph.instance == nullptr) cuda_graph_update_required=true;
 
+        // Loop over nodes in GGML graph to determine if CUDA graph update is required
+        // and store properties to allow this comparison for the next token
+        for (int i = 0; i < cgraph->n_nodes; i++) {
+            bool has_matching_properties = true;
+            if(!cuda_graph_update_required) {
+                has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_graph.ggml_graph_properties[i]);
+            }
+            if(!has_matching_properties) cuda_graph_update_required = true;
+            set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_graph.ggml_graph_properties[i]);
+        }
+
         // Loop over nodes in GGML graph to obtain info needed for CUDA graph
         int k=0;
         for (int i = 0; i < cgraph->n_nodes; i++) {
             ggml_tensor * node = cgraph->nodes[i];
-            // Identify if the graph needs to be updated for this token due to the number of elements changing
-            // (identified by inspecting soft max op parameters)
             if(node->op == GGML_OP_SOFT_MAX) {
                 if(node->src[1]->ne[1] > 1){
                     use_cuda_graph = false; // disable CUDA graphs for batch size > 1 for now. TO DO investigate
                 }
-                if(node->src[0]->ne[0] != cuda_graph.softmax_ne0) {
-                    cuda_graph_update_required = true;
-                    cuda_graph.softmax_ne0 = node->src[0]->ne[0];
-                }
             }
             if(node->op == GGML_OP_CPY) {
                 // store the copy op parameter which changes with each token.
@@ -2480,6 +2521,15 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
                 }
             }
         }
+
+        // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
+        if(cuda_graph_update_required) {
+            cuda_graph.number_consecutive_updates++;
+        }
+        else {
+            cuda_graph.number_consecutive_updates = 0;
+        }
+        if (cuda_graph.number_consecutive_updates >= 4) cuda_graph.disable_due_to_too_many_updates = true;
     }
 
     if(use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
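The final hunk adds a simple hysteresis: each token that forces a graph update increments a streak counter, a token that does not resets it, and once the streak reaches 4 the backend gives up on CUDA graphs for the rest of the run (the flag is never cleared), since re-capturing the graph every token would cost more than it saves. A standalone sketch of that logic, with hypothetical names (GraphState, record_token) that are not part of llama.cpp:

#include <cstdio>

struct GraphState {
    int  consecutive_updates     = 0;
    bool disabled_due_to_updates = false;
};

static void record_token(GraphState & s, bool update_required, int threshold = 4) {
    if (update_required) {
        s.consecutive_updates++;
    } else {
        s.consecutive_updates = 0;          // a stable token resets the streak
    }
    if (s.consecutive_updates >= threshold) {
        s.disabled_due_to_updates = true;   // sticky: graphs stay disabled from the next token on
    }
}

int main() {
    GraphState s;
    const bool updates[] = { true, true, false, true, true, true, true };
    for (bool u : updates) {
        record_token(s, u);
        std::printf("update=%d streak=%d disabled=%d\n",
                    (int) u, s.consecutive_updates, (int) s.disabled_due_to_updates);
    }
    return 0;
}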