	further addressed comments
 ggml-cuda.cu | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -2460,7 +2460,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         int k=0;
         for (int i = 0; i < cgraph->n_nodes; i++) {
             ggml_tensor * node = cgraph->nodes[i];
-            // Identify if the graph needs updated for this token due to the number of elements changing
+            // Identify if the graph needs to be updated for this token due to the number of elements changing
             // (identified by inspecting soft max op parameters)
             if(node->op == GGML_OP_SOFT_MAX) {
                 if(node->src[1]->ne[1] > 1){
@@ -2489,10 +2489,10 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
 #else
     bool use_cuda_graph = false;
     bool cuda_graph_update_required = false;
-#endif
+#endif // USE_CUDA_GRAPH
 
-    // Only perfom the graph exection if CUDA graphs are not enebled, or we are capturing the graph.
-    // With use of CUDA graphs, the execution will be performed by the graph launch.
+    // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
+    // With the use of CUDA graphs, the execution will be performed by the graph launch.
     if(!use_cuda_graph || cuda_graph_update_required) {
         //temporarily avoid indenting here to make code review easier
     for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -2519,7 +2519,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
     }
     }
 
-    #ifdef USE_CUDA_GRAPH
+#ifdef USE_CUDA_GRAPH
     if(use_cuda_graph && (cuda_graph_update_required)) { // End CUDA graph capture
         CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_graph.graph));
     }
@@ -2541,7 +2541,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
             // Subsequent call with non-null argument gets nodes
             CUDA_CHECK(cudaGraphGetNodes(cuda_graph.graph, cuda_graph.nodes, &cuda_graph.num_nodes));
 
-            // Loop over nodes, and extract kernel parameters fro each node
+            // Loop over nodes, and extract kernel parameters from each node
             for(size_t i=0; i<cuda_graph.num_nodes; i++) {
                 cudaGraphNodeType node_type;
                 CUDA_CHECK(cudaGraphNodeGetType(cuda_graph.nodes[i], &node_type));
@@ -2588,7 +2588,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         CUDA_CHECK(cudaGraphLaunch(cuda_graph.instance, cuda_ctx->stream()));
     }
     cuda_graph.count++;
-#endif
+#endif // USE_CUDA_GRAPH
     return GGML_STATUS_SUCCESS;
 }
 
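The #ifdef USE_CUDA_GRAPH path above follows CUDA's standard capture/instantiate/launch lifecycle: work enqueued on the stream is recorded into a graph once, then replayed with a single launch per token. Below is a minimal, self-contained sketch of that lifecycle, not the llama.cpp implementation itself; dummy_kernel and the CHECK macro are placeholders standing in for real compute kernels and llama.cpp's CUDA_CHECK.

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Minimal error-check macro, standing in for llama.cpp's CUDA_CHECK.
#define CHECK(call) do { cudaError_t e_ = (call); if (e_ != cudaSuccess) { \
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(e_)); exit(1); } } while (0)

__global__ void dummy_kernel(float * x) { x[threadIdx.x] *= 2.0f; }

int main() {
    float * d_x;
    CHECK(cudaMalloc(&d_x, 32 * sizeof(float)));

    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // 1. Capture: kernels launched on the stream are recorded, not run.
    cudaGraph_t graph;
    CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
    dummy_kernel<<<1, 32, 0, stream>>>(d_x);   // recorded into the graph
    CHECK(cudaStreamEndCapture(stream, &graph));

    // 2. Instantiate once (CUDA 12 signature; older toolkits take
    //    (&instance, graph, NULL, NULL, 0) instead of a flags argument).
    cudaGraphExec_t instance;
    CHECK(cudaGraphInstantiate(&instance, graph, 0));

    // 3. Replay cheaply on subsequent iterations/tokens.
    for (int token = 0; token < 4; token++) {
        CHECK(cudaGraphLaunch(instance, stream));
    }
    CHECK(cudaStreamSynchronize(stream));

    CHECK(cudaGraphExecDestroy(instance));
    CHECK(cudaGraphDestroy(graph));
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(d_x));
    return 0;
}

The one-off instantiation cost is amortized across every subsequent cudaGraphLaunch, which replaces per-kernel launch overhead with a single submission.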
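The cudaGraphGetNodes hunk relies on the runtime API's two-step query pattern: a first call with a null array returns only the node count, and a second call fills in the handles, after which each kernel node's launch parameters can be read and, if needed, patched into the instantiated graph without a re-capture. A hedged sketch of that pattern follows; update_kernel_params is a hypothetical helper name (not from llama.cpp), and CHECK is the same minimal macro as in the previous sketch.

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>
#include <vector>

#ifndef CHECK  // same minimal error macro as the previous sketch
#define CHECK(call) do { cudaError_t e_ = (call); if (e_ != cudaSuccess) { \
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(e_)); exit(1); } } while (0)
#endif

// Hypothetical helper: walk an already-captured graph and re-apply each
// kernel node's parameters to the instantiated executable graph.
void update_kernel_params(cudaGraph_t graph, cudaGraphExec_t instance) {
    // First call with a null array: query only the node count.
    size_t num_nodes = 0;
    CHECK(cudaGraphGetNodes(graph, nullptr, &num_nodes));

    // Second call with a buffer: retrieve the node handles themselves.
    std::vector<cudaGraphNode_t> nodes(num_nodes);
    CHECK(cudaGraphGetNodes(graph, nodes.data(), &num_nodes));

    for (size_t i = 0; i < num_nodes; i++) {
        cudaGraphNodeType node_type;
        CHECK(cudaGraphNodeGetType(nodes[i], &node_type));
        if (node_type == cudaGraphNodeTypeKernel) {
            // Recorded launch configuration: grid/block dims and the
            // kernel argument pointer array.
            cudaKernelNodeParams params;
            CHECK(cudaGraphKernelNodeGetParams(nodes[i], &params));

            // After mutating params.kernelParams (e.g. swapping a tensor
            // pointer), patch the executable graph in place -- cheaper
            // than re-capturing and re-instantiating.
            CHECK(cudaGraphExecKernelNodeSetParams(instance, nodes[i], &params));
        }
    }
}

This patch-in-place path is what makes graphs pay off for token-by-token decoding: when the graph topology is stable across tokens, only pointer-sized kernel arguments need updating between launches.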
Author: Alan Gray