DRAFT: Introduction of CUDA Graphs to llama.cpp
ggml-cuda.cu | 112 lines changed
@@ -2405,11 +2405,63 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
     GGML_UNUSED(backend);
 }
 
+struct ggml_cudaGraph {
+    int count = 0;
+    cudaGraph_t graph = nullptr;
+    cudaGraphExec_t instance = nullptr;
+    size_t numNodes = 0;
+    int softmax_ne0 = 0;
+};
+
 GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     ggml_cuda_set_device(cuda_ctx->device);
 
+    // Objects required for CUDA Graph
+#define MAX_NODES_IN_CUDA_GRAPH 10000
+    static ggml_cudaGraph cudaGraph; // TODO: move this to a suitable persistent location (and avoid use of static memory)
+    bool useCudaGraph = (cudaGraph.count >= 2); // avoid CUDA graphs for the first 2 steps due to incompatible initialisations
+    char ** updatedKernelArg[MAX_NODES_IN_CUDA_GRAPH];
+    bool cudaGraphUpdateRequired = false;
+    // pointer to the CUDA cpy kernel, which is required to identify
+    // kernel parameters which need to be updated in the graph for each token
+    void * ggmlCudaCpyFn = nullptr;
+    if (useCudaGraph) {
+
+        if (cudaGraph.instance == nullptr) cudaGraphUpdateRequired = true;
+
+        // Loop over the nodes in the GGML graph to obtain the info needed for the CUDA graph
+        int k = 0;
+        for (int i = 0; i < cgraph->n_nodes; i++) {
+            ggml_tensor * node = cgraph->nodes[i];
+            // Identify if the graph needs to be updated for this token due to the number of elements changing
+            // (identified by inspecting soft max op parameters)
+            if (node->op == GGML_OP_SOFT_MAX) {
+                if (node->src[0]->ne[0] != cudaGraph.softmax_ne0) {
+                    cudaGraphUpdateRequired = true;
+                    cudaGraph.softmax_ne0 = node->src[0]->ne[0];
+                }
+            }
+            if (node->op == GGML_OP_CPY) {
+                // store the copy op parameter which changes with each token
+                updatedKernelArg[k++] = (char **) &(node->src[1]->data);
+                if (ggmlCudaCpyFn == nullptr) {
+                    // store a pointer to the copy op CUDA kernel to identify it later
+                    ggmlCudaCpyFn = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
+                }
+            }
+        }
+    }
+
+    if (useCudaGraph && cudaGraphUpdateRequired) { // Start CUDA graph capture
+        CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeGlobal));
+    }
+
+    // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
+    // With CUDA graphs in use, the execution is instead performed by the graph launch below.
+    if (!useCudaGraph || cudaGraphUpdateRequired) {
+        // temporarily avoid indenting here to make code review easier
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
@@ -2432,7 +2484,67 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         }
         GGML_ASSERT(ok);
     }
+    }
+
+    if (useCudaGraph && cudaGraphUpdateRequired) { // End CUDA graph capture
+        CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cudaGraph.graph));
+    }
+    if (useCudaGraph) {
+
+        if (cudaGraph.instance == nullptr) { // Create an executable graph from the captured graph.
+            CUDA_CHECK(cudaGraphInstantiate(&cudaGraph.instance, cudaGraph.graph, NULL, NULL, 0));
+        }
+
+        // Perform an update to the graph (if required for this token), and change the copy parameter (required for every token)
+
+        cudaGraphNode_t nodes[MAX_NODES_IN_CUDA_GRAPH];
+        CUDA_KERNEL_NODE_PARAMS_v2 paramsDriver[MAX_NODES_IN_CUDA_GRAPH];
+        cudaKernelNodeParams paramsRuntime[MAX_NODES_IN_CUDA_GRAPH];
+
+        if (cudaGraphUpdateRequired) {
+            // Extract the nodes from the graph
+            if (cudaGraph.numNodes == 0) {
+                CUDA_CHECK(cudaGraphGetNodes(cudaGraph.graph, nullptr, &cudaGraph.numNodes));
+            }
+            CUDA_CHECK(cudaGraphGetNodes(cudaGraph.graph, nodes, &cudaGraph.numNodes));
+
+            // Loop over the nodes, and extract the kernel parameters from each node
+            for (size_t i = 0; i < cudaGraph.numNodes; i++) {
+                // We currently get a set of params using both the driver and the runtime, to work around an issue (see below)
+                CU_CHECK(cuGraphKernelNodeGetParams(nodes[i], &paramsDriver[i])); // Get params using driver
+                cudaError_t statRT = cudaGraphKernelNodeGetParams(nodes[i], &paramsRuntime[i]); // Get params using runtime
+                if (statRT == cudaErrorInvalidDeviceFunction) {
+                    // Fails due to incorrect handling by the CUDA runtime of the CUDA BLAS node.
+                    // We don't need to update BLAS nodes, so clear the error and move on.
+                    cudaGetLastError();
+                }
+            }
+        }
+
+        // Update the copy kernel param (required every token).
+        // Currently uses the runtime copy of the params to identify the copy function node,
+        // and the driver copy of the params to perform the update.
+        // TODO: work out how to do it using only the runtime copy.
+        if (!cudaGraphUpdateRequired) { // on update steps, the live parameters will already be captured
+            int k = 0;
+            for (size_t i = 0; i < cudaGraph.numNodes; i++) {
+                if (paramsRuntime[i].func == ggmlCudaCpyFn) {
+                    char ** updatedKernelArgPointer = updatedKernelArg[k++];
+                    paramsDriver[i].kernelParams[1] = updatedKernelArgPointer;
+                    CU_CHECK(cuGraphKernelNodeSetParams(nodes[i], &paramsDriver[i]));
+                }
+            }
+        }
+
+        // Update the graph executable
+        cudaGraphExecUpdateResultInfo resultInfo;
+        CUDA_CHECK(cudaGraphExecUpdate(cudaGraph.instance, cudaGraph.graph, &resultInfo));
+
+        // Launch the graph
+        CUDA_CHECK(cudaGraphLaunch(cudaGraph.instance, cuda_ctx->stream()));
+    }
+    cudaGraph.count++;
     return GGML_STATUS_SUCCESS;
 }
 
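Reviewer note: the two hunks above follow the standard CUDA Graphs capture/instantiate/update/launch cycle. For context, here is a minimal, self-contained sketch of that cycle. It is not part of this commit: the scale kernel and problem size are placeholders, and it assumes CUDA 12 (the three-argument cudaGraphInstantiate and cudaGraphExecUpdateResultInfo).

// Minimal sketch of the CUDA Graphs capture/instantiate/update/launch cycle.
// Hypothetical kernel and sizes; CUDA >= 12 assumed.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CUDA_CHECK(call) do { cudaError_t err_ = (call); if (err_ != cudaSuccess) { \
    fprintf(stderr, "CUDA error %s at %s:%d\n", cudaGetErrorString(err_), __FILE__, __LINE__); exit(1); } } while (0)

__global__ void scale(float * x, float s, int n) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n) x[i] *= s;
}

int main() {
    const int n = 1 << 20;
    float * d_x;
    CUDA_CHECK(cudaMalloc(&d_x, n*sizeof(float)));

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    cudaGraph_t graph = nullptr;
    cudaGraphExec_t instance = nullptr;

    for (int step = 0; step < 10; step++) {
        // Capture the stream work into a graph instead of executing it.
        CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
        scale<<<(n + 255)/256, 256, 0, stream>>>(d_x, 1.001f, n);
        CUDA_CHECK(cudaStreamEndCapture(stream, &graph));

        if (instance == nullptr) {
            // First step: build the executable graph.
            CUDA_CHECK(cudaGraphInstantiate(&instance, graph, 0));
        } else {
            // Later steps: cheap in-place update of the executable graph.
            cudaGraphExecUpdateResultInfo info;
            CUDA_CHECK(cudaGraphExecUpdate(instance, graph, &info));
        }
        CUDA_CHECK(cudaGraphDestroy(graph));

        // Replay all captured work with a single launch.
        CUDA_CHECK(cudaGraphLaunch(instance, stream));
    }
    CUDA_CHECK(cudaStreamSynchronize(stream));
    CUDA_CHECK(cudaGraphExecDestroy(instance));
    CUDA_CHECK(cudaFree(d_x));
    return 0;
}

The payoff during token generation is that the per-op kernel launch overhead of the node loop is replaced by a single cudaGraphLaunch per token; the commit additionally patches individual kernel arguments in place (the copy op's destination pointer, below) so that a full re-capture is only needed when tensor shapes change.
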
@@ -459,3 +459,32 @@ void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     ggml_cuda_cpy(ctx, src0, dst);
 }
+
+void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
+    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
+        return (void*) cpy_f32_f16<cpy_1_f32_f32>;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
+        return (void*) cpy_f32_f16<cpy_1_f32_f16>;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
+        return (void*) cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
+        return (void*) cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
+        return (void*) cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
+        return (void*) cpy_f32_q<cpy_blck_f32_q5_0, QK5_0>;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
+        return (void*) cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL>;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
+        return (void*) cpy_f32_q<cpy_blck_f32_q5_1, QK5_1>;
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
+        return (void*) cpy_f32_f16<cpy_1_f16_f16>; // must match the kernel launched by ggml_cpy_f16_f16_cuda
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
+        return (void*) cpy_f32_f16<cpy_1_f16_f32>;
+    } else {
+        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
+                ggml_type_name(src0->type), ggml_type_name(src1->type));
+        GGML_ASSERT(false);
+    }
+}
+
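Reviewer note: ggml_cuda_cpy_fn exposes the device function pointer of the copy kernel so the corresponding node can be located in the captured graph and its destination argument patched for each token. The following sketch of that find-and-patch pattern uses only the runtime API (in the spirit of the TODO above); it is illustrative rather than a drop-in replacement, since the commit reports needing the driver-API copy of the params for the actual update, and copy_kernel/patch_copy_dst are hypothetical names.

// Sketch: locate kernel nodes in a captured graph by function pointer and patch one argument.
// Hypothetical kernel; the commit matches against ggml_cuda_cpy_fn(...) instead.
#include <cuda_runtime.h>
#include <vector>

__global__ void copy_kernel(const char * src, char * dst, int n) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n) dst[i] = src[i];
}

// Point kernel argument 1 (dst) of every copy_kernel node at the pointer stored in *new_dst_slot.
static void patch_copy_dst(cudaGraph_t graph, char ** new_dst_slot) {
    size_t num_nodes = 0;
    cudaGraphGetNodes(graph, nullptr, &num_nodes);      // first call: query the node count
    std::vector<cudaGraphNode_t> nodes(num_nodes);
    cudaGraphGetNodes(graph, nodes.data(), &num_nodes); // second call: fill the array

    for (size_t i = 0; i < num_nodes; i++) {
        cudaKernelNodeParams params = {};
        if (cudaGraphKernelNodeGetParams(nodes[i], &params) != cudaSuccess) {
            cudaGetLastError(); // non-kernel nodes (e.g. BLAS): clear the error and skip
            continue;
        }
        if (params.func == (void *) copy_kernel) {
            params.kernelParams[1] = (void *) new_dst_slot; // swap in the new dst pointer
            cudaGraphKernelNodeSetParams(nodes[i], &params);
        }
    }
}

After patching nodes in the captured graph, the executable instance still has to be refreshed with cudaGraphExecUpdate (or patched directly via cudaGraphExecKernelNodeSetParams) before the next cudaGraphLaunch, as done in the compute hunk above.
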
@@ -5,3 +5,5 @@
 void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1);
 
 void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1);
Alan Gray