	cuda : Fix Gemma3n not executed as CUDA_GRAPH on NVGPUs (#14741)
* Fix Gemma3n not executed as CUDA_GRAPH on NVGPUs

  Gemma3n uses matrix-matrix addition as part of its input processing, wrongly triggering CUDA_GRAPH disablement on NVGPUs even when a batch size of 1 is used.

* Exclude `project_per_layer_input` by matching node names

  This ensures that all other graphs which don't exhibit this pattern do not have their behavior changed.

* Revert unnecessary formatting changes
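For context, the strings matched by the patch are the names attached to the graph nodes when the model graph is built. A minimal sketch of that naming pattern, assuming the usual `ggml_set_name` mechanism (the real call sites are in `src/llama-model.cpp`, linked from the diff below):

```cpp
#include "ggml.h"

// Sketch (assumed pattern, not the actual llama.cpp code): ggml_set_name
// stores the string in tensor->name, which the CUDA backend later compares
// against when deciding whether CUDA graphs can stay enabled.
static void name_gemma3n_per_layer_nodes(ggml_tensor * inp_per_layer_selected,
                                         ggml_tensor * per_layer_proj) {
    ggml_set_name(inp_per_layer_selected, "inp_per_layer_selected");
    ggml_set_name(per_layer_proj,         "per_layer_proj");
}
```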
```diff
@@ -2590,6 +2590,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
     cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
 
+    const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected";
+    const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj";
+
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
@@ -2611,9 +2614,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
 #endif
         }
 
-        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
-            // disable CUDA graphs for batch size > 1 for now.
-            // Changes in batch size or context size can cause changes to the grid size of some kernels.
+        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true)) {
+            // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation
+            // by means of matching node names. See
+            // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
+            // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773,
+            // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
             use_cuda_graph = false;
 #ifndef NDEBUG
             GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
```
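Read plainly, and noting that `node->src[1]` is already guaranteed non-null by the second clause (which makes its ternary redundant), the patched condition is equivalent to the restatement below. This is a readability sketch, not part of the commit:

```cpp
#include <string>
#include "ggml.h"

// Sketch: the patched check, restated with equivalent logic (not the actual diff).
// An ADD node with src1->ne[1] > 1 still disables CUDA graphs unless either
// operand carries one of the Gemma3n per-layer-projection node names.
static bool add_node_disables_cuda_graph(const ggml_tensor * node) {
    static const std::string src0_name = "inp_per_layer_selected";
    static const std::string src1_name = "per_layer_proj";

    if (node->op != GGML_OP_ADD || node->src[1] == nullptr || node->src[1]->ne[1] <= 1) {
        return false; // the batch-size heuristic does not apply to this node
    }
    const bool is_gemma3n_per_layer_proj =
        (node->src[0] != nullptr && node->src[0]->name == src0_name) ||
        node->src[1]->name == src1_name;
    return !is_gemma3n_per_layer_proj;
}
```

Matching by node name keeps the change local: only graphs that actually contain these Gemma3n nodes are affected, at the cost of silently losing the exclusion if the node names ever change upstream.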
Oliver Simons