mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	ggml : disable GGML_TASK_INIT and GGML_TASK_FINALIZE by default (#1995)
Will not be scheduled unless explicitly enabled.
This commit is contained in:
		
							
								
								
									
										61
									
								
								ggml.c
									
									
									
									
									
								
							
							
						
						
									
										61
									
								
								ggml.c
									
									
									
									
									
								
							| @@ -3846,6 +3846,40 @@ static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64"); | |||||||
| static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); | static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); | ||||||
| static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); | static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); | ||||||
|  |  | ||||||
|  | // WARN: | ||||||
|  | // Mis-configuration can lead to problems that are hard to reason about: | ||||||
|  | // * At best  it crashes or talks nonsense. | ||||||
|  | // * At worst it talks slightly differently, which is hard to perceive. | ||||||
|  | // | ||||||
|  | // An op has to enable INIT or FINALIZE when any of its branches needs that pass. | ||||||
|  | // Take care about compile options (e.g., GGML_USE_xxx). | ||||||
|  | static bool GGML_OP_HAS_INIT    [GGML_OP_COUNT] = { 0 }; | ||||||
|  | static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 }; | ||||||
|  | static void ggml_setup_op_has_task_pass(void) { | ||||||
|  |     {   // INIT | ||||||
|  |         bool * I = GGML_OP_HAS_INIT; | ||||||
|  |  | ||||||
|  |         I[GGML_OP_ACC                    ] = true; | ||||||
|  |         I[GGML_OP_MUL_MAT                ] = true; | ||||||
|  |         I[GGML_OP_OUT_PROD               ] = true; | ||||||
|  |         I[GGML_OP_SET                    ] = true; | ||||||
|  |         I[GGML_OP_GET_ROWS_BACK          ] = true; | ||||||
|  |         I[GGML_OP_DIAG_MASK_INF          ] = true; | ||||||
|  |         I[GGML_OP_DIAG_MASK_ZERO         ] = true; | ||||||
|  |         I[GGML_OP_CONV_1D_S1_PH          ] = true; | ||||||
|  |         I[GGML_OP_CONV_1D_S2_PH          ] = true; | ||||||
|  |         I[GGML_OP_CONV_2D_SK_P0          ] = true; | ||||||
|  |         I[GGML_OP_FLASH_ATTN_BACK        ] = true; | ||||||
|  |         I[GGML_OP_CROSS_ENTROPY_LOSS     ] = true; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     {   // FINALIZE | ||||||
|  |         bool * F = GGML_OP_HAS_FINALIZE; | ||||||
|  |  | ||||||
|  |         F[GGML_OP_CROSS_ENTROPY_LOSS     ] = true; | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
| // | // | ||||||
| // ggml context | // ggml context | ||||||
| // | // | ||||||
| @@ -4267,6 +4301,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { | |||||||
|         ggml_cl_init(); |         ggml_cl_init(); | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  |         ggml_setup_op_has_task_pass(); | ||||||
|  |  | ||||||
|         is_first_call = false; |         is_first_call = false; | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -16791,9 +16827,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { | |||||||
|             if (node_n != -1) { |             if (node_n != -1) { | ||||||
|                 /* FINALIZE */ |                 /* FINALIZE */ | ||||||
|                 struct ggml_tensor * node = state->shared->cgraph->nodes[node_n]; |                 struct ggml_tensor * node = state->shared->cgraph->nodes[node_n]; | ||||||
|                 params.nth = node->n_tasks; |                 if (GGML_OP_HAS_FINALIZE[node->op]) { | ||||||
|                 ggml_compute_forward(¶ms, node); |                     params.nth = node->n_tasks; | ||||||
|                 ggml_graph_compute_perf_stats_node(node, state->shared); |                     ggml_compute_forward(¶ms, node); | ||||||
|  |                     ggml_graph_compute_perf_stats_node(node, state->shared); | ||||||
|  |                 } | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             // distribute new work or execute it direct if 1T |             // distribute new work or execute it direct if 1T | ||||||
| @@ -16805,10 +16843,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { | |||||||
|                 state->shared->perf_node_start_cycles  = ggml_perf_cycles(); |                 state->shared->perf_node_start_cycles  = ggml_perf_cycles(); | ||||||
|                 state->shared->perf_node_start_time_us = ggml_perf_time_us(); |                 state->shared->perf_node_start_time_us = ggml_perf_time_us(); | ||||||
|  |  | ||||||
|  |                 params.nth = node->n_tasks; | ||||||
|  |  | ||||||
|                 /* INIT */ |                 /* INIT */ | ||||||
|                 params.type = GGML_TASK_INIT; |                 if (GGML_OP_HAS_INIT[node->op]) { | ||||||
|                 params.nth  = node->n_tasks; |                     params.type = GGML_TASK_INIT; | ||||||
|                 ggml_compute_forward(¶ms, node); |                     ggml_compute_forward(¶ms, node); | ||||||
|  |                 } | ||||||
|  |  | ||||||
|                 if (node->n_tasks == 1) { |                 if (node->n_tasks == 1) { | ||||||
|                     // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, |                     // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, | ||||||
| @@ -16816,9 +16857,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { | |||||||
|                     params.type = GGML_TASK_COMPUTE; |                     params.type = GGML_TASK_COMPUTE; | ||||||
|                     ggml_compute_forward(¶ms, node); |                     ggml_compute_forward(¶ms, node); | ||||||
|  |  | ||||||
|                     params.type = GGML_TASK_FINALIZE; |                     if (GGML_OP_HAS_FINALIZE[node->op]) { | ||||||
|                     ggml_compute_forward(¶ms, node); |                         params.type = GGML_TASK_FINALIZE; | ||||||
|                     ggml_graph_compute_perf_stats_node(node, state->shared); |                         ggml_compute_forward(¶ms, node); | ||||||
|  |                         ggml_graph_compute_perf_stats_node(node, state->shared); | ||||||
|  |                     } | ||||||
|                 } else { |                 } else { | ||||||
|                     break; |                     break; | ||||||
|                 } |                 } | ||||||
|   | |||||||
							
								
								
									
										3
									
								
								ggml.h
									
									
									
									
									
								
							
							
						
						
									
										3
									
								
								ggml.h
									
									
									
									
									
								
							| @@ -444,6 +444,9 @@ extern "C" { | |||||||
|  |  | ||||||
|  |  | ||||||
|     // compute types |     // compute types | ||||||
|  |  | ||||||
|  |     // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled. | ||||||
|  |     // This behavior was changed in https://github.com/ggerganov/llama.cpp/pull/1995. | ||||||
|     enum ggml_task_type { |     enum ggml_task_type { | ||||||
|         GGML_TASK_INIT = 0, |         GGML_TASK_INIT = 0, | ||||||
|         GGML_TASK_COMPUTE, |         GGML_TASK_COMPUTE, | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Qingyou Meng
					Qingyou Meng