Mirror of https://github.com/ggml-org/llama.cpp.git, synced 2025-10-30 08:42:00 +00:00
	ggml : do not sched_yield when calling BLAS (#4761)
* ggml : do not sched_yield when calling BLAS

  ggml-ci

* ggml : fix do_yield logic

  ggml-ci

* ggml : simplify do_yield logic

  ggml-ci
ggml.c | 39 changed lines
diff --git a/ggml.c b/ggml.c
--- a/ggml.c
+++ b/ggml.c
@@ -9704,10 +9704,10 @@ static void ggml_compute_forward_group_norm(
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
 // helper function to determine if it is better to use BLAS or not
 // for large matrices, BLAS is faster
-static bool ggml_compute_forward_mul_mat_use_blas(
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
+static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
     //const int64_t ne00 = src0->ne[0];
     //const int64_t ne01 = src0->ne[1];
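The refactor above relies on ggml's convention that an op node stores pointers to its operands in dst->src[], so the helper can recover src0 and src1 on its own. A minimal sketch of the idea with simplified stand-in types (tensor, use_blas are not the ggml API, and the size threshold is illustrative, not ggml's exact heuristic):

    #include <stdbool.h>

    // stand-in for ggml_tensor: an op node keeps pointers to its operands
    struct tensor {
        struct tensor * src[2]; // filled in when the op is recorded in the graph
        long long       ne[4];  // dimensions
    };

    // after the refactor the helper takes only the destination node and
    // recovers its operands itself, so every call site shrinks to one argument
    static bool use_blas(const struct tensor * dst) {
        const struct tensor * src0 = dst->src[0];
        const struct tensor * src1 = dst->src[1];

        // illustrative threshold: BLAS pays off only for large matrices
        return src0->ne[1] >= 32 && src1->ne[1] >= 32 && dst->ne[0] >= 32;
    }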
@@ -9787,7 +9787,7 @@ static void ggml_compute_forward_mul_mat(
 #endif
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
+    if (ggml_compute_forward_mul_mat_use_blas(dst)) {
         if (params->ith != 0) {
             return;
         }
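In the call site above, params->ith is the calling thread's index: when the BLAS path is chosen, every thread except thread 0 returns immediately and the GEMM runs on a single thread, since the BLAS library manages its own parallelism. A hedged sketch of that dispatch shape, reusing the stand-in types from the previous sketch (compute_params and forward_mul_mat are hypothetical names):

    // hypothetical stand-in for ggml's compute params
    struct compute_params {
        int ith; // index of the calling thread
        int nth; // total number of threads
    };

    static void forward_mul_mat(const struct compute_params * params, struct tensor * dst) {
        if (use_blas(dst)) {
            if (params->ith != 0) {
                return; // workers bail out; the BLAS library threads internally
            }
            // ... thread 0 calls the GEMM (e.g. cblas_sgemm) here ...
            return;
        }
        // ... otherwise fall through to the hand-written kernel, which
        //     splits the rows of dst across all params->nth threads ...
    }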
@@ -16301,24 +16301,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
 
                 //n_tasks = MIN(n_threads, MAX(1, nr0/128));
                 //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
-
-#if defined(GGML_USE_CUBLAS)
-                if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
-                }
-#elif defined(GGML_USE_CLBLAST)
-                if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
-                }
-#endif
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
-                }
-#endif
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
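The three deleted branches pinned n_tasks to 1 whenever a GPU or BLAS backend would take over the mat-mul, but, as their own TODO comments admit, the remaining threads kept spinning in the wait loop regardless, so the override saved no CPU time. Removing them leaves the problem to the do_yield logic introduced below (see the spin-wait sketch after the yield hunk).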
@@ -16491,6 +16473,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             state->shared->node_n += 1;
             return (thread_ret_t) GGML_EXIT_ABORTED;
         }
+
         if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
             // all other threads are finished and spinning
             // do finalize and init here so we don't have synchronize again
@@ -16556,14 +16539,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         } else {
             // wait for other threads to finish
             const int last = node_n;
+
+            const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
+
             while (true) {
                 // TODO: this sched_yield can have significant impact on the performance - either positive or negative
                 //       depending on the workload and the operating system.
                 //       since it is not clear what is the best approach, it should potentially become user-configurable
                 //       ref: https://github.com/ggerganov/ggml/issues/291
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                sched_yield();
-#endif
+                // UPD:  adding the do_yield flag seems to resolve the issue universally
+                if (do_yield) {
+                    sched_yield();
+                }
 
                 node_n = atomic_load(&state->shared->node_n);
                 if (node_n != last) break;
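This hunk is the heart of the commit: instead of compiling sched_yield() in only for BLAS builds, every build now yields, but only while waiting on a node whose op is GGML_OP_MUL_MAT (or before the first node, when last < 0), since that is where one thread may hold the others up for a long time; for short ops a pure busy-wait keeps wake-up latency minimal. A self-contained sketch of the pattern with simplified types (the real logic lives in ggml_graph_compute_thread):

    #include <stdatomic.h>
    #include <sched.h>   // sched_yield(), POSIX
    #include <stdbool.h>

    enum op { OP_NONE, OP_ADD, OP_MUL_MAT }; // toy op tags

    // spin until the shared node counter moves past `last`; yield the CPU
    // only while waiting on a node expected to run long on a single thread
    static void wait_for_next_node(atomic_int * node_n, int last, enum op last_op) {
        const bool do_yield = last < 0 || last_op == OP_MUL_MAT;

        while (atomic_load(node_n) == last) {
            if (do_yield) {
                sched_yield(); // long wait expected: give the core away
            }
            // short ops: keep busy-waiting for the lowest wake-up latency
        }
    }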
@@ -16642,7 +16629,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                     } else
 #endif
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                    if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
+                    if (ggml_compute_forward_mul_mat_use_blas(node)) {
                         if (node->src[0]->type != GGML_TYPE_F32) {
                             // here we need memory just for single 2D matrix from src0
                             cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
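The size computed above is the scratch space ggml_graph_plan reserves so a single 2D slice of a non-F32 src0 can be converted to F32 before being handed to BLAS. A worked example of the arithmetic, with illustrative dimensions:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        // illustrative shape for one 2D slice of src0 (e.g. an F16 weight matrix)
        const int64_t ne00 = 4096; // rows
        const int64_t ne01 = 4096; // cols

        // ggml_type_size(GGML_TYPE_F32) == sizeof(float) == 4 bytes
        const size_t cur = sizeof(float) * (size_t)(ne00 * ne01);

        printf("scratch for F32 conversion: %zu bytes (%.0f MiB)\n",
               cur, cur / (1024.0 * 1024.0)); // 67108864 bytes = 64 MiB
        return 0;
    }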
Author: Georgi Gerganov