mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-31 08:51:55 +00:00 
			
		
		
		
	ggml : parallelize FP32 conversion when using BLAS (#5045)
* allow the GGML_TASK_INIT phase to run in multiple threads * multithreaded dequantize in mul_mat when using a BLAS library * minor fixes * update outdated comment * fix coding style * simplify code Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
		
							
								
								
									
										178
									
								
								ggml.c
									
									
									
									
									
								
							
							
						
						
									
										178
									
								
								ggml.c
									
									
									
									
									
								
							| @@ -7810,6 +7810,9 @@ static void ggml_compute_forward_acc_f32( | ||||
|     bool   inplace = (bool) ((int32_t *) dst->op_params)[4]; | ||||
|  | ||||
|     if (!inplace && (params->type == GGML_TASK_INIT)) { | ||||
|         if (params->ith != 0) { | ||||
|             return; | ||||
|         } | ||||
|         // memcpy needs to be synchronized across threads to avoid race conditions. | ||||
|         // => do it in INIT phase | ||||
|         memcpy( | ||||
| @@ -9952,11 +9955,30 @@ static void ggml_compute_forward_mul_mat( | ||||
|  | ||||
| #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) | ||||
|     if (ggml_compute_forward_mul_mat_use_blas(dst)) { | ||||
|         if (params->ith != 0) { | ||||
|             return; | ||||
|         } | ||||
|         const int64_t ne_plane      = ne01*ne00; | ||||
|         const int64_t desired_wsize = ne13*ne12*ne_plane*sizeof(float); | ||||
|         UNUSED(desired_wsize); | ||||
|  | ||||
|         if (params->type == GGML_TASK_INIT) { | ||||
|             if (type != GGML_TYPE_F32) { | ||||
|                 assert(params->wsize >= desired_wsize); | ||||
|                 // parallelize by src0 rows | ||||
|                 for (int64_t i13 = 0; i13 < ne13; i13++) { | ||||
|                     for (int64_t i12 = 0; i12 < ne12; i12++) { | ||||
|                         // broadcast src0 into src1 across 2nd,3rd dimension | ||||
|                         const int64_t i03 = i13/r3; | ||||
|                         const int64_t i02 = i12/r2; | ||||
|  | ||||
|                         const void           *       x        = (char *)  src0->data    + i02*nb02          + i03*nb03; | ||||
|                               float          * const wdata    = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane; | ||||
|                               ggml_to_float_t  const to_float = type_traits[type].to_float; | ||||
|  | ||||
|                         for (int64_t i01 = ith; i01 < ne01; i01 += nth) { | ||||
|                             to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00); | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|             return; | ||||
|         } | ||||
|  | ||||
| @@ -9964,9 +9986,14 @@ static void ggml_compute_forward_mul_mat( | ||||
|             return; | ||||
|         } | ||||
|  | ||||
|         // perform sgemm, parallelization controlled by blas lib | ||||
|         if (ith != 0) { | ||||
|             return; | ||||
|         } | ||||
|  | ||||
|         const int64_t tgemm0 = ggml_perf_time_us(); | ||||
|         for (int64_t i13 = 0; i13 < ne13; i13++) { | ||||
|             for (int64_t i12 = 0; i12 < ne12; i12++) { | ||||
|                 // broadcast src0 into src1 across 2nd,3rd dimension | ||||
|                 const int64_t i03 = i13/r3; | ||||
|                 const int64_t i02 = i12/r2; | ||||
|  | ||||
| @@ -9975,17 +10002,7 @@ static void ggml_compute_forward_mul_mat( | ||||
|                       float * d = (float *) ((char *)  dst->data + i12*nb2  + i13*nb3); | ||||
|  | ||||
|                 if (type != GGML_TYPE_F32) { | ||||
|                             float * const wdata    = params->wdata; | ||||
|                     ggml_to_float_t const to_float = type_traits[type].to_float; | ||||
|  | ||||
|                     size_t id = 0; | ||||
|                     for (int64_t i01 = 0; i01 < ne01; ++i01) { | ||||
|                         to_float((const char *) x + i01*nb01, wdata + id, ne00); | ||||
|                         id += ne00; | ||||
|                     } | ||||
|  | ||||
|                     assert(id*sizeof(float) <= params->wsize); | ||||
|                     x = wdata; | ||||
|                     x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane; | ||||
|                 } | ||||
|  | ||||
|                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, | ||||
| @@ -9995,6 +10012,7 @@ static void ggml_compute_forward_mul_mat( | ||||
|                          0.0f,    d, ne01); | ||||
|             } | ||||
|         } | ||||
|         //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2); | ||||
|  | ||||
|         //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); | ||||
|  | ||||
| @@ -10003,6 +10021,9 @@ static void ggml_compute_forward_mul_mat( | ||||
| #endif | ||||
|  | ||||
|     if (params->type == GGML_TASK_INIT) { | ||||
|         if (ith != 0) { | ||||
|             return; | ||||
|         } | ||||
|         if (src1->type != vec_dot_type) { | ||||
|             char * wdata = params->wdata; | ||||
|             const size_t row_size = ggml_row_size(vec_dot_type, ne10); | ||||
| @@ -10167,6 +10188,9 @@ static void ggml_compute_forward_mul_mat_id( | ||||
|     #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)] | ||||
|  | ||||
|    if (params->type == GGML_TASK_INIT) { | ||||
|         if (ith != 0) { | ||||
|             return; | ||||
|         } | ||||
|         char * wdata = params->wdata; | ||||
|         if (src1->type != vec_dot_type) { | ||||
|             const size_t row_size = ggml_row_size(vec_dot_type, ne10); | ||||
| @@ -10352,6 +10376,9 @@ static void ggml_compute_forward_out_prod_f32( | ||||
|             return; | ||||
|         } | ||||
| #endif | ||||
|         if (ith != 0) { | ||||
|             return; | ||||
|         } | ||||
|         ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); | ||||
|         return; | ||||
|     } | ||||
| @@ -10535,6 +10562,9 @@ static void ggml_compute_forward_out_prod_q_f32( | ||||
|     // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) | ||||
|  | ||||
|     if (params->type == GGML_TASK_INIT) { | ||||
|         if (ith != 0) { | ||||
|             return; | ||||
|         } | ||||
|         ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); | ||||
|         return; | ||||
|     } | ||||
| @@ -10719,6 +10749,9 @@ static void ggml_compute_forward_set_f32( | ||||
|     bool   inplace = (bool) ((int32_t *) dst->op_params)[4]; | ||||
|  | ||||
|     if (!inplace && (params->type == GGML_TASK_INIT)) { | ||||
|         if (params->ith != 0) { | ||||
|             return; | ||||
|         } | ||||
|         // memcpy needs to be synchronized across threads to avoid race conditions. | ||||
|         // => do it in INIT phase | ||||
|         memcpy( | ||||
| @@ -11043,6 +11076,9 @@ static void ggml_compute_forward_get_rows_back_f32_f16( | ||||
|     // ggml_compute_forward_dup_same_cont(params, opt0, dst); | ||||
|  | ||||
|     if (params->type == GGML_TASK_INIT) { | ||||
|         if (params->ith != 0) { | ||||
|             return; | ||||
|         } | ||||
|         memset(dst->data, 0, ggml_nbytes(dst)); | ||||
|     } | ||||
|  | ||||
| @@ -11077,6 +11113,9 @@ static void ggml_compute_forward_get_rows_back_f32( | ||||
|     // ggml_compute_forward_dup_same_cont(params, opt0, dst); | ||||
|  | ||||
|     if (params->type == GGML_TASK_INIT) { | ||||
|         if (params->ith != 0) { | ||||
|             return; | ||||
|         } | ||||
|         memset(dst->data, 0, ggml_nbytes(dst)); | ||||
|     } | ||||
|  | ||||
| @@ -11214,6 +11253,9 @@ static void ggml_compute_forward_diag_mask_f32( | ||||
|     GGML_ASSERT(n_past >= 0); | ||||
|  | ||||
|     if (!inplace && (params->type == GGML_TASK_INIT)) { | ||||
|         if (ith != 0) { | ||||
|             return; | ||||
|         } | ||||
|         // memcpy needs to be synchronized across threads to avoid race conditions. | ||||
|         // => do it in INIT phase | ||||
|         GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); | ||||
| @@ -12184,6 +12226,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( | ||||
|     GGML_ASSERT(nb10 == sizeof(float)); | ||||
|  | ||||
|     if (params->type == GGML_TASK_INIT) { | ||||
|         if (ith != 0) { | ||||
|             return; | ||||
|         } | ||||
|         memset(params->wdata, 0, params->wsize); | ||||
|  | ||||
|         // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) | ||||
| @@ -12278,6 +12323,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32( | ||||
|     GGML_ASSERT(nb10 == sizeof(float)); | ||||
|  | ||||
|     if (params->type == GGML_TASK_INIT) { | ||||
|         if (ith != 0) { | ||||
|             return; | ||||
|         } | ||||
|         memset(params->wdata, 0, params->wsize); | ||||
|  | ||||
|         // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) | ||||
| @@ -12502,6 +12550,9 @@ static void ggml_compute_forward_conv_transpose_2d( | ||||
|     GGML_ASSERT(nb10 == sizeof(float)); | ||||
|  | ||||
|     if (params->type == GGML_TASK_INIT) { | ||||
|         if (ith != 0) { | ||||
|             return; | ||||
|         } | ||||
|         memset(params->wdata, 0, params->wsize); | ||||
|  | ||||
|         // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout) | ||||
| @@ -14116,6 +14167,9 @@ static void ggml_compute_forward_add_rel_pos_f32( | ||||
|  | ||||
|     const bool inplace = (bool) ((int32_t *) dst->op_params)[0]; | ||||
|     if (!inplace && params->type == GGML_TASK_INIT) { | ||||
|         if (params->ith != 0) { | ||||
|             return; | ||||
|         } | ||||
|         memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst)); | ||||
|         return; | ||||
|     } | ||||
| @@ -16411,6 +16465,7 @@ struct ggml_compute_state_shared { | ||||
|     // synchronization primitives | ||||
|     atomic_int n_active;  // num active threads | ||||
|     atomic_int node_n;    // active graph node | ||||
|     atomic_int node_task; // active graph node task phase | ||||
|  | ||||
|     bool (*abort_callback)(void * data); // abort ggml_graph_compute when true | ||||
|     void * abort_callback_data; | ||||
| @@ -16658,6 +16713,34 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { | ||||
|     return n_tasks; | ||||
| } | ||||
|  | ||||
| static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) { | ||||
|     // spin (calling sched_yield() each pass when do_yield is set) until state->shared->node_n differs from the value *node_n held on entry; the newly observed value is left in *node_n | ||||
|     const int last_node_n = * node_n; | ||||
|  | ||||
|     while (true) { | ||||
|         if (do_yield) { | ||||
|             sched_yield(); | ||||
|         } | ||||
|  | ||||
|         * node_n = atomic_load(&state->shared->node_n); | ||||
|         if (* node_n != last_node_n) break; | ||||
|     } | ||||
| } | ||||
|  | ||||
| static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) { | ||||
|     // spin (calling sched_yield() each pass when do_yield is set) until state->shared->node_task differs from the value *task_phase held on entry; the newly observed value is left in *task_phase | ||||
|     const int last_task_phase = * task_phase; | ||||
|  | ||||
|     while (true) { | ||||
|         if (do_yield) { | ||||
|             sched_yield(); | ||||
|         } | ||||
|  | ||||
|         * task_phase = atomic_load(&state->shared->node_task); | ||||
|         if (* task_phase != last_task_phase) break; | ||||
|     } | ||||
| } | ||||
|  | ||||
| static thread_ret_t ggml_graph_compute_thread(void * data) { | ||||
|     struct ggml_compute_state * state = (struct ggml_compute_state *) data; | ||||
|  | ||||
| @@ -16669,6 +16752,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { | ||||
|     set_numa_thread_affinity(state->ith, n_threads); | ||||
|  | ||||
|     int node_n     = -1; | ||||
|     int task_phase = GGML_TASK_FINALIZE; | ||||
|  | ||||
|     while (true) { | ||||
|         if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { | ||||
| @@ -16708,13 +16792,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { | ||||
|  | ||||
|                 params.nth = n_tasks; | ||||
|  | ||||
|                 if (n_tasks == 1) { | ||||
|                     /* INIT */ | ||||
|                     if (GGML_OP_HAS_INIT[node->op]) { | ||||
|                         params.type = GGML_TASK_INIT; | ||||
|                         ggml_compute_forward(¶ms, node); | ||||
|                     } | ||||
|  | ||||
|                 if (n_tasks == 1) { | ||||
|                     // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, | ||||
|                     // they do something more efficient than spinning (?) | ||||
|                     params.type = GGML_TASK_COMPUTE; | ||||
| @@ -16735,38 +16819,24 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             task_phase = GGML_TASK_INIT; | ||||
|             atomic_store(&state->shared->n_active,  n_threads); | ||||
|             atomic_store(&state->shared->node_n,    node_n); | ||||
|             atomic_store(&state->shared->node_task, task_phase); | ||||
|         } else { | ||||
|             // wait for other threads to finish | ||||
|             const int last = node_n; | ||||
|  | ||||
|             const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT; | ||||
|  | ||||
|             while (true) { | ||||
|                 // TODO: this sched_yield can have significant impact on the performance - either positive or negative | ||||
|                 //       depending on the workload and the operating system. | ||||
|                 //       since it is not clear what is the best approach, it should potentially become user-configurable | ||||
|                 //       ref: https://github.com/ggerganov/ggml/issues/291 | ||||
|                 // UPD:  adding the do_yield flag seems to resolve the issue universally | ||||
|                 if (do_yield) { | ||||
|                     sched_yield(); | ||||
|                 } | ||||
|  | ||||
|                 node_n = atomic_load(&state->shared->node_n); | ||||
|                 if (node_n != last) break; | ||||
|             }; | ||||
|             ggml_graph_compute_thread_sync_node(&node_n,     state, false); | ||||
|             ggml_graph_compute_thread_sync_task(&task_phase, state, false); | ||||
|         } | ||||
|  | ||||
|         // check if we should stop | ||||
|         if (node_n >= cgraph->n_nodes) break; | ||||
|  | ||||
|         /* COMPUTE */ | ||||
|         /* INIT & COMPUTE */ | ||||
|         struct ggml_tensor * node = cgraph->nodes[node_n]; | ||||
|         const int n_tasks = ggml_get_n_tasks(node, n_threads); | ||||
|  | ||||
|         struct ggml_compute_params params = { | ||||
|             /*.type  =*/ GGML_TASK_COMPUTE, | ||||
|             /*.type  =*/ GGML_TASK_INIT, | ||||
|             /*.ith   =*/ state->ith, | ||||
|             /*.nth   =*/ n_tasks, | ||||
|             /*.wsize =*/ cplan->work_size, | ||||
| @@ -16774,10 +16844,41 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { | ||||
|         }; | ||||
|  | ||||
|         if (state->ith < n_tasks) { | ||||
|             if (GGML_OP_HAS_INIT[node->op]) { | ||||
|                 ggml_compute_forward(¶ms, node); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { | ||||
|             task_phase = GGML_TASK_COMPUTE; | ||||
|             atomic_store(&state->shared->n_active,  n_threads); | ||||
|             atomic_store(&state->shared->node_task, task_phase); | ||||
|         } | ||||
|         else { | ||||
|             // TODO: this sched_yield can have significant impact on the performance - either positive or negative | ||||
|             //       depending on the workload and the operating system. | ||||
|             //       since it is not clear what is the best approach, it should potentially become user-configurable | ||||
|             //       ref: https://github.com/ggerganov/ggml/issues/291 | ||||
|             // UPD:  adding the do_yield flag seems to resolve the issue universally | ||||
|             const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT; | ||||
|             ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield); | ||||
|         } | ||||
|  | ||||
|         if (state->ith < n_tasks) { | ||||
|             params.type = GGML_TASK_COMPUTE; | ||||
|             ggml_compute_forward(¶ms, node); | ||||
|         } | ||||
|  | ||||
|         if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { | ||||
|             task_phase = GGML_TASK_FINALIZE; | ||||
|             atomic_store(&state->shared->n_active,  n_threads); | ||||
|             atomic_store(&state->shared->node_task, task_phase); | ||||
|         } | ||||
|         else { | ||||
|             ggml_graph_compute_thread_sync_task(&task_phase, state, false); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     return GGML_EXIT_SUCCESS; | ||||
| } | ||||
|  | ||||
| @@ -16832,8 +16933,8 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa | ||||
| #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) | ||||
|                     if (ggml_compute_forward_mul_mat_use_blas(node)) { | ||||
|                         if (node->src[0]->type != GGML_TYPE_F32) { | ||||
|                             // here we need memory just for single 2D matrix from src0 | ||||
|                             cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]); | ||||
|                             // here we need memory for fully dequantized matrix from src0 | ||||
|                             cur = ggml_type_size(GGML_TYPE_F32)*ggml_nelements(node->src[0]); | ||||
|                         } | ||||
|                     } else | ||||
| #endif | ||||
| @@ -16987,6 +17088,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { | ||||
|         /*.n_threads               =*/ n_threads, | ||||
|         /*.n_active                =*/ n_threads, | ||||
|         /*.node_n                  =*/ -1, | ||||
|         /*.node_task               =*/ GGML_TASK_FINALIZE, | ||||
|         /*.abort_callback          =*/ NULL, | ||||
|         /*.abort_callback_data     =*/ NULL, | ||||
|     }; | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Reinforce-II
					Reinforce-II