mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-28 08:31:25 +00:00 
			
		
		
		
	CLBlast: Add broadcast support for matrix multiplication (#3402)
Broadcast src0 into src1 across dimensions 2 and 3 when needed. This is required for models that use GQA.
This commit is contained in:
		| @@ -1476,10 +1476,15 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr | ||||
|  | ||||
|     const int64_t ne10 = src1->ne[0]; | ||||
|     const int64_t ne11 = src1->ne[1]; | ||||
|     const int64_t ne12 = src1->ne[2]; | ||||
|     const int64_t ne13 = src1->ne[3]; | ||||
|  | ||||
|     const int nb2  = dst->nb[2]; | ||||
|     const int nb3  = dst->nb[3]; | ||||
|  | ||||
|     const int64_t r2 = ne12 / ne02; | ||||
|     const int64_t r3 = ne13 / ne03; | ||||
|  | ||||
|     const float alpha = 1.0f; | ||||
|     const float beta = 0.0f; | ||||
|     const int x_ne = ne01 * ne00; | ||||
| @@ -1498,13 +1503,22 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr | ||||
|     cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); | ||||
|     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); | ||||
|  | ||||
|     for (int64_t i03 = 0; i03 < ne03; i03++) { | ||||
|         for (int64_t i02 = 0; i02 < ne02; i02++) { | ||||
|     int64_t pi02 = -1; | ||||
|     int64_t pi03 = -1; | ||||
|  | ||||
|     for (int64_t i13 = 0; i13 < ne13; i13++) { | ||||
|         int64_t i03 = i13 / r3; | ||||
|  | ||||
|         for (int64_t i12 = 0; i12 < ne12; i12++) { | ||||
|             int64_t i02 = i12 / r2; | ||||
|  | ||||
|             // copy data to device | ||||
|             if (src0->backend != GGML_BACKEND_GPU) { | ||||
|             if (src0->backend != GGML_BACKEND_GPU && (i02 != pi02 || i03 != pi03)) { | ||||
|                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); | ||||
|                 pi02 = i02; | ||||
|                 pi03 = i03; | ||||
|             } | ||||
|             CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL)); | ||||
|             CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); | ||||
|  | ||||
|             CL_CHECK(clFinish(queue)); | ||||
|  | ||||
| @@ -1525,7 +1539,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr | ||||
|             } | ||||
|  | ||||
|             // copy dst to host | ||||
|             float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); | ||||
|             float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); | ||||
|             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL)); | ||||
|         } | ||||
|     } | ||||
| @@ -1547,6 +1561,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr | ||||
|  | ||||
|     const int64_t ne10 = src1->ne[0]; | ||||
|     const int64_t ne11 = src1->ne[1]; | ||||
|     const int64_t ne12 = src1->ne[2]; | ||||
|     const int64_t ne13 = src1->ne[3]; | ||||
|  | ||||
|     const int nb10 = src1->nb[0]; | ||||
|     const int nb11 = src1->nb[1]; | ||||
| @@ -1556,6 +1572,9 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr | ||||
|     const int nb2  = dst->nb[2]; | ||||
|     const int nb3  = dst->nb[3]; | ||||
|  | ||||
|     const int64_t r2 = ne12 / ne02; | ||||
|     const int64_t r3 = ne13 / ne03; | ||||
|  | ||||
|     const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f); | ||||
|     const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f); | ||||
|     const int x_ne = ne01 * ne00; | ||||
| @@ -1577,32 +1596,41 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr | ||||
|     bool src1_cont_rows = nb10 == sizeof(float); | ||||
|     bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float); | ||||
|  | ||||
|     for (int64_t i03 = 0; i03 < ne03; i03++) { | ||||
|         for (int64_t i02 = 0; i02 < ne02; i02++) { | ||||
|     int64_t pi02 = -1; | ||||
|     int64_t pi03 = -1; | ||||
|  | ||||
|     for (int64_t i13 = 0; i13 < ne13; i13++) { | ||||
|         int64_t i03 = i13 / r3; | ||||
|  | ||||
|         for (int64_t i12 = 0; i12 < ne12; i12++) { | ||||
|             int64_t i02 = i12 / r2; | ||||
|  | ||||
|             // copy src0 to device | ||||
|             if (src0->backend != GGML_BACKEND_GPU) { | ||||
|             if (src0->backend != GGML_BACKEND_GPU && (i02 != pi02 || i03 != pi03)) { | ||||
|                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); | ||||
|                 pi02 = i02; | ||||
|                 pi03 = i03; | ||||
|             } | ||||
|  | ||||
|             // convert src1 to fp16 | ||||
|             // TODO: use multiple threads | ||||
|             ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02); | ||||
|             char * src1i = (char *) src1->data + i03*nb13 + i02*nb12; | ||||
|             ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12); | ||||
|             char * src1i = (char *) src1->data + i13*nb13 + i12*nb12; | ||||
|             if (src1_cont_rows) { | ||||
|                 if (src1_cont_cols) { | ||||
|                     ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11); | ||||
|                 } | ||||
|                 else { | ||||
|                     for (int64_t i01 = 0; i01 < ne11; i01++) { | ||||
|                         ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10); | ||||
|                     for (int64_t i11 = 0; i11 < ne11; i11++) { | ||||
|                         ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10); | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|             else { | ||||
|                 for (int64_t i01 = 0; i01 < ne11; i01++) { | ||||
|                     for (int64_t i00 = 0; i00 < ne10; i00++) { | ||||
|                 for (int64_t i11 = 0; i11 < ne11; i11++) { | ||||
|                     for (int64_t i10 = 0; i10 < ne10; i10++) { | ||||
|                         // very slow due to no inlining | ||||
|                         tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10)); | ||||
|                         tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10)); | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
| @@ -1631,7 +1659,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr | ||||
|             // copy dst to host, then convert to float | ||||
|             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL)); | ||||
|  | ||||
|             float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); | ||||
|             float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); | ||||
|  | ||||
|             ggml_fp16_to_fp32_row(tmp, d, d_ne); | ||||
|         } | ||||
| @@ -1652,12 +1680,17 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * | ||||
|  | ||||
|     const int64_t ne10 = src1->ne[0]; | ||||
|     const int64_t ne11 = src1->ne[1]; | ||||
|     const int64_t ne12 = src1->ne[2]; | ||||
|     const int64_t ne13 = src1->ne[3]; | ||||
|  | ||||
|     const int nb2  = dst->nb[2]; | ||||
|     const int nb3  = dst->nb[3]; | ||||
|     const ggml_type type = src0->type; | ||||
|     const bool mul_mat_vec = ne11 == 1; | ||||
|  | ||||
|     const int64_t r2 = ne12 / ne02; | ||||
|     const int64_t r3 = ne13 / ne03; | ||||
|  | ||||
|     const float alpha = 1.0f; | ||||
|     const float beta = 0.0f; | ||||
|     const int x_ne = ne01 * ne00; | ||||
| @@ -1690,12 +1723,23 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * | ||||
|     size_t ev_idx = 0; | ||||
|     std::vector<cl_event> events; | ||||
|  | ||||
|     for (int64_t i03 = 0; i03 < ne03; i03++) { | ||||
|         for (int64_t i02 = 0; i02 < ne02; i02++) { | ||||
|     int64_t pi02 = -1; | ||||
|     int64_t pi03 = -1; | ||||
|  | ||||
|     for (int64_t i13 = 0; i13 < ne13; i13++) { | ||||
|         int64_t i03 = i13 / r3; | ||||
|  | ||||
|         for (int64_t i12 = 0; i12 < ne12; i12++) { | ||||
|             int64_t i02 = i12 / r2; | ||||
|  | ||||
|             // copy src0 to device if necessary | ||||
|             if (src0->backend == GGML_BACKEND_CPU) { | ||||
|                 if (i02 != pi02 || i03 != pi03) { | ||||
|                     events.emplace_back(); | ||||
|                     CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++)); | ||||
|                     pi02 = i02; | ||||
|                     pi03 = i03; | ||||
|                 } | ||||
|             } else if (src0->backend == GGML_BACKEND_GPU) { | ||||
|                 d_Q = (cl_mem) src0->extra; | ||||
|             } else { | ||||
| @@ -1704,7 +1748,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * | ||||
|             if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel | ||||
|                 // copy src1 to device | ||||
|                 events.emplace_back(); | ||||
|                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++)); | ||||
|                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++)); | ||||
|  | ||||
|                 // compute | ||||
|                 const size_t global = ne01 * CL_DMMV_BLOCK_SIZE; | ||||
| @@ -1725,7 +1769,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * | ||||
|                 CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL)); | ||||
|  | ||||
|                 // copy src1 to device | ||||
|                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL)); | ||||
|                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); | ||||
|  | ||||
|                 events.emplace_back(); | ||||
|  | ||||
| @@ -1749,7 +1793,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * | ||||
|             } | ||||
|  | ||||
|             // copy dst to host | ||||
|             float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); | ||||
|             float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); | ||||
|             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL)); | ||||
|             for (auto *event : events) { | ||||
|                 clReleaseEvent(event); | ||||
|   | ||||
							
								
								
									
										5
									
								
								ggml.c
									
									
									
									
									
								
							
							
						
						
									
										5
									
								
								ggml.c
									
									
									
									
									
								
							| @@ -11621,11 +11621,6 @@ static void ggml_compute_forward_mul_mat( | ||||
|  | ||||
| #if defined(GGML_USE_CLBLAST) | ||||
|     if (ggml_cl_can_mul_mat(src0, src1, dst)) { | ||||
|         // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension | ||||
|         //       ref: https://github.com/ggerganov/ggml/pull/224 | ||||
|         GGML_ASSERT(ne02 == ne12); | ||||
|         GGML_ASSERT(ne03 == ne13); | ||||
|  | ||||
|         if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { | ||||
|             ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); | ||||
|         } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 shibe2
					shibe2