mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	Add OpenCL add kernel (#5151)
* Add OpenCL add kernel * Put add kernel into different string to stay within MSVC string length limit, disable float16 support due to bad results
This commit is contained in:
		| @@ -714,7 +714,6 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, | |||||||
|         dst[row] = tmp[0]; |         dst[row] = tmp[0]; | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| ); | ); | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -784,6 +783,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float | |||||||
|         dst[row] = tmp[0]; |         dst[row] = tmp[0]; | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| ); | ); | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -799,6 +799,18 @@ __kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y | |||||||
| } | } | ||||||
| ); | ); | ||||||
|  |  | ||||||
|  | std::string add_template = MULTILINE_QUOTE( | ||||||
|  | __kernel void add_f32(__global float * x, const int x_offset, __global float * y, const int y_offset, __global float * dst, const int dst_offset, const int ky) { | ||||||
|  |     const int i = get_group_id(0)*get_local_size(0) + get_local_id(0); | ||||||
|  |  | ||||||
|  |     if (i >= get_global_size(0)) { | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     dst[dst_offset + i] = x[x_offset + i] + y[y_offset + i%ky]; | ||||||
|  | } | ||||||
|  | ); | ||||||
|  |  | ||||||
| #define CL_CHECK(err)                                               \ | #define CL_CHECK(err)                                               \ | ||||||
|     do {                                                            \ |     do {                                                            \ | ||||||
|         cl_int err_ = (err);                                        \ |         cl_int err_ = (err);                                        \ | ||||||
| @@ -878,6 +890,7 @@ static std::string generate_kernels() { | |||||||
|         } |         } | ||||||
|         src << mul_kernel << '\n'; |         src << mul_kernel << '\n'; | ||||||
|     } |     } | ||||||
|  |     src << add_template << '\n'; | ||||||
|  |  | ||||||
|     return src.str(); |     return src.str(); | ||||||
| } | } | ||||||
| @@ -893,6 +906,7 @@ static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, | |||||||
| static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q5_k_cl, dequantize_block_q6_k_cl; | static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q5_k_cl, dequantize_block_q6_k_cl; | ||||||
| static cl_kernel dequantize_mul_mat_vec_q2_K_cl, dequantize_mul_mat_vec_q3_K_cl, dequantize_mul_mat_vec_q4_K_cl, dequantize_mul_mat_vec_q5_K_cl, dequantize_mul_mat_vec_q6_K_cl; | static cl_kernel dequantize_mul_mat_vec_q2_K_cl, dequantize_mul_mat_vec_q3_K_cl, dequantize_mul_mat_vec_q4_K_cl, dequantize_mul_mat_vec_q5_K_cl, dequantize_mul_mat_vec_q6_K_cl; | ||||||
| static cl_kernel mul_f32_cl; | static cl_kernel mul_f32_cl; | ||||||
|  | static cl_kernel add_f32_cl; | ||||||
| static bool fp16_support; | static bool fp16_support; | ||||||
|  |  | ||||||
| static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) { | static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) { | ||||||
| @@ -1100,9 +1114,10 @@ void ggml_cl_init(void) { | |||||||
|     char *ext_buffer = (char *)alloca(ext_str_size + 1); |     char *ext_buffer = (char *)alloca(ext_str_size + 1); | ||||||
|     clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL); |     clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL); | ||||||
|     ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated |     ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated | ||||||
|  |     // Disabled due to faulty outputs | ||||||
|     // Check if ext_buffer contains cl_khr_fp16 |     // Check if ext_buffer contains cl_khr_fp16 | ||||||
|     fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL; |     fp16_support = false;  // strstr(ext_buffer, "cl_khr_fp16") != NULL; | ||||||
|     fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false"); |     // fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false"); | ||||||
|  |  | ||||||
|     cl_context_properties properties[] = { |     cl_context_properties properties[] = { | ||||||
|         (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0 |         (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0 | ||||||
| @@ -1150,6 +1165,8 @@ void ggml_cl_init(void) { | |||||||
|  |  | ||||||
|     // mul kernel |     // mul kernel | ||||||
|     CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err)); |     CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err)); | ||||||
|  |  | ||||||
|  |     CL_CHECK((add_f32_cl = clCreateKernel(program, "add_f32", &err), err)); | ||||||
| } | } | ||||||
|  |  | ||||||
| static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) { | static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) { | ||||||
| @@ -1458,6 +1475,70 @@ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src | |||||||
|     ggml_cl_mul_f32(src0, src1, dst); |     ggml_cl_mul_f32(src0, src1, dst); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | static void ggml_cl_add_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { | ||||||
|  |     GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); | ||||||
|  |     const int64_t ne00 = src0->ne[0]; | ||||||
|  |     const int64_t ne01 = src0->ne[1]; | ||||||
|  |     const int64_t ne02 = src0->ne[2]; | ||||||
|  |     const int64_t ne03 = src0->ne[3]; | ||||||
|  |     const int64_t ne10 = src1->ne[0]; | ||||||
|  |     const int64_t ne11 = src1->ne[1]; | ||||||
|  |     const int64_t ne12 = src1->ne[2]; | ||||||
|  |     const int64_t ne13 = src1->ne[3]; | ||||||
|  |     const int nb2  = dst->nb[2]; | ||||||
|  |     const int nb3  = dst->nb[3]; | ||||||
|  |     size_t x_size; | ||||||
|  |     size_t d_size; | ||||||
|  |  | ||||||
|  |     cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0 | ||||||
|  |     cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted. | ||||||
|  |     cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     for (int64_t i03 = 0; i03 < ne03; i03++) { | ||||||
|  |         for (int64_t i02 = 0; i02 < ne02; i02++) { | ||||||
|  |             cl_event ev; | ||||||
|  |  | ||||||
|  |             // copy src0 to device | ||||||
|  |             CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev)); | ||||||
|  |  | ||||||
|  |             const int64_t i13 = i03%ne13; | ||||||
|  |             const int64_t i12 = i02%ne12; | ||||||
|  |             const int i1 = i13*ne12*ne11 + i12*ne11; | ||||||
|  |  | ||||||
|  |             cl_int x_offset = 0; | ||||||
|  |             cl_int y_offset = i1*ne10; | ||||||
|  |             cl_int d_offset = 0; | ||||||
|  |  | ||||||
|  |             size_t global = ne00 * ne01; | ||||||
|  |             cl_int ky = ne10 * ne11; | ||||||
|  |  | ||||||
|  |             CL_CHECK(clSetKernelArg(add_f32_cl, 0, sizeof(cl_mem), &d_X)); | ||||||
|  |             CL_CHECK(clSetKernelArg(add_f32_cl, 1, sizeof(cl_int), &x_offset)); | ||||||
|  |             CL_CHECK(clSetKernelArg(add_f32_cl, 2, sizeof(cl_mem), &d_Y)); | ||||||
|  |             CL_CHECK(clSetKernelArg(add_f32_cl, 3, sizeof(cl_int), &y_offset)); | ||||||
|  |             CL_CHECK(clSetKernelArg(add_f32_cl, 4, sizeof(cl_mem), &d_D)); | ||||||
|  |             CL_CHECK(clSetKernelArg(add_f32_cl, 5, sizeof(cl_int), &d_offset)); | ||||||
|  |             CL_CHECK(clSetKernelArg(add_f32_cl, 6, sizeof(cl_int), &ky)); | ||||||
|  |             CL_CHECK(clEnqueueNDRangeKernel(queue, add_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL)); | ||||||
|  |  | ||||||
|  |             CL_CHECK(clReleaseEvent(ev)); | ||||||
|  |             CL_CHECK(clFinish(queue)); | ||||||
|  |  | ||||||
|  |             // copy dst to host | ||||||
|  |             float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); | ||||||
|  |             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL)); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |     ggml_cl_pool_free(d_X, x_size); | ||||||
|  |     ggml_cl_pool_free(d_D, d_size); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { | ||||||
|  |     GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); | ||||||
|  |     ggml_cl_add_f32(src0, src1, dst); | ||||||
|  | } | ||||||
|  |  | ||||||
| static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { | static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { | ||||||
|     const int64_t ne00 = src0->ne[0]; |     const int64_t ne00 = src0->ne[0]; | ||||||
|     const int64_t ne01 = src0->ne[1]; |     const int64_t ne01 = src0->ne[1]; | ||||||
|   | |||||||
| @@ -10,6 +10,7 @@ extern "C" { | |||||||
| GGML_API void ggml_cl_init(void); | GGML_API void ggml_cl_init(void); | ||||||
|  |  | ||||||
| GGML_API void   ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); | GGML_API void   ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); | ||||||
|  | GGML_API void   ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); | ||||||
| GGML_API bool   ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst); | GGML_API bool   ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst); | ||||||
| GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); | GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); | ||||||
| GGML_API void   ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); | GGML_API void   ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); | ||||||
|   | |||||||
							
								
								
									
										11
									
								
								ggml.c
									
									
									
									
									
								
							
							
						
						
									
										11
									
								
								ggml.c
									
									
									
									
									
								
							| @@ -7207,6 +7207,17 @@ static void ggml_compute_forward_add_f32( | |||||||
|     const int ith = params->ith; |     const int ith = params->ith; | ||||||
|     const int nth = params->nth; |     const int nth = params->nth; | ||||||
|  |  | ||||||
|  | #ifdef GGML_USE_CLBLAST | ||||||
|  |     if (src1->backend == GGML_BACKEND_GPU) { | ||||||
|  |         // TODO: OpenCL kernel support full broadcast | ||||||
|  |         GGML_ASSERT(ggml_can_repeat_rows(src1, src0)); | ||||||
|  |         if (ith == 0) { | ||||||
|  |             ggml_cl_add(src0, src1, dst); | ||||||
|  |         } | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  | #endif | ||||||
|  |  | ||||||
|     const int nr  = ggml_nrows(src0); |     const int nr  = ggml_nrows(src0); | ||||||
|  |  | ||||||
|     GGML_TENSOR_BINARY_OP_LOCALS |     GGML_TENSOR_BINARY_OP_LOCALS | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 0cc4m
					0cc4m