mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-31 08:51:55 +00:00 
			
		
		
		
	opencl: fix couple crashes (#12795)
* opencl: fix couple crashes * fix kernel launches failed on devices which do not support non-uniform work-groups. When non-uniform work-groups are not supported, set `local_work_size` to NULL (= let driver choose the work-group sizes). This patch does not cover everything - just the cases tested by test-backend-ops. * fix sub-buffer creation failed due to `cl_buffer_region::origin` not being aligned to `CL_DEVICE_MEM_BASE_ADDR_ALIGN`. * OpenCL: query non-uniform WG sizes only on OpenCL 3.0+
This commit is contained in:
		| @@ -74,6 +74,7 @@ struct ggml_cl_version { | ||||
|     cl_uint minor = 0; | ||||
| }; | ||||
|  | ||||
|  | ||||
| struct ggml_cl_compiler_version { | ||||
|     ADRENO_CL_COMPILER_TYPE type; | ||||
|     int major = -1; | ||||
| @@ -91,6 +92,14 @@ struct ggml_cl_compiler_version { | ||||
|     } | ||||
| }; | ||||
|  | ||||
| static size_t align_to(size_t value, size_t to_alignment) { | ||||
|     GGML_ASSERT(to_alignment && "Invalid alignment (must be non-zero)"); | ||||
|     GGML_ASSERT((to_alignment & (to_alignment - 1)) == 0 && "to_alignment must be power-of-two"); | ||||
|  | ||||
|     return ((value + to_alignment - 1) / to_alignment) * to_alignment; | ||||
| } | ||||
|  | ||||
|  | ||||
| // Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes. | ||||
| static ggml_cl_version parse_cl_version(std::string_view str) { | ||||
|     size_t major_str_begin = 0; | ||||
| @@ -248,6 +257,8 @@ struct ggml_backend_opencl_context { | ||||
|  | ||||
|     int adreno_wave_size; | ||||
|  | ||||
|     cl_bool non_uniform_workgroups; | ||||
|  | ||||
|     cl_context context; | ||||
|     cl_command_queue queue; | ||||
|  | ||||
| @@ -1397,6 +1408,15 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { | ||||
|     GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n", | ||||
|         svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false"); | ||||
|  | ||||
|     if (opencl_c_version.major >= 3) { | ||||
|         CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool), | ||||
|                                  &backend_ctx->non_uniform_workgroups, 0)); | ||||
|     } else { | ||||
|         GGML_ASSERT(opencl_c_version.major == 2); | ||||
|         // Non-uniform workgroup sizes is mandatory feature in v2.x. | ||||
|         backend_ctx->non_uniform_workgroups = true; | ||||
|     } | ||||
|  | ||||
|     // Print out configurations | ||||
| #ifdef GGML_OPENCL_SOA_Q | ||||
|     GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n"); | ||||
| @@ -2058,15 +2078,16 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, | ||||
|         // The original tensor memory is divided into scales and quants, i.e., | ||||
|         // we first store scales, then quants. | ||||
|         // Create subbuffer for scales. | ||||
|         region.origin = extra_orig->offset + tensor->view_offs + offset; | ||||
|         region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment); | ||||
|         region.size = size_d; | ||||
|         extra->d = clCreateSubBuffer( | ||||
|             extra_orig->data_device, CL_MEM_READ_WRITE, | ||||
|             CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); | ||||
|         CL_CHECK(err); | ||||
|         auto previous_origin = region.origin; | ||||
|  | ||||
|         // Create subbuffer for quants. | ||||
|         region.origin = extra_orig->offset + tensor->view_offs + offset + size_d; | ||||
|         region.origin = align_to(previous_origin + size_d, backend_ctx->alignment); | ||||
|         region.size = size_q; | ||||
|         extra->q = clCreateSubBuffer( | ||||
|             extra_orig->data_device, CL_MEM_READ_WRITE, | ||||
| @@ -2942,14 +2963,19 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const | ||||
|         size_t global_work_size[] = {(size_t)n, 1, 1}; | ||||
|         size_t local_work_size[] = {64, 1, 1}; | ||||
|  | ||||
|         size_t * local_work_size_ptr = local_work_size; | ||||
|         if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) { | ||||
|             local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes. | ||||
|         } | ||||
|  | ||||
| #ifdef GGML_OPENCL_PROFILING | ||||
|         cl_event evt; | ||||
|         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); | ||||
|         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); | ||||
|  | ||||
|         g_profiling_info.emplace_back(); | ||||
|         populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); | ||||
|         populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); | ||||
| #else | ||||
|         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); | ||||
|         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); | ||||
| #endif | ||||
|     } else { | ||||
|         unsigned int nth = MIN(64, ne0); | ||||
| @@ -3077,14 +3103,19 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const | ||||
|         size_t global_work_size[] = {(size_t)n, 1, 1}; | ||||
|         size_t local_work_size[] = {64, 1, 1}; | ||||
|  | ||||
|         size_t * local_work_size_ptr = local_work_size; | ||||
|         if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) { | ||||
|             local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes. | ||||
|         } | ||||
|  | ||||
| #ifdef GGML_OPENCL_PROFILING | ||||
|         cl_event evt; | ||||
|         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); | ||||
|         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); | ||||
|  | ||||
|         g_profiling_info.emplace_back(); | ||||
|         populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); | ||||
|         populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); | ||||
| #else | ||||
|         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); | ||||
|         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); | ||||
| #endif | ||||
|     } else { | ||||
|         unsigned int nth = MIN(64, ne0); | ||||
| @@ -3233,14 +3264,19 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const | ||||
|     size_t global_work_size[] = {(size_t)n, 1, 1}; | ||||
|     size_t local_work_size[] = {64, 1, 1}; | ||||
|  | ||||
|     size_t * local_work_size_ptr = local_work_size; | ||||
|     if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) { | ||||
|         local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes. | ||||
|     } | ||||
|  | ||||
| #ifdef GGML_OPENCL_PROFILING | ||||
|     cl_event evt; | ||||
|     CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); | ||||
|     CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); | ||||
|  | ||||
|     g_profiling_info.emplace_back(); | ||||
|     populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); | ||||
|     populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); | ||||
| #else | ||||
|     CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); | ||||
|     CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| @@ -3273,14 +3309,19 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const | ||||
|     size_t global_work_size[] = {(size_t)n, 1, 1}; | ||||
|     size_t local_work_size[] = {64, 1, 1}; | ||||
|  | ||||
|     size_t * local_work_size_ptr = local_work_size; | ||||
|     if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) { | ||||
|         local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes. | ||||
|     } | ||||
|  | ||||
| #ifdef GGML_OPENCL_PROFILING | ||||
|     cl_event evt; | ||||
|     CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); | ||||
|     CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); | ||||
|  | ||||
|     g_profiling_info.emplace_back(); | ||||
|     populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); | ||||
|     populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); | ||||
| #else | ||||
|     CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); | ||||
|     CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| @@ -3320,14 +3361,19 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons | ||||
|     size_t global_work_size[] = {(size_t)n, 1, 1}; | ||||
|     size_t local_work_size[] = {64, 1, 1}; | ||||
|  | ||||
|     size_t * local_work_size_ptr = local_work_size; | ||||
|     if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) { | ||||
|         local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes. | ||||
|     } | ||||
|  | ||||
| #ifdef GGML_OPENCL_PROFILING | ||||
|     cl_event evt; | ||||
|     CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); | ||||
|     CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); | ||||
|  | ||||
|     g_profiling_info.emplace_back(); | ||||
|     populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); | ||||
|     populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); | ||||
| #else | ||||
|     CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); | ||||
|     CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| @@ -4230,14 +4276,19 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons | ||||
|     size_t global_work_size[] = {(size_t)n, 1, 1}; | ||||
|     size_t local_work_size[] = {64, 1, 1}; | ||||
|  | ||||
|     size_t * local_work_size_ptr = local_work_size; | ||||
|     if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) { | ||||
|         local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes. | ||||
|     } | ||||
|  | ||||
| #ifdef GGML_OPENCL_PROFILING | ||||
|     cl_event evt; | ||||
|     CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); | ||||
|     CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); | ||||
|  | ||||
|     g_profiling_info.emplace_back(); | ||||
|     populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); | ||||
|     populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); | ||||
| #else | ||||
|     CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); | ||||
|     CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); | ||||
| #endif | ||||
| } | ||||
|  | ||||
| @@ -4418,14 +4469,19 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr | ||||
|         size_t global_work_size[] = {(size_t)ne00, (size_t)ne01, (size_t)ne02}; | ||||
|         size_t local_work_size[] = {64, 1, 1}; | ||||
|  | ||||
|         size_t * local_work_size_ptr = local_work_size; | ||||
|         if (ne00 % 64 != 0 && !backend_ctx->non_uniform_workgroups) { | ||||
|             local_work_size_ptr = nullptr;  // Let driver choose the work-group sizes. | ||||
|         } | ||||
|  | ||||
| #ifdef GGML_OPENCL_PROFILING | ||||
|         cl_event evt; | ||||
|         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); | ||||
|         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); | ||||
|  | ||||
|         g_profiling_info.emplace_back(); | ||||
|         populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); | ||||
|         populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); | ||||
| #else | ||||
|         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); | ||||
|         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); | ||||
| #endif | ||||
|     } | ||||
| } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Henry Linjamäki
					Henry Linjamäki