mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	OpenCL: Add concat, tsembd, upscale, tanh, pad and repeat (#13840)
* add concat, pad, repeat, tsembd, tanh, upscale * small fixes
This commit is contained in:
		| @@ -95,6 +95,12 @@ set(GGML_OPENCL_KERNELS | |||||||
|     sub |     sub | ||||||
|     sum_rows |     sum_rows | ||||||
|     transpose |     transpose | ||||||
|  |     concat | ||||||
|  |     tsembd | ||||||
|  |     upscale | ||||||
|  |     tanh | ||||||
|  |     pad | ||||||
|  |     repeat | ||||||
| ) | ) | ||||||
|  |  | ||||||
| foreach (K ${GGML_OPENCL_KERNELS}) | foreach (K ${GGML_OPENCL_KERNELS}) | ||||||
|   | |||||||
| @@ -315,6 +315,12 @@ struct ggml_backend_opencl_context { | |||||||
|     cl_program program_softmax_4_f16; |     cl_program program_softmax_4_f16; | ||||||
|     cl_program program_argsort_f32_i32; |     cl_program program_argsort_f32_i32; | ||||||
|     cl_program program_sum_rows_f32; |     cl_program program_sum_rows_f32; | ||||||
|  |     cl_program program_repeat; | ||||||
|  |     cl_program program_pad; | ||||||
|  |     cl_program program_tanh; | ||||||
|  |     cl_program program_upscale; | ||||||
|  |     cl_program program_concat; | ||||||
|  |     cl_program program_tsembd; | ||||||
|  |  | ||||||
|     cl_kernel kernel_add, kernel_add_row; |     cl_kernel kernel_add, kernel_add_row; | ||||||
|     cl_kernel kernel_mul, kernel_mul_row; |     cl_kernel kernel_mul, kernel_mul_row; | ||||||
| @@ -351,6 +357,15 @@ struct ggml_backend_opencl_context { | |||||||
|     cl_kernel kernel_im2col_f32, kernel_im2col_f16; |     cl_kernel kernel_im2col_f32, kernel_im2col_f16; | ||||||
|     cl_kernel kernel_argsort_f32_i32; |     cl_kernel kernel_argsort_f32_i32; | ||||||
|     cl_kernel kernel_sum_rows_f32; |     cl_kernel kernel_sum_rows_f32; | ||||||
|  |     cl_kernel kernel_repeat; | ||||||
|  |     cl_kernel kernel_pad; | ||||||
|  |     cl_kernel kernel_tanh_f32_nd; | ||||||
|  |     cl_kernel kernel_tanh_f16_nd; | ||||||
|  |     cl_kernel kernel_upscale; | ||||||
|  |     cl_kernel kernel_upscale_bilinear; | ||||||
|  |     cl_kernel kernel_concat_f32_contiguous; | ||||||
|  |     cl_kernel kernel_concat_f32_non_contiguous; | ||||||
|  |     cl_kernel kernel_timestep_embedding; | ||||||
|  |  | ||||||
| #ifdef GGML_OPENCL_USE_ADRENO_KERNELS | #ifdef GGML_OPENCL_USE_ADRENO_KERNELS | ||||||
|     // Transpose kernels |     // Transpose kernels | ||||||
| @@ -1097,6 +1112,150 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve | |||||||
|         GGML_LOG_CONT("."); |         GGML_LOG_CONT("."); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |         // repeat | ||||||
|  |     { | ||||||
|  | #ifdef GGML_OPENCL_EMBED_KERNELS | ||||||
|  |         const std::string kernel_src { | ||||||
|  |             #include "repeat.cl.h" | ||||||
|  |         }; | ||||||
|  | #else | ||||||
|  |         const std::string kernel_src = read_file("repeat.cl"); | ||||||
|  | #endif | ||||||
|  |         if (!kernel_src.empty()) { | ||||||
|  |             backend_ctx->program_repeat = | ||||||
|  |                 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); | ||||||
|  |             CL_CHECK((backend_ctx->kernel_repeat = clCreateKernel(backend_ctx->program_repeat, "kernel_repeat", &err), err)); | ||||||
|  |             GGML_LOG_CONT("."); | ||||||
|  |         } else { | ||||||
|  |             GGML_LOG_WARN("ggml_opencl: repeat kernel source not found or empty. Repeat operations will not be available.\n"); | ||||||
|  |             backend_ctx->program_repeat = nullptr; | ||||||
|  |             backend_ctx->kernel_repeat = nullptr; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // pad | ||||||
|  |     { | ||||||
|  | #ifdef GGML_OPENCL_EMBED_KERNELS | ||||||
|  |         const std::string kernel_src { | ||||||
|  |             #include "pad.cl.h" | ||||||
|  |         }; | ||||||
|  | #else | ||||||
|  |         const std::string kernel_src = read_file("pad.cl"); | ||||||
|  | #endif | ||||||
|  |         if (!kernel_src.empty()) { | ||||||
|  |             backend_ctx->program_pad = | ||||||
|  |                 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); | ||||||
|  |             CL_CHECK((backend_ctx->kernel_pad = clCreateKernel(backend_ctx->program_pad, "kernel_pad", &err), err)); | ||||||
|  |             GGML_LOG_CONT("."); | ||||||
|  |         } else { | ||||||
|  |             GGML_LOG_WARN("ggml_opencl: pad kernel source not found or empty. Pad operations will not be available.\n"); | ||||||
|  |             backend_ctx->program_pad = nullptr; | ||||||
|  |             backend_ctx->kernel_pad = nullptr; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // tanh | ||||||
|  |     { | ||||||
|  | #ifdef GGML_OPENCL_EMBED_KERNELS | ||||||
|  |         const std::string kernel_src { | ||||||
|  |             #include "tanh.cl.h" | ||||||
|  |         }; | ||||||
|  | #else | ||||||
|  |         const std::string kernel_src = read_file("tanh.cl"); | ||||||
|  | #endif | ||||||
|  |         if (!kernel_src.empty()) { | ||||||
|  |             backend_ctx->program_tanh = | ||||||
|  |                 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); | ||||||
|  |             CL_CHECK((backend_ctx->kernel_tanh_f32_nd = clCreateKernel(backend_ctx->program_tanh, "kernel_tanh_f32_nd", &err), err)); | ||||||
|  |             CL_CHECK((backend_ctx->kernel_tanh_f16_nd = clCreateKernel(backend_ctx->program_tanh, "kernel_tanh_f16_nd", &err), err)); | ||||||
|  |             GGML_LOG_CONT("."); | ||||||
|  |         } else { | ||||||
|  |             GGML_LOG_WARN("ggml_opencl: tanh kernel source not found or empty. Tanh operation will not be available.\n"); | ||||||
|  |             backend_ctx->program_tanh = nullptr; | ||||||
|  |             backend_ctx->kernel_tanh_f32_nd = nullptr; | ||||||
|  |             backend_ctx->kernel_tanh_f16_nd = nullptr; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // upscale | ||||||
|  |     { | ||||||
|  | #ifdef GGML_OPENCL_EMBED_KERNELS | ||||||
|  |         const std::string kernel_src { | ||||||
|  |             #include "upscale.cl.h" | ||||||
|  |         }; | ||||||
|  | #else | ||||||
|  |         const std::string kernel_src = read_file("upscale.cl"); | ||||||
|  | #endif | ||||||
|  |         if (!kernel_src.empty()) { | ||||||
|  |             backend_ctx->program_upscale = | ||||||
|  |                 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); | ||||||
|  |             CL_CHECK((backend_ctx->kernel_upscale = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale", &err), err)); | ||||||
|  |             if (backend_ctx->program_upscale) { | ||||||
|  |                  cl_int err_bilinear; | ||||||
|  |                  backend_ctx->kernel_upscale_bilinear = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale_bilinear", &err_bilinear); | ||||||
|  |                  if (err_bilinear != CL_SUCCESS) { | ||||||
|  |                     GGML_LOG_WARN("ggml_opencl: kernel_upscale_bilinear not found in upscale.cl. Bilinear upscale will not be available. Error: %d\n", err_bilinear); | ||||||
|  |                     backend_ctx->kernel_upscale_bilinear = nullptr; | ||||||
|  |                  } | ||||||
|  |             } else { | ||||||
|  |                 backend_ctx->kernel_upscale_bilinear = nullptr; | ||||||
|  |             } | ||||||
|  |             GGML_LOG_CONT("."); | ||||||
|  |         } else { | ||||||
|  |             GGML_LOG_WARN("ggml_opencl: upscale kernel source not found or empty. Upscale operations will not be available.\n"); | ||||||
|  |             backend_ctx->program_upscale = nullptr; | ||||||
|  |             backend_ctx->kernel_upscale = nullptr; | ||||||
|  |             backend_ctx->kernel_upscale_bilinear = nullptr; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // concat | ||||||
|  |     { | ||||||
|  | #ifdef GGML_OPENCL_EMBED_KERNELS | ||||||
|  |         const std::string kernel_src { | ||||||
|  |             #include "concat.cl.h" | ||||||
|  |         }; | ||||||
|  | #else | ||||||
|  |  | ||||||
|  |         const std::string kernel_src = read_file("concat.cl"); | ||||||
|  | #endif | ||||||
|  |         if (!kernel_src.empty()) { | ||||||
|  |             backend_ctx->program_concat = | ||||||
|  |                 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); | ||||||
|  |  | ||||||
|  |             CL_CHECK((backend_ctx->kernel_concat_f32_contiguous = clCreateKernel(backend_ctx->program_concat, "kernel_concat_f32_contiguous", &err), err)); | ||||||
|  |             CL_CHECK((backend_ctx->kernel_concat_f32_non_contiguous = clCreateKernel(backend_ctx->program_concat, "kernel_concat_f32_non_contiguous", &err), err)); | ||||||
|  |             GGML_LOG_CONT("."); | ||||||
|  |         } else { | ||||||
|  |             GGML_LOG_WARN("ggml_opencl: concat kernel source not found or empty. Concat operations will not be available.\n"); | ||||||
|  |             backend_ctx->program_concat = nullptr; | ||||||
|  |             backend_ctx->kernel_concat_f32_contiguous = nullptr; | ||||||
|  |             backend_ctx->kernel_concat_f32_non_contiguous = nullptr; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // timestep_embedding | ||||||
|  |     { | ||||||
|  | #ifdef GGML_OPENCL_EMBED_KERNELS | ||||||
|  |         const std::string kernel_src { | ||||||
|  |             #include "tsembd.cl.h" | ||||||
|  |         }; | ||||||
|  | #else | ||||||
|  |  | ||||||
|  |         const std::string kernel_src = read_file("tsembd.cl"); | ||||||
|  | #endif | ||||||
|  |         if (!kernel_src.empty()) { | ||||||
|  |             backend_ctx->program_tsembd = | ||||||
|  |                 build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); | ||||||
|  |             CL_CHECK((backend_ctx->kernel_timestep_embedding = clCreateKernel(backend_ctx->program_tsembd, "kernel_timestep_embedding", &err), err)); | ||||||
|  |             GGML_LOG_CONT("."); | ||||||
|  |         } else { | ||||||
|  |             GGML_LOG_WARN("ggml_opencl: timestep_embedding kernel source not found or empty. This op will not be available.\n"); | ||||||
|  |             backend_ctx->program_tsembd = nullptr; | ||||||
|  |             backend_ctx->kernel_timestep_embedding = nullptr; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|     // Adreno kernels |     // Adreno kernels | ||||||
| #ifdef GGML_OPENCL_USE_ADRENO_KERNELS | #ifdef GGML_OPENCL_USE_ADRENO_KERNELS | ||||||
|     // transpose |     // transpose | ||||||
| @@ -1979,6 +2138,9 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te | |||||||
|                    return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; |                    return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; | ||||||
|                 case GGML_UNARY_OP_SIGMOID: |                 case GGML_UNARY_OP_SIGMOID: | ||||||
|                     return ggml_is_contiguous(op->src[0]); |                     return ggml_is_contiguous(op->src[0]); | ||||||
|  |                 case GGML_UNARY_OP_TANH: | ||||||
|  |                    return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) || | ||||||
|  |                           (op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16); | ||||||
|                 default: |                 default: | ||||||
|                     return false; |                     return false; | ||||||
|             } |             } | ||||||
| @@ -1988,6 +2150,17 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te | |||||||
|         case GGML_OP_NORM: |         case GGML_OP_NORM: | ||||||
|         case GGML_OP_RMS_NORM: |         case GGML_OP_RMS_NORM: | ||||||
|             return true; |             return true; | ||||||
|  |         case GGML_OP_REPEAT: | ||||||
|  |             return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; // Assuming F32 for now, can be expanded | ||||||
|  |         case GGML_OP_PAD: | ||||||
|  |             return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 && | ||||||
|  |                    op->src[0]->ne[3] == 1 && op->ne[3] == 1; | ||||||
|  |         case GGML_OP_UPSCALE: | ||||||
|  |             return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; | ||||||
|  |         case GGML_OP_CONCAT: | ||||||
|  |             return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; | ||||||
|  |         case GGML_OP_TIMESTEP_EMBEDDING: | ||||||
|  |             return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; | ||||||
|         case GGML_OP_GROUP_NORM: |         case GGML_OP_GROUP_NORM: | ||||||
|             return ggml_is_contiguous(op->src[0]); |             return ggml_is_contiguous(op->src[0]); | ||||||
|         case GGML_OP_MUL_MAT: |         case GGML_OP_MUL_MAT: | ||||||
| @@ -4108,6 +4281,536 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, | |||||||
| #endif | #endif | ||||||
| } | } | ||||||
|  |  | ||||||
|  | static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { | ||||||
|  |     GGML_ASSERT(src0); | ||||||
|  |     GGML_ASSERT(src0->extra); | ||||||
|  |     GGML_ASSERT(dst); | ||||||
|  |     GGML_ASSERT(dst->extra); | ||||||
|  |  | ||||||
|  |     UNUSED(src1); | ||||||
|  |  | ||||||
|  |     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; | ||||||
|  |     cl_command_queue queue = backend_ctx->queue; | ||||||
|  |  | ||||||
|  |     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; | ||||||
|  |     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; | ||||||
|  |  | ||||||
|  |     cl_ulong offset0_abs = extra0->offset + src0->view_offs; | ||||||
|  |     cl_ulong offsetd_abs = extrad->offset + dst->view_offs; | ||||||
|  |  | ||||||
|  |     cl_kernel kernel; | ||||||
|  |     if (dst->type == GGML_TYPE_F32) { | ||||||
|  |         kernel = backend_ctx->kernel_tanh_f32_nd; | ||||||
|  |     } else if (dst->type == GGML_TYPE_F16) { | ||||||
|  |         kernel = backend_ctx->kernel_tanh_f16_nd; | ||||||
|  |     } else { | ||||||
|  |         GGML_ASSERT(false && "Unsupported type for ggml_cl_tanh"); | ||||||
|  |     } | ||||||
|  |     GGML_ASSERT(kernel != nullptr); | ||||||
|  |  | ||||||
|  |     const int ne00 = src0->ne[0]; const int ne01 = src0->ne[1]; const int ne02 = src0->ne[2]; const int ne03 = src0->ne[3]; | ||||||
|  |     const cl_ulong nb00 = src0->nb[0]; const cl_ulong nb01 = src0->nb[1]; const cl_ulong nb02 = src0->nb[2]; const cl_ulong nb03 = src0->nb[3]; | ||||||
|  |  | ||||||
|  |     const int ne10 = dst->ne[0]; const int ne11 = dst->ne[1]; const int ne12 = dst->ne[2]; const int ne13 = dst->ne[3]; | ||||||
|  |     const cl_ulong nb10 = dst->nb[0]; const cl_ulong nb11 = dst->nb[1]; const cl_ulong nb12 = dst->nb[2]; const cl_ulong nb13 = dst->nb[3]; | ||||||
|  |  | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs)); | ||||||
|  |  | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),      &ne00)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int),      &ne01)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),      &ne02)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int),      &ne03)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03)); | ||||||
|  |  | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),     &ne10)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),     &ne11)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),     &ne12)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),     &ne13)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13)); | ||||||
|  |  | ||||||
|  |     size_t global_work_size[3]; | ||||||
|  |     if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |     global_work_size[0] = (size_t)ne10; | ||||||
|  |     global_work_size[1] = (size_t)ne11; | ||||||
|  |     global_work_size[2] = (size_t)ne12; | ||||||
|  |  | ||||||
|  |     size_t lws0 = 16, lws1 = 4, lws2 = 1; | ||||||
|  |     if (ne10 < 16) lws0 = ne10; | ||||||
|  |     if (ne11 < 4) lws1 = ne11; | ||||||
|  |     if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1; | ||||||
|  |  | ||||||
|  |     while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2; | ||||||
|  |     while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2; | ||||||
|  |     while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2; | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     size_t local_work_size[] = {lws0, lws1, lws2}; | ||||||
|  |  | ||||||
|  |     size_t* local_work_size_ptr = local_work_size; | ||||||
|  |     if (!backend_ctx->non_uniform_workgroups) { | ||||||
|  |         if (global_work_size[0] % local_work_size[0] != 0 || | ||||||
|  |             global_work_size[1] % local_work_size[1] != 0 || | ||||||
|  |             global_work_size[2] % local_work_size[2] != 0) { | ||||||
|  |             local_work_size_ptr = NULL; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |     if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return; | ||||||
|  |  | ||||||
|  |  | ||||||
|  | #ifdef GGML_OPENCL_PROFILING | ||||||
|  |     cl_event evt; | ||||||
|  |     CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); | ||||||
|  |  | ||||||
|  |     g_profiling_info.emplace_back(); | ||||||
|  |     populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr ? local_work_size : (size_t[3]){0,0,0}, dst); | ||||||
|  | #else | ||||||
|  |     CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); | ||||||
|  | #endif | ||||||
|  | } | ||||||
|  |  | ||||||
|  | static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) { | ||||||
|  |     GGML_ASSERT(src0); | ||||||
|  |     GGML_ASSERT(src0->extra); | ||||||
|  |     GGML_ASSERT(dst); | ||||||
|  |     GGML_ASSERT(dst->extra); | ||||||
|  |     GGML_ASSERT(dst->type == src0->type); | ||||||
|  |  | ||||||
|  |     UNUSED(src1_shape_def); | ||||||
|  |  | ||||||
|  |     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; | ||||||
|  |     cl_command_queue queue = backend_ctx->queue; | ||||||
|  |  | ||||||
|  |     if (backend_ctx->kernel_repeat == nullptr) { | ||||||
|  |         GGML_LOG_WARN("%s: repeat kernel not available, skipping OpenCL execution.\n", __func__); | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra; | ||||||
|  |     ggml_tensor_extra_cl * extra_dst  = (ggml_tensor_extra_cl *)dst->extra; | ||||||
|  |  | ||||||
|  |     cl_ulong off_src0 = extra_src0->offset + src0->view_offs; | ||||||
|  |     cl_ulong off_dst  = extra_dst->offset  + dst->view_offs; | ||||||
|  |  | ||||||
|  |     const int src0_ne0 = src0->ne[0]; const int src0_ne1 = src0->ne[1]; const int src0_ne2 = src0->ne[2]; const int src0_ne3 = src0->ne[3]; | ||||||
|  |     const cl_ulong src0_nb0 = src0->nb[0]; const cl_ulong src0_nb1 = src0->nb[1]; const cl_ulong src0_nb2 = src0->nb[2]; const cl_ulong src0_nb3 = src0->nb[3]; | ||||||
|  |  | ||||||
|  |     const int dst_ne0 = dst->ne[0]; const int dst_ne1 = dst->ne[1]; const int dst_ne2 = dst->ne[2]; const int dst_ne3 = dst->ne[3]; | ||||||
|  |     const cl_ulong dst_nb0 = dst->nb[0]; const cl_ulong dst_nb1 = dst->nb[1]; const cl_ulong dst_nb2 = dst->nb[2]; const cl_ulong dst_nb3 = dst->nb[3]; | ||||||
|  |  | ||||||
|  |     cl_kernel kernel = backend_ctx->kernel_repeat; | ||||||
|  |  | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),    &extra_src0->data_device)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem),    &extra_dst->data_device)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_ulong),  &off_src0)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong),  &off_dst)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),       &src0_ne0)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int),       &src0_ne1)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),       &src0_ne2)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int),       &src0_ne3)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong),  &src0_nb0)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong),  &src0_nb1)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &src0_nb2)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &src0_nb3)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &dst_ne0)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &dst_ne1)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &dst_ne2)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &dst_ne3)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &dst_nb0)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &dst_nb1)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &dst_nb2)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &dst_nb3)); | ||||||
|  |  | ||||||
|  |     size_t gws0 = dst_ne1 > 0 ? (size_t)dst_ne1 : 1; | ||||||
|  |     size_t gws1 = dst_ne2 > 0 ? (size_t)dst_ne2 : 1; | ||||||
|  |     size_t gws2 = dst_ne3 > 0 ? (size_t)dst_ne3 : 1; | ||||||
|  |  | ||||||
|  |     size_t global_work_size[] = { gws0, gws1, gws2 }; | ||||||
|  |  | ||||||
|  | #ifdef GGML_OPENCL_PROFILING | ||||||
|  |     cl_event evt; | ||||||
|  |     CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, &evt)); | ||||||
|  |  | ||||||
|  |     g_profiling_info.emplace_back(); | ||||||
|  |     populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, (size_t[3]){0,0,0}, dst); | ||||||
|  | #else | ||||||
|  |     CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, NULL)); | ||||||
|  | #endif | ||||||
|  | } | ||||||
|  |  | ||||||
|  | static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) { | ||||||
|  |     GGML_ASSERT(src0); | ||||||
|  |     GGML_ASSERT(src0->extra); | ||||||
|  |     GGML_ASSERT(dst); | ||||||
|  |     GGML_ASSERT(dst->extra); | ||||||
|  |     GGML_ASSERT(src0->type == GGML_TYPE_F32); | ||||||
|  |     GGML_ASSERT(dst->type == GGML_TYPE_F32); | ||||||
|  |     GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); | ||||||
|  |  | ||||||
|  |     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; | ||||||
|  |     cl_command_queue queue = backend_ctx->queue; | ||||||
|  |  | ||||||
|  |     if (backend_ctx->kernel_pad == nullptr) { | ||||||
|  |         GGML_LOG_WARN("%s: pad kernel not available, skipping OpenCL execution.\n", __func__); | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra; | ||||||
|  |     ggml_tensor_extra_cl * extra_dst  = (ggml_tensor_extra_cl *)dst->extra; | ||||||
|  |  | ||||||
|  |     cl_ulong off_src0 = extra_src0->offset + src0->view_offs; | ||||||
|  |     cl_ulong off_dst  = extra_dst->offset  + dst->view_offs; | ||||||
|  |  | ||||||
|  |     const int s_ne0 = src0->ne[0]; | ||||||
|  |     const int s_ne1 = src0->ne[1]; | ||||||
|  |     const int s_ne2 = src0->ne[2]; | ||||||
|  |  | ||||||
|  |     const int d_ne0 = dst->ne[0]; | ||||||
|  |     const int d_ne1 = dst->ne[1]; | ||||||
|  |     const int d_ne2 = dst->ne[2]; | ||||||
|  |  | ||||||
|  |     cl_kernel kernel = backend_ctx->kernel_pad; | ||||||
|  |  | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),    &extra_src0->data_device)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong),  &off_src0)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),    &extra_dst->data_device)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong),  &off_dst)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),       &s_ne0)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int),       &s_ne1)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),       &s_ne2)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int),       &d_ne0)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int),       &d_ne1)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int),       &d_ne2)); | ||||||
|  |  | ||||||
|  |     size_t lws0 = 64; | ||||||
|  |     size_t gws0 = (( (size_t)d_ne0 + lws0 - 1 ) / lws0) * lws0; | ||||||
|  |  | ||||||
|  |     size_t global_work_size[] = { gws0, (size_t)d_ne1, (size_t)d_ne2 }; | ||||||
|  |     size_t local_work_size[]  = { lws0, 1, 1 }; | ||||||
|  |  | ||||||
|  |     size_t * local_work_size_ptr = local_work_size; | ||||||
|  |      if (d_ne0 % lws0 != 0 && !backend_ctx->non_uniform_workgroups) { | ||||||
|  |         local_work_size_ptr = nullptr; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  | #ifdef GGML_OPENCL_PROFILING | ||||||
|  |     cl_event evt; | ||||||
|  |     CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); | ||||||
|  |  | ||||||
|  |     g_profiling_info.emplace_back(); | ||||||
|  |     populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr ? local_work_size : (size_t[3]){0,0,0}, dst); | ||||||
|  | #else | ||||||
|  |     CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); | ||||||
|  | #endif | ||||||
|  | } | ||||||
|  |  | ||||||
|  | static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) { | ||||||
|  |     GGML_ASSERT(src0); | ||||||
|  |     GGML_ASSERT(src0->extra); | ||||||
|  |     GGML_ASSERT(dst); | ||||||
|  |     GGML_ASSERT(dst->extra); | ||||||
|  |     GGML_ASSERT(src0->type == GGML_TYPE_F32); | ||||||
|  |     GGML_ASSERT(dst->type == GGML_TYPE_F32); | ||||||
|  |  | ||||||
|  |     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; | ||||||
|  |     cl_command_queue queue = backend_ctx->queue; | ||||||
|  |  | ||||||
|  |     const ggml_scale_mode mode = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0); | ||||||
|  |     cl_kernel kernel = nullptr; | ||||||
|  |  | ||||||
|  |     if (mode == GGML_SCALE_MODE_NEAREST) { | ||||||
|  |         kernel = backend_ctx->kernel_upscale; | ||||||
|  |         if (kernel == nullptr) { | ||||||
|  |             GGML_LOG_WARN("%s: nearest upscale kernel not available, skipping OpenCL execution.\n", __func__); | ||||||
|  |             return; | ||||||
|  |         } | ||||||
|  |     } else if (mode == GGML_SCALE_MODE_BILINEAR) { | ||||||
|  |         kernel = backend_ctx->kernel_upscale_bilinear; | ||||||
|  |         if (kernel == nullptr) { | ||||||
|  |             GGML_LOG_WARN("%s: bilinear upscale kernel not available, skipping OpenCL execution.\n", __func__); | ||||||
|  |             return; | ||||||
|  |         } | ||||||
|  |     } else { | ||||||
|  |         GGML_LOG_WARN("%s: unsupported upscale mode %d, skipping OpenCL execution.\n", __func__, mode); | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra; | ||||||
|  |     ggml_tensor_extra_cl * extra_dst  = (ggml_tensor_extra_cl *)dst->extra; | ||||||
|  |  | ||||||
|  |     cl_ulong off_src0 = extra_src0->offset + src0->view_offs; | ||||||
|  |     cl_ulong off_dst  = extra_dst->offset  + dst->view_offs; | ||||||
|  |  | ||||||
|  |     const cl_ulong nb00 = src0->nb[0]; | ||||||
|  |     const cl_ulong nb01 = src0->nb[1]; | ||||||
|  |     const cl_ulong nb02 = src0->nb[2]; | ||||||
|  |     const cl_ulong nb03 = src0->nb[3]; | ||||||
|  |  | ||||||
|  |     const int ne00_src = src0->ne[0]; | ||||||
|  |     const int ne01_src = src0->ne[1]; | ||||||
|  |  | ||||||
|  |     const int ne10_dst = dst->ne[0]; | ||||||
|  |     const int ne11_dst = dst->ne[1]; | ||||||
|  |     const int ne12_dst = dst->ne[2]; | ||||||
|  |     const int ne13_dst = dst->ne[3]; | ||||||
|  |  | ||||||
|  |     const float sf0 = (float)dst->ne[0] / src0->ne[0]; | ||||||
|  |     const float sf1 = (float)dst->ne[1] / src0->ne[1]; | ||||||
|  |     const float sf2 = (float)dst->ne[2] / src0->ne[2]; | ||||||
|  |     const float sf3 = (float)dst->ne[3] / src0->ne[3]; | ||||||
|  |  | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),    &extra_src0->data_device)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong),  &off_src0)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),    &extra_dst->data_device)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong),  &off_dst)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong),  &nb00)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong),  &nb01)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong),  &nb02)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong),  &nb03)); | ||||||
|  |  | ||||||
|  |     if (mode == GGML_SCALE_MODE_NEAREST) { | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int),       &ne10_dst)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int),       &ne11_dst)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne12_dst)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne13_dst)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float),    &sf0)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 13, sizeof(float),    &sf1)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float),    &sf2)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float),    &sf3)); | ||||||
|  |     } else if (mode == GGML_SCALE_MODE_BILINEAR) { | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int),       &ne00_src)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int),       &ne01_src)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne10_dst)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne11_dst)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne12_dst)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne13_dst)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float),    &sf0)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float),    &sf1)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 16, sizeof(float),    &sf2)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 17, sizeof(float),    &sf3)); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     size_t dst_total_elements = (size_t)ne10_dst * ne11_dst * ne12_dst * ne13_dst; | ||||||
|  |     if (dst_total_elements == 0) { | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |     size_t global_work_size[] = { dst_total_elements, 1, 1 }; | ||||||
|  |     size_t local_work_size_pref = 256; | ||||||
|  |     size_t local_work_size[] = { MIN(local_work_size_pref, dst_total_elements), 1, 1}; | ||||||
|  |  | ||||||
|  |     size_t * local_work_size_ptr = local_work_size; | ||||||
|  |     if (dst_total_elements % local_work_size[0] != 0 && !backend_ctx->non_uniform_workgroups) { | ||||||
|  |         local_work_size_ptr = nullptr; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  | #ifdef GGML_OPENCL_PROFILING | ||||||
|  |     cl_event evt; | ||||||
|  |     CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); | ||||||
|  |  | ||||||
|  |     g_profiling_info.emplace_back(); | ||||||
|  |     size_t profiling_gws[3] = {global_work_size[0], 1, 1}; | ||||||
|  |     size_t profiling_lws[3] = {local_work_size_ptr ? local_work_size[0] : 0, 1, 1}; | ||||||
|  |     populateProfilingInfo(g_profiling_info.back(), evt, kernel, profiling_gws, profiling_lws, dst); | ||||||
|  | #else | ||||||
|  |     CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); | ||||||
|  | #endif | ||||||
|  | } | ||||||
|  |  | ||||||
|  | static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { | ||||||
|  |     GGML_ASSERT(src0); | ||||||
|  |     GGML_ASSERT(src0->extra); | ||||||
|  |     GGML_ASSERT(src1); | ||||||
|  |     GGML_ASSERT(src1->extra); | ||||||
|  |     GGML_ASSERT(dst); | ||||||
|  |     GGML_ASSERT(dst->extra); | ||||||
|  |     GGML_ASSERT(src0->type == GGML_TYPE_F32); | ||||||
|  |     GGML_ASSERT(src1->type == GGML_TYPE_F32); | ||||||
|  |     GGML_ASSERT(dst->type == GGML_TYPE_F32); | ||||||
|  |  | ||||||
|  |     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; | ||||||
|  |     cl_command_queue queue = backend_ctx->queue; | ||||||
|  |  | ||||||
|  |     if (backend_ctx->kernel_concat_f32_contiguous == nullptr || backend_ctx->kernel_concat_f32_non_contiguous == nullptr) { | ||||||
|  |         GGML_LOG_WARN("%s: concat kernels not available, skipping OpenCL execution.\n", __func__); | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     ggml_tensor_extra_cl * extra0_cl = (ggml_tensor_extra_cl *)src0->extra; | ||||||
|  |     ggml_tensor_extra_cl * extra1_cl = (ggml_tensor_extra_cl *)src1->extra; | ||||||
|  |     ggml_tensor_extra_cl * extrad_cl = (ggml_tensor_extra_cl *)dst->extra; | ||||||
|  |  | ||||||
|  |     cl_ulong off_src0 = extra0_cl->offset + src0->view_offs; | ||||||
|  |     cl_ulong off_src1 = extra1_cl->offset + src1->view_offs; | ||||||
|  |     cl_ulong off_dst  = extrad_cl->offset + dst->view_offs; | ||||||
|  |  | ||||||
|  |     const int32_t dim = ((const int32_t *) dst->op_params)[0]; | ||||||
|  |     GGML_ASSERT(dim >= 0 && dim <= 3); | ||||||
|  |  | ||||||
|  |     if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) { | ||||||
|  |         if (dim == 3) { | ||||||
|  |  | ||||||
|  |             size_t nbytes_src0 = ggml_nbytes(src0); | ||||||
|  |             size_t nbytes_src1 = ggml_nbytes(src1); | ||||||
|  |  | ||||||
|  |             CL_CHECK(clEnqueueCopyBuffer(queue, extra0_cl->data_device, extrad_cl->data_device, | ||||||
|  |                                          off_src0, off_dst, nbytes_src0, 0, NULL, NULL)); | ||||||
|  |             CL_CHECK(clEnqueueCopyBuffer(queue, extra1_cl->data_device, extrad_cl->data_device, | ||||||
|  |                                          off_src1, off_dst + nbytes_src0, nbytes_src1, 0, NULL, NULL)); | ||||||
|  |         } else { | ||||||
|  |  | ||||||
|  |             cl_kernel kernel = backend_ctx->kernel_concat_f32_contiguous; | ||||||
|  |             size_t global_work_size[3]; | ||||||
|  |  | ||||||
|  |             for (int i3 = 0; i3 < dst->ne[3]; ++i3) { | ||||||
|  |                 cl_ulong current_off_src0 = off_src0 + (i3 * src0->nb[3]); | ||||||
|  |                 cl_ulong current_off_src1 = off_src1 + (i3 * src1->nb[3]); | ||||||
|  |                 cl_ulong current_off_dst  = off_dst  + (i3 * dst->nb[3]); | ||||||
|  |  | ||||||
|  |                 int d_ne00 = src0->ne[0]; int d_ne01 = src0->ne[1]; int d_ne02 = src0->ne[2]; | ||||||
|  |                 int d_ne10 = src1->ne[0]; int d_ne11 = src1->ne[1]; int d_ne12 = src1->ne[2]; | ||||||
|  |                 int d_ne0  = dst->ne[0];  int d_ne1  = dst->ne[1];  int d_ne2  = dst->ne[2]; | ||||||
|  |  | ||||||
|  |                 CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),    &extra0_cl->data_device)); | ||||||
|  |                 CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong),  ¤t_off_src0)); | ||||||
|  |                 CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),    &extra1_cl->data_device)); | ||||||
|  |                 CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong),  ¤t_off_src1)); | ||||||
|  |                 CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem),    &extrad_cl->data_device)); | ||||||
|  |                 CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong),  ¤t_off_dst)); | ||||||
|  |                 CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),       &d_ne00)); | ||||||
|  |                 CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int),       &d_ne01)); | ||||||
|  |                 CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int),       &d_ne02)); | ||||||
|  |                 CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int),       &d_ne10)); | ||||||
|  |                 CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &d_ne11)); | ||||||
|  |                 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &d_ne12)); | ||||||
|  |                 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &d_ne0)); | ||||||
|  |                 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &d_ne1)); | ||||||
|  |                 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &d_ne2)); | ||||||
|  |                 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &dim)); | ||||||
|  |  | ||||||
|  |                 global_work_size[0] = d_ne0; | ||||||
|  |                 global_work_size[1] = d_ne1; | ||||||
|  |                 global_work_size[2] = d_ne2; | ||||||
|  |  | ||||||
|  |                 CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, NULL)); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } else { | ||||||
|  |         cl_kernel kernel = backend_ctx->kernel_concat_f32_non_contiguous; | ||||||
|  |  | ||||||
|  |         long ne00 = src0->ne[0], ne01 = src0->ne[1], ne02 = src0->ne[2], ne03 = src0->ne[3]; | ||||||
|  |         cl_ulong nb00 = src0->nb[0], nb01 = src0->nb[1], nb02 = src0->nb[2], nb03 = src0->nb[3]; | ||||||
|  |  | ||||||
|  |         cl_ulong nb10 = src1->nb[0], nb11 = src1->nb[1], nb12 = src1->nb[2], nb13 = src1->nb[3]; | ||||||
|  |  | ||||||
|  |         long d_ne0 = dst->ne[0], d_ne1 = dst->ne[1], d_ne2 = dst->ne[2], d_ne3 = dst->ne[3]; | ||||||
|  |         cl_ulong d_nb0 = dst->nb[0], d_nb1 = dst->nb[1], d_nb2 = dst->nb[2], d_nb3 = dst->nb[3]; | ||||||
|  |  | ||||||
|  |  | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),    &extra0_cl->data_device)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong),  &off_src0)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),    &extra1_cl->data_device)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong),  &off_src1)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem),    &extrad_cl->data_device)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong),  &off_dst)); | ||||||
|  |  | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 6, sizeof(long),      &ne00)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 7, sizeof(long),      &ne01)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 8, sizeof(long),      &ne02)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 9, sizeof(long),      &ne03)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),    &nb00)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),    &nb01)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong),    &nb02)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong),    &nb03)); | ||||||
|  |  | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong),    &nb10)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong),    &nb11)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),    &nb12)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),    &nb13)); | ||||||
|  |  | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 18, sizeof(long),     &d_ne0)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 19, sizeof(long),     &d_ne1)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 20, sizeof(long),     &d_ne2)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 21, sizeof(long),     &d_ne3)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong),    &d_nb0)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_ulong),    &d_nb1)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 24, sizeof(cl_ulong),    &d_nb2)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 25, sizeof(cl_ulong),    &d_nb3)); | ||||||
|  |         CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int),      &dim)); | ||||||
|  |  | ||||||
|  |         size_t global_work_size_nc[] = { d_ne1 > 0 ? (size_t)d_ne1 : 1, | ||||||
|  |                                          d_ne2 > 0 ? (size_t)d_ne2 : 1, | ||||||
|  |                                          d_ne3 > 0 ? (size_t)d_ne3 : 1 }; | ||||||
|  |  | ||||||
|  |         CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size_nc, NULL, 0, NULL, NULL)); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) { | ||||||
|  |     GGML_ASSERT(src0); | ||||||
|  |     GGML_ASSERT(src0->extra); | ||||||
|  |     GGML_ASSERT(dst); | ||||||
|  |     GGML_ASSERT(dst->extra); | ||||||
|  |     GGML_ASSERT(src0->type == GGML_TYPE_F32); | ||||||
|  |     GGML_ASSERT(dst->type == GGML_TYPE_F32); | ||||||
|  |  | ||||||
|  |     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; | ||||||
|  |     cl_command_queue queue = backend_ctx->queue; | ||||||
|  |  | ||||||
|  |     if (backend_ctx->kernel_timestep_embedding == nullptr) { | ||||||
|  |         GGML_LOG_WARN("%s: timestep_embedding kernel not available, skipping OpenCL execution.\n", __func__); | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra; | ||||||
|  |     ggml_tensor_extra_cl * extra_dst  = (ggml_tensor_extra_cl *)dst->extra; | ||||||
|  |  | ||||||
|  |     cl_ulong off_src0 = extra_src0->offset + src0->view_offs; | ||||||
|  |     cl_ulong off_dst  = extra_dst->offset  + dst->view_offs; | ||||||
|  |  | ||||||
|  |     const int logical_dim = dst->op_params[0]; | ||||||
|  |     const int max_period  = dst->op_params[1]; | ||||||
|  |     const int dst_nb1_bytes = dst->nb[1]; | ||||||
|  |  | ||||||
|  |     cl_kernel kernel = backend_ctx->kernel_timestep_embedding; | ||||||
|  |  | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),    &extra_src0->data_device)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong),  &off_src0)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),    &extra_dst->data_device)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong),  &off_dst)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int),       &dst_nb1_bytes)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int),       &logical_dim)); | ||||||
|  |     CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int),       &max_period)); | ||||||
|  |  | ||||||
|  |     size_t gws0 = (size_t)(((logical_dim + 1) / 2) + 1); | ||||||
|  |  | ||||||
|  |     size_t gws1 = (size_t)src0->ne[0]; | ||||||
|  |  | ||||||
|  |     size_t global_work_size[] = {gws0, gws1, 1}; | ||||||
|  |  | ||||||
|  | #ifdef GGML_OPENCL_PROFILING | ||||||
|  |     cl_event evt; | ||||||
|  |     CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, &evt)); // Pass 2 for 2D problem | ||||||
|  |  | ||||||
|  |     g_profiling_info.emplace_back(); | ||||||
|  |     size_t profiling_gws[3] = {global_work_size[0], global_work_size[1], 1}; | ||||||
|  |     size_t profiling_lws[3] = {0,0,0}; // Reflects NULL LWS | ||||||
|  |     populateProfilingInfo(g_profiling_info.back(), evt, kernel, profiling_gws, profiling_lws, dst); | ||||||
|  | #else | ||||||
|  |     CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL)); // Pass 2 for 2D problem | ||||||
|  | #endif | ||||||
|  | } | ||||||
|  |  | ||||||
| static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { | static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { | ||||||
|     GGML_ASSERT(src0); |     GGML_ASSERT(src0); | ||||||
|     GGML_ASSERT(src0->extra); |     GGML_ASSERT(src0->extra); | ||||||
| @@ -5667,6 +6370,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor | |||||||
|                     } |                     } | ||||||
|                     func = ggml_cl_sigmoid; |                     func = ggml_cl_sigmoid; | ||||||
|                     break; |                     break; | ||||||
|  |                 case GGML_UNARY_OP_TANH: | ||||||
|  |                     if (!any_on_device) { | ||||||
|  |                         return false; | ||||||
|  |                     } | ||||||
|  |                     func = ggml_cl_tanh; | ||||||
|  |                     break; | ||||||
|                 default: |                 default: | ||||||
|                     return false; |                     return false; | ||||||
|             } break; |             } break; | ||||||
| @@ -5694,6 +6403,36 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor | |||||||
|             } |             } | ||||||
|             func = ggml_cl_group_norm; |             func = ggml_cl_group_norm; | ||||||
|             break; |             break; | ||||||
|  |                 case GGML_OP_REPEAT: | ||||||
|  |              if (!any_on_device) { | ||||||
|  |                 return false; | ||||||
|  |             } | ||||||
|  |             func = ggml_cl_repeat; | ||||||
|  |             break; | ||||||
|  |         case GGML_OP_PAD: | ||||||
|  |             if (!any_on_device) { | ||||||
|  |                 return false; | ||||||
|  |             } | ||||||
|  |             ggml_cl_pad(backend, tensor->src[0], tensor); | ||||||
|  |             return true; | ||||||
|  |         case GGML_OP_UPSCALE: | ||||||
|  |             if (!any_on_device) { | ||||||
|  |                 return false; | ||||||
|  |             } | ||||||
|  |             ggml_cl_upscale(backend, tensor->src[0], tensor); | ||||||
|  |             return true; | ||||||
|  |         case GGML_OP_CONCAT: | ||||||
|  |             if (!any_on_device) { | ||||||
|  |                 return false; | ||||||
|  |             } | ||||||
|  |             func = ggml_cl_concat; | ||||||
|  |             break; | ||||||
|  |         case GGML_OP_TIMESTEP_EMBEDDING: | ||||||
|  |             if (!any_on_device) { | ||||||
|  |                 return false; | ||||||
|  |             } | ||||||
|  |             ggml_cl_timestep_embedding(backend, tensor->src[0], tensor); | ||||||
|  |             return true; | ||||||
|         case GGML_OP_MUL_MAT: |         case GGML_OP_MUL_MAT: | ||||||
|             if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) { |             if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) { | ||||||
|                 return false; |                 return false; | ||||||
|   | |||||||
							
								
								
									
										109
									
								
								ggml/src/ggml-opencl/kernels/concat.cl
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										109
									
								
								ggml/src/ggml-opencl/kernels/concat.cl
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,109 @@ | |||||||
|  | kernel void kernel_concat_f32_contiguous( | ||||||
|  |     global const char * p_src0, ulong off_src0, | ||||||
|  |     global const char * p_src1, ulong off_src1, | ||||||
|  |     global char * p_dst, ulong off_dst, | ||||||
|  |     int d_ne00, int d_ne01, int d_ne02, // src0->ne[0..2] for the slice | ||||||
|  |     int d_ne10, int d_ne11, int d_ne12, // src1->ne[0..2] for the slice (d_ne1X must match d_ne0X on non-concat axes) | ||||||
|  |     int d_ne0,  int d_ne1,  int d_ne2,  // dst->ne[0..2] for the slice | ||||||
|  |     int dim | ||||||
|  | ) { | ||||||
|  |     global const float * src0 = (global const float*)((global char*)p_src0 + off_src0); | ||||||
|  |     global const float * src1 = (global const float*)((global char*)p_src1 + off_src1); | ||||||
|  |     global float * dst        = (global float*)((global char*)p_dst + off_dst); | ||||||
|  |  | ||||||
|  |     int i0 = get_global_id(0); // Index along dst's 0th dimension | ||||||
|  |     int i1 = get_global_id(1); // Index along dst's 1st dimension | ||||||
|  |     int i2 = get_global_id(2); // Index along dst's 2nd dimension | ||||||
|  |  | ||||||
|  |     if (i0 >= d_ne0 || i1 >= d_ne1 || i2 >= d_ne2) { | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     ulong dst_idx = (ulong)i2 * d_ne0 * d_ne1 + (ulong)i1 * d_ne0 + i0; | ||||||
|  |     ulong src_idx; | ||||||
|  |  | ||||||
|  |     if (dim == 0) { | ||||||
|  |         if (i0 < d_ne00) { // Data from src0 | ||||||
|  |             src_idx = (ulong)i2 * d_ne00 * d_ne01 + (ulong)i1 * d_ne00 + i0; | ||||||
|  |             dst[dst_idx] = src0[src_idx]; | ||||||
|  |         } else { // Data from src1 | ||||||
|  |             src_idx = (ulong)i2 * d_ne10 * d_ne11 + (ulong)i1 * d_ne10 + (i0 - d_ne00); | ||||||
|  |             dst[dst_idx] = src1[src_idx]; | ||||||
|  |         } | ||||||
|  |     } else if (dim == 1) { | ||||||
|  |         if (i1 < d_ne01) { // Data from src0 | ||||||
|  |             src_idx = (ulong)i2 * d_ne00 * d_ne01 + (ulong)i1 * d_ne00 + i0; | ||||||
|  |             dst[dst_idx] = src0[src_idx]; | ||||||
|  |         } else { // Data from src1 | ||||||
|  |             src_idx = (ulong)i2 * d_ne10 * d_ne11 + (ulong)(i1 - d_ne01) * d_ne10 + i0; | ||||||
|  |             dst[dst_idx] = src1[src_idx]; | ||||||
|  |         } | ||||||
|  |     } else if (dim == 2) { | ||||||
|  |         if (i2 < d_ne02) { // Data from src0 | ||||||
|  |             src_idx = (ulong)i2 * d_ne00 * d_ne01 + (ulong)i1 * d_ne00 + i0; | ||||||
|  |             dst[dst_idx] = src0[src_idx]; | ||||||
|  |         } else { // Data from src1 | ||||||
|  |  | ||||||
|  |             src_idx = (ulong)(i2 - d_ne02) * d_ne10 * d_ne11 + (ulong)i1 * d_ne10 + i0; | ||||||
|  |             dst[dst_idx] = src1[src_idx]; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | kernel void kernel_concat_f32_non_contiguous( | ||||||
|  |     global const char * p_src0, ulong off_src0, | ||||||
|  |     global const char * p_src1, ulong off_src1, | ||||||
|  |     global char * p_dst, ulong off_dst, | ||||||
|  |  | ||||||
|  |     long ne00, long ne01, long ne02, long ne03, | ||||||
|  |     ulong nb00, ulong nb01, ulong nb02, ulong nb03, | ||||||
|  |  | ||||||
|  |     ulong nb10, ulong nb11, ulong nb12, ulong nb13, // Strides for src1 | ||||||
|  |  | ||||||
|  |     long d_ne0, long d_ne1, long d_ne2, long d_ne3, | ||||||
|  |     ulong d_nb0, ulong d_nb1, ulong d_nb2, ulong d_nb3, | ||||||
|  |     int dim | ||||||
|  | ) { | ||||||
|  |     global const char * src0_base = p_src0 + off_src0; | ||||||
|  |     global const char * src1_base = p_src1 + off_src1; | ||||||
|  |     global char * dst_base        = p_dst + off_dst; | ||||||
|  |  | ||||||
|  |     long current_i1 = get_global_id(0); // Index for dst_dim_1 | ||||||
|  |     long current_i2 = get_global_id(1); // Index for dst_dim_2 | ||||||
|  |     long current_i3 = get_global_id(2); // Index for dst_dim_3 | ||||||
|  |  | ||||||
|  |     if (current_i1 >= d_ne1 || current_i2 >= d_ne2 || current_i3 >= d_ne3) { | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     global const float * x_val_ptr; | ||||||
|  |     global float * y_val_ptr; | ||||||
|  |  | ||||||
|  |     for (long current_i0 = 0; current_i0 < d_ne0; ++current_i0) { | ||||||
|  |         bool use_src0; | ||||||
|  |         long s_i0 = current_i0, s_i1 = current_i1, s_i2 = current_i2, s_i3 = current_i3; | ||||||
|  |  | ||||||
|  |         if (dim == 0) { | ||||||
|  |             use_src0 = (current_i0 < ne00); | ||||||
|  |             if (!use_src0) { s_i0 = current_i0 - ne00; } | ||||||
|  |         } else if (dim == 1) { | ||||||
|  |             use_src0 = (current_i1 < ne01); | ||||||
|  |             if (!use_src0) { s_i1 = current_i1 - ne01; } | ||||||
|  |         } else if (dim == 2) { | ||||||
|  |             use_src0 = (current_i2 < ne02); | ||||||
|  |             if (!use_src0) { s_i2 = current_i2 - ne02; } | ||||||
|  |         } else { // dim == 3 | ||||||
|  |             use_src0 = (current_i3 < ne03); | ||||||
|  |             if (!use_src0) { s_i3 = current_i3 - ne03; } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         if (use_src0) { | ||||||
|  |             x_val_ptr = (global const float *)(src0_base + (ulong)s_i3*nb03 + (ulong)s_i2*nb02 + (ulong)s_i1*nb01 + (ulong)s_i0*nb00); | ||||||
|  |         } else { | ||||||
|  |             x_val_ptr = (global const float *)(src1_base + (ulong)s_i3*nb13 + (ulong)s_i2*nb12 + (ulong)s_i1*nb11 + (ulong)s_i0*nb10); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         y_val_ptr = (global float *)(dst_base + (ulong)current_i3*d_nb3 + (ulong)current_i2*d_nb2 + (ulong)current_i1*d_nb1 + (ulong)current_i0*d_nb0); | ||||||
|  |         *y_val_ptr = *x_val_ptr; | ||||||
|  |     } | ||||||
|  | } | ||||||
							
								
								
									
										30
									
								
								ggml/src/ggml-opencl/kernels/pad.cl
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										30
									
								
								ggml/src/ggml-opencl/kernels/pad.cl
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,30 @@ | |||||||
|  | kernel void kernel_pad( | ||||||
|  |         global const void * src0_ptr, | ||||||
|  |         ulong src0_offset, | ||||||
|  |         global void * dst_ptr, | ||||||
|  |         ulong dst_offset, | ||||||
|  |         int s_ne0, int s_ne1, int s_ne2, | ||||||
|  |         int d_ne0, int d_ne1, int d_ne2 | ||||||
|  | ) { | ||||||
|  |     global const float * src0 = (global const float *)((global const char *)src0_ptr + src0_offset); | ||||||
|  |     global float * dst = (global float *)((global char *)dst_ptr + dst_offset); | ||||||
|  |  | ||||||
|  |     int nidx   = get_global_id(0); | ||||||
|  |     int idx_d1 = get_group_id(1); | ||||||
|  |     int idx_d2 = get_group_id(2); | ||||||
|  |  | ||||||
|  |     if (nidx >= d_ne0) { | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     int dst_el_offset = nidx + idx_d1 * d_ne0 + idx_d2 * d_ne0 * d_ne1; | ||||||
|  |  | ||||||
|  |     bool in_src_bounds = (nidx < s_ne0) && (idx_d1 < s_ne1) && (idx_d2 < s_ne2); | ||||||
|  |  | ||||||
|  |     if (in_src_bounds) { | ||||||
|  |         int src_el_offset = nidx + idx_d1 * s_ne0 + idx_d2 * s_ne0 * s_ne1; | ||||||
|  |         dst[dst_el_offset] = src0[src_el_offset]; | ||||||
|  |     } else { | ||||||
|  |         dst[dst_el_offset] = 0.0f; | ||||||
|  |     } | ||||||
|  | } | ||||||
							
								
								
									
										39
									
								
								ggml/src/ggml-opencl/kernels/repeat.cl
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										39
									
								
								ggml/src/ggml-opencl/kernels/repeat.cl
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,39 @@ | |||||||
|  | kernel void kernel_repeat( | ||||||
|  |     global const char * src0_data_in, | ||||||
|  |     global       char * dst_data_in, | ||||||
|  |     ulong src0_offset, | ||||||
|  |     ulong dst_offset, | ||||||
|  |     int src0_ne0, int src0_ne1, int src0_ne2, int src0_ne3, | ||||||
|  |     ulong src0_nb0, ulong src0_nb1, ulong src0_nb2, ulong src0_nb3, | ||||||
|  |     int dst_ne0, int dst_ne1, int dst_ne2, int dst_ne3, | ||||||
|  |     ulong dst_nb0, ulong dst_nb1, ulong dst_nb2, ulong dst_nb3 | ||||||
|  | ) { | ||||||
|  |     global const char * src0_data = src0_data_in + src0_offset; | ||||||
|  |     global       char * dst_data  = dst_data_in + dst_offset; | ||||||
|  |  | ||||||
|  |     const int d3 = get_global_id(2); | ||||||
|  |     const int d2 = get_global_id(1); | ||||||
|  |     const int d1 = get_global_id(0); | ||||||
|  |  | ||||||
|  |     if (d3 >= dst_ne3 || d2 >= dst_ne2 || d1 >= dst_ne1) { | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     const int s3 = d3 % src0_ne3; | ||||||
|  |     const int s2 = d2 % src0_ne2; | ||||||
|  |     const int s1 = d1 % src0_ne1; | ||||||
|  |  | ||||||
|  |     const global char * p_src0_slice = src0_data + (ulong)s3*src0_nb3 + (ulong)s2*src0_nb2 + (ulong)s1*src0_nb1; | ||||||
|  |     global char * p_dst_slice  = dst_data  + (ulong)d3*dst_nb3 + (ulong)d2*dst_nb2 + (ulong)d1*dst_nb1; | ||||||
|  |  | ||||||
|  |     for (int d0 = 0; d0 < dst_ne0; ++d0) { | ||||||
|  |         // Determine source index for dimension 0 based on tiling/broadcasting. | ||||||
|  |         const int s0 = d0 % src0_ne0; | ||||||
|  |  | ||||||
|  |         const global char * restrict current_src_el_ptr = p_src0_slice + (ulong)s0*src0_nb0; | ||||||
|  |         global char * restrict current_dst_el_ptr  = p_dst_slice  + (ulong)d0*dst_nb0; | ||||||
|  |         for (int k = 0; k < src0_nb0; ++k) { | ||||||
|  |             current_dst_el_ptr[k] = current_src_el_ptr[k]; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
							
								
								
									
										63
									
								
								ggml/src/ggml-opencl/kernels/tanh.cl
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										63
									
								
								ggml/src/ggml-opencl/kernels/tanh.cl
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,63 @@ | |||||||
|  | #pragma OPENCL EXTENSION cl_khr_fp16 : enable | ||||||
|  |  | ||||||
|  | #ifdef cl_intel_required_subgroup_size | ||||||
|  | #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable | ||||||
|  | #define INTEL_GPU 1 | ||||||
|  | #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16))) | ||||||
|  | #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32))) | ||||||
|  | #elif defined(cl_qcom_reqd_sub_group_size) | ||||||
|  | #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable | ||||||
|  | #define ADRENO_GPU 1 | ||||||
|  | #define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half"))) | ||||||
|  | #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full"))) | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | kernel void kernel_tanh_f32_nd( | ||||||
|  |     global void * p_src0_base, ulong off_src0_abs, | ||||||
|  |     global void * p_dst_base,  ulong off_dst_abs, | ||||||
|  |     int ne00, int ne01, int ne02, int ne03, | ||||||
|  |     ulong nb00, ulong nb01, ulong nb02, ulong nb03, | ||||||
|  |     int ne10, int ne11, int ne12, int ne13, | ||||||
|  |     ulong nb10, ulong nb11, ulong nb12, ulong nb13 | ||||||
|  | ) { | ||||||
|  |     int i0 = get_global_id(0); | ||||||
|  |     int i1 = get_global_id(1); | ||||||
|  |     int i2 = get_global_id(2); | ||||||
|  |  | ||||||
|  |     if (i0 < ne10 && i1 < ne11 && i2 < ne12) { | ||||||
|  |         for (int i3 = 0; i3 < ne13; ++i3) { | ||||||
|  |             ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03; | ||||||
|  |             global const float *src_val_ptr = (global const float *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor); | ||||||
|  |  | ||||||
|  |             ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13; | ||||||
|  |             global float *dst_val_ptr = (global float *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor); | ||||||
|  |  | ||||||
|  |             *dst_val_ptr = tanh(*src_val_ptr); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | kernel void kernel_tanh_f16_nd( | ||||||
|  |     global void * p_src0_base, ulong off_src0_abs, | ||||||
|  |     global void * p_dst_base,  ulong off_dst_abs, | ||||||
|  |     int ne00, int ne01, int ne02, int ne03, | ||||||
|  |     ulong nb00, ulong nb01, ulong nb02, ulong nb03, | ||||||
|  |     int ne10, int ne11, int ne12, int ne13, | ||||||
|  |     ulong nb10, ulong nb11, ulong nb12, ulong nb13 | ||||||
|  | ) { | ||||||
|  |     int i0 = get_global_id(0); | ||||||
|  |     int i1 = get_global_id(1); | ||||||
|  |     int i2 = get_global_id(2); | ||||||
|  |  | ||||||
|  |     if (i0 < ne10 && i1 < ne11 && i2 < ne12) { | ||||||
|  |         for (int i3 = 0; i3 < ne13; ++i3) { | ||||||
|  |             ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03; | ||||||
|  |             global const half *src_val_ptr = (global const half *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor); | ||||||
|  |  | ||||||
|  |             ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13; | ||||||
|  |             global half *dst_val_ptr = (global half *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor); | ||||||
|  |  | ||||||
|  |             *dst_val_ptr = tanh(*src_val_ptr); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
							
								
								
									
										48
									
								
								ggml/src/ggml-opencl/kernels/tsembd.cl
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										48
									
								
								ggml/src/ggml-opencl/kernels/tsembd.cl
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,48 @@ | |||||||
|  | kernel void kernel_timestep_embedding( | ||||||
|  |     global const void * p_timesteps, | ||||||
|  |     ulong off_timesteps, | ||||||
|  |     global void * p_dst, | ||||||
|  |     ulong off_dst, | ||||||
|  |     int dst_nb1_bytes, | ||||||
|  |     int logical_dim, | ||||||
|  |     int max_period | ||||||
|  | ) { | ||||||
|  |     int local_i; | ||||||
|  |     int local_j; | ||||||
|  |     int local_half_dim; | ||||||
|  |     float local_timestep_val; | ||||||
|  |     float local_freq; | ||||||
|  |     float local_arg; | ||||||
|  |     global float * local_embed_data_ptr; | ||||||
|  |     global const float * local_timesteps_input_ptr; | ||||||
|  |     global float * local_dst_output_base_ptr; | ||||||
|  |  | ||||||
|  |     local_timesteps_input_ptr = (global const float *)((global char *)p_timesteps + off_timesteps); | ||||||
|  |     local_dst_output_base_ptr = (global float *)((global char *)p_dst + off_dst); | ||||||
|  |  | ||||||
|  |     local_i = get_global_id(1); | ||||||
|  |     local_j = get_global_id(0); | ||||||
|  |  | ||||||
|  |     local_half_dim = logical_dim / 2; | ||||||
|  |     local_embed_data_ptr = (global float *)((global char *)local_dst_output_base_ptr + local_i * dst_nb1_bytes); | ||||||
|  |  | ||||||
|  |     if (logical_dim % 2 != 0 && local_j == ((logical_dim + 1) / 2)) { | ||||||
|  |         local_embed_data_ptr[logical_dim] = 0.0f; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     if (local_j >= local_half_dim) { | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     local_timestep_val = local_timesteps_input_ptr[local_i]; | ||||||
|  |  | ||||||
|  |     if (local_half_dim == 0) { | ||||||
|  |         local_freq = 1.0f; | ||||||
|  |     } else { | ||||||
|  |         local_freq = exp(-log((float)max_period) * (float)local_j / (float)local_half_dim); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     local_arg = local_timestep_val * local_freq; | ||||||
|  |     local_embed_data_ptr[local_j] = cos(local_arg); | ||||||
|  |     local_embed_data_ptr[local_j + local_half_dim] = sin(local_arg); | ||||||
|  | } | ||||||
							
								
								
									
										121
									
								
								ggml/src/ggml-opencl/kernels/upscale.cl
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										121
									
								
								ggml/src/ggml-opencl/kernels/upscale.cl
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,121 @@ | |||||||
|  | kernel void kernel_upscale( | ||||||
|  |     global const void * p_src0, | ||||||
|  |     ulong off_src0, | ||||||
|  |     global void * p_dst, | ||||||
|  |     ulong off_dst, | ||||||
|  |     ulong nb00, | ||||||
|  |     ulong nb01, | ||||||
|  |     ulong nb02, | ||||||
|  |     ulong nb03, | ||||||
|  |     int ne10, | ||||||
|  |     int ne11, | ||||||
|  |     int ne12, | ||||||
|  |     int ne13, | ||||||
|  |     float sf0, | ||||||
|  |     float sf1, | ||||||
|  |     float sf2, | ||||||
|  |     float sf3 | ||||||
|  | ) { | ||||||
|  |     global const char * src_base = (global const char *)p_src0 + off_src0; | ||||||
|  |     global float * dst_base = (global float *)((global char *)p_dst + off_dst); | ||||||
|  |  | ||||||
|  |     int index = get_global_id(0); | ||||||
|  |     int dst_total_elements = ne10 * ne11 * ne12 * ne13; | ||||||
|  |  | ||||||
|  |     if (index >= dst_total_elements) { | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     int i10 = index % ne10; | ||||||
|  |     int i11 = (index / ne10) % ne11; | ||||||
|  |     int i12 = (index / (ne10 * ne11)) % ne12; | ||||||
|  |     int i13 = index / (ne10 * ne11 * ne12); | ||||||
|  |  | ||||||
|  |     int i00 = (int)(i10 / sf0); | ||||||
|  |     int i01 = (int)(i11 / sf1); | ||||||
|  |     int i02 = (int)(i12 / sf2); | ||||||
|  |     int i03 = (int)(i13 / sf3); | ||||||
|  |  | ||||||
|  |     ulong offset_src_element = (ulong)i03 * nb03 + (ulong)i02 * nb02 + (ulong)i01 * nb01 + (ulong)i00 * nb00; | ||||||
|  |     global const float * src_element_ptr = (global const float *)(src_base + offset_src_element); | ||||||
|  |  | ||||||
|  |     dst_base[index] = *src_element_ptr; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | kernel void kernel_upscale_bilinear( | ||||||
|  |     global const void * p_src0, | ||||||
|  |     ulong off_src0, | ||||||
|  |     global void * p_dst, | ||||||
|  |     ulong off_dst, | ||||||
|  |     ulong nb00, | ||||||
|  |     ulong nb01, | ||||||
|  |     ulong nb02, | ||||||
|  |     ulong nb03, | ||||||
|  |     int ne00_src, | ||||||
|  |     int ne01_src, | ||||||
|  |     int ne10_dst, | ||||||
|  |     int ne11_dst, | ||||||
|  |     int ne12_dst, | ||||||
|  |     int ne13_dst, | ||||||
|  |     float sf0, | ||||||
|  |     float sf1, | ||||||
|  |     float sf2, | ||||||
|  |     float sf3 | ||||||
|  | ) { | ||||||
|  |     global const char * src_base = (global const char *)p_src0 + off_src0; | ||||||
|  |     global float * dst_base = (global float *)((global char *)p_dst + off_dst); | ||||||
|  |  | ||||||
|  |     int index = get_global_id(0); | ||||||
|  |     int dst_total_elements = ne10_dst * ne11_dst * ne12_dst * ne13_dst; | ||||||
|  |  | ||||||
|  |     if (index >= dst_total_elements) { | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     int i10_dst = index % ne10_dst; | ||||||
|  |     int i11_dst = (index / ne10_dst) % ne11_dst; | ||||||
|  |     int i12_dst = (index / (ne10_dst * ne11_dst)) % ne12_dst; | ||||||
|  |     int i13_dst = index / (ne10_dst * ne11_dst * ne12_dst); | ||||||
|  |  | ||||||
|  |     int i02_src = (int)(i12_dst / sf2); | ||||||
|  |     int i03_src = (int)(i13_dst / sf3); | ||||||
|  |  | ||||||
|  |     const float pixel_offset = 0.5f; | ||||||
|  |  | ||||||
|  |     float y_src_f = ((float)i11_dst + pixel_offset) / sf1 - pixel_offset; | ||||||
|  |     long y0_src = (long)floor(y_src_f); | ||||||
|  |     long y1_src = y0_src + 1; | ||||||
|  |  | ||||||
|  |     y0_src = max(0L, min(y0_src, (long)ne01_src - 1)); | ||||||
|  |     y1_src = max(0L, min(y1_src, (long)ne01_src - 1)); | ||||||
|  |  | ||||||
|  |     float dy = y_src_f - (float)y0_src; | ||||||
|  |     dy = max(0.0f, min(dy, 1.0f)); | ||||||
|  |  | ||||||
|  |     float x_src_f = ((float)i10_dst + pixel_offset) / sf0 - pixel_offset; | ||||||
|  |     long x0_src = (long)floor(x_src_f); | ||||||
|  |     long x1_src = x0_src + 1; | ||||||
|  |  | ||||||
|  |     x0_src = max(0L, min(x0_src, (long)ne00_src - 1)); | ||||||
|  |     x1_src = max(0L, min(x1_src, (long)ne00_src - 1)); | ||||||
|  |  | ||||||
|  |     float dx = x_src_f - (float)x0_src; | ||||||
|  |     dx = max(0.0f, min(dx, 1.0f)); | ||||||
|  |  | ||||||
|  |     global const float * p_a = (global const float *)(src_base + (ulong)x0_src * nb00 + (ulong)y0_src * nb01 + (ulong)i02_src * nb02 + (ulong)i03_src * nb03); | ||||||
|  |     global const float * p_b = (global const float *)(src_base + (ulong)x1_src * nb00 + (ulong)y0_src * nb01 + (ulong)i02_src * nb02 + (ulong)i03_src * nb03); | ||||||
|  |     global const float * p_c = (global const float *)(src_base + (ulong)x0_src * nb00 + (ulong)y1_src * nb01 + (ulong)i02_src * nb02 + (ulong)i03_src * nb03); | ||||||
|  |     global const float * p_d = (global const float *)(src_base + (ulong)x1_src * nb00 + (ulong)y1_src * nb01 + (ulong)i02_src * nb02 + (ulong)i03_src * nb03); | ||||||
|  |  | ||||||
|  |     const float val_a = *p_a; | ||||||
|  |     const float val_b = *p_b; | ||||||
|  |     const float val_c = *p_c; | ||||||
|  |     const float val_d = *p_d; | ||||||
|  |  | ||||||
|  |     float result = val_a * (1.0f - dx) * (1.0f - dy) + | ||||||
|  |                    val_b * dx * (1.0f - dy) + | ||||||
|  |                    val_c * (1.0f - dx) * dy + | ||||||
|  |                    val_d * dx * dy; | ||||||
|  |  | ||||||
|  |     dst_base[index] = result; | ||||||
|  | } | ||||||
		Reference in New Issue
	
	Block a user
	 rmatif
					rmatif