	SYCL: Add all missing unary kernels (#13074)
* SYCL: Add all missing unary kernels (ggml-ci)
* decouple kernel launch range from data size using a strided loop
* use the ceil_div helper for num_blocks (ggml-ci)
* clean up auto-imported header files
ggml/src/ggml-sycl/common.hpp
@@ -493,5 +493,9 @@ static __dpct_inline__ Tp* get_pointer(sycl::local_accessor<Tp, dim> acc) {
 
 int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size);
 
+constexpr size_t ceil_div(const size_t m, const size_t n) {
+    return (m + n - 1) / n;
+}
+
 bool gpu_has_xmx(sycl::device &dev);
 #endif // GGML_SYCL_COMMON_HPP
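The new ceil_div helper is a standard rounded-up integer division; it is what lets the launchers in element_wise.cpp below size their launches so that every element is covered even when k is not a multiple of the work-group size. A quick host-side check (plain C++, values chosen for illustration only):

    #include <cassert>
    #include <cstddef>

    constexpr size_t ceil_div(const size_t m, const size_t n) {
        return (m + n - 1) / n;
    }

    int main() {
        assert(ceil_div(1000, 256) == 4); // 3 full groups plus 1 for the 232-element tail
        assert(ceil_div(1024, 256) == 4); // exact multiple: no extra group
        assert(ceil_div(1, 256)    == 1); // never rounds down to zero for m > 0
    }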
ggml/src/ggml-sycl/element_wise.cpp
@@ -21,6 +21,27 @@ static void acc_f32(const float * x, const float * y, float * dst, const int ne,
     }
 }
 
+template<typename T>
+static void sgn(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
+    for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) {
+        dst[i] = x[i] > static_cast<T>(0.f) ? static_cast<T>(1.f) : ((x[i] < static_cast<T>(0.f) ? static_cast<T>(-1.f) : static_cast<T>(0.f)));
+    }
+}
+
+template<typename T>
+static void abs_op(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
+    for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) {
+        dst[i] = sycl::fabs(x[i]);
+    }
+}
+
+template<typename T>
+static void elu_op(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
+    for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) {
+        dst[i] = (x[i] > static_cast<T>(0.f)) ? x[i] : sycl::expm1(x[i]);
+    }
+}
+
 template<typename T>
 static void gelu(const T * x, T * dst, const int k,
                      const sycl::nd_item<3> &item_ct1) {
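For reference, the element-wise math the three new kernels implement: sgn(x) is -1/0/+1 by sign, abs(x) is |x|, and elu(x) is ELU with alpha = 1, i.e. x for x > 0 and exp(x) - 1 otherwise. A minimal host-side sketch of the same formulas, useful as a CPU reference when testing (plain C++, not part of the diff):

    #include <cmath>
    #include <cstdio>

    static float sgn_ref(float x) { return x > 0.f ? 1.f : (x < 0.f ? -1.f : 0.f); }
    static float abs_ref(float x) { return std::fabs(x); }
    // expm1(x) == exp(x) - 1, but accurate for small |x|, matching sycl::expm1
    static float elu_ref(float x) { return x > 0.f ? x : std::expm1(x); }

    int main() {
        for (float x : { -2.0f, -1e-7f, 0.0f, 0.5f }) {
            std::printf("x = % .3g  sgn = % .0f  abs = % .3g  elu = % .7g\n",
                        x, sgn_ref(x), abs_ref(x), elu_ref(x));
        }
    }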
@@ -335,6 +356,37 @@ static void silu_sycl(const T *x, T *dst, const int k,
         });
 }
 
+template<typename T>
+static void sgn_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
+    // hard code for now
+    const int num_blocks = ceil_div(k, 256);
+    stream->parallel_for(
+            sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range(1, 1, 256)), sycl::range(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) {
+            sgn(x, dst, k, item_ct1);
+            });
+}
+
+template<typename T>
+static void abs_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
+    // hard code for now
+    const int num_blocks = ceil_div(k, 256);
+    stream->parallel_for(
+            sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) {
+            abs_op(x, dst, k, item_ct1);
+            });
+}
+
+
+template<typename T>
+static void elu_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
+    // hard code for now
+    const int num_blocks = ceil_div(k, 256);
+    stream->parallel_for(
+            sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) {
+            elu_op(x, dst, k, item_ct1);
+            });
+}
+
 template<typename T>
 static void gelu_quick_sycl(const T *x, T *dst, const int k,
                                 queue_ptr stream) {
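Each launcher above builds a 3-D nd_range with a hard-coded local size of 256 (per the in-code comment) and ceil_div(k, 256) work-groups, while the kernels iterate with a strided loop, so correctness does not depend on the launch size matching k. A self-contained sketch of the same pattern, reduced to a 1-D range with illustrative values (my own example, not the diff's code):

    #include <sycl/sycl.hpp>
    #include <cstdio>
    #include <vector>

    int main() {
        const int k  = 1000;   // deliberately not a multiple of the group size
        const int wg = 256;
        std::vector<float> h(k, -2.0f);

        sycl::queue q;
        float *d = sycl::malloc_device<float>(k, q);
        q.copy(h.data(), d, k).wait();

        const int num_blocks = (k + wg - 1) / wg; // ceil_div by hand
        q.parallel_for(sycl::nd_range<1>(num_blocks * wg, wg),
                       [=](sycl::nd_item<1> it) {
            // strided loop: surplus work-items fail the bound check and exit;
            // if the launch were smaller than k, items would loop more than once
            for (auto i = it.get_global_id(0); i < (size_t) k;
                 i += it.get_global_range(0)) {
                d[i] = sycl::fabs(d[i]); // same body as abs_op above
            }
        }).wait();

        q.copy(d, h.data(), k).wait();
        std::printf("h[0] = %g, h[%d] = %g\n", h[0], k - 1, h[k - 1]); // 2, 2
        sycl::free(d, q);
    }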
@@ -574,6 +626,106 @@ static void clamp_sycl(const T *x, T *dst, const float min,
         });
 }
 
+inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+#if defined (GGML_SYCL_F16)
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+
+#else
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+#endif
+    GGML_ASSERT(dst->src[0]->type == dst->type);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    switch (dst->type) {
+#if defined (GGML_SYCL_F16)
+        case GGML_TYPE_F16:
+            {
+                auto data_pts = cast_data<sycl::half>(dst);
+                sgn_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+#endif
+        case GGML_TYPE_F32:
+            {
+                auto data_pts = cast_data<float>(dst);
+                sgn_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        default:
+            GGML_ABORT("GGML tensor type not supported!\n");
+            break;
+    }
+}
+
+inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+#if defined (GGML_SYCL_F16)
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+
+#else
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+#endif
+    GGML_ASSERT(dst->src[0]->type == dst->type);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    switch (dst->type) {
+#if defined (GGML_SYCL_F16)
+        case GGML_TYPE_F16:
+            {
+                auto data_pts = cast_data<sycl::half>(dst);
+                abs_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+#endif
+        case GGML_TYPE_F32:
+            {
+                auto data_pts = cast_data<float>(dst);
+                abs_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        default:
+            GGML_ABORT("GGML tensor type not supported!\n");
+            break;
+    }
+}
+
+
+inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+#if defined (GGML_SYCL_F16)
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+
+#else
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+#endif
+    GGML_ASSERT(dst->src[0]->type == dst->type);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    switch (dst->type) {
+#if defined (GGML_SYCL_F16)
+        case GGML_TYPE_F16:
+            {
+                auto data_pts = cast_data<sycl::half>(dst);
+                elu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+#endif
+        case GGML_TYPE_F32:
+            {
+                auto data_pts = cast_data<float>(dst);
+                elu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
+                break;
+            }
+        default:
+            GGML_ABORT("GGML tensor type not supported!\n");
+            break;
+    }
+}
+
 inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
 #if defined (GGML_SYCL_F16)
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
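All three dispatchers above share one gating pattern: with GGML_SYCL_F16 defined the op accepts F32 or F16 (and src/dst types must match); without it, only F32. A condensed illustration of the accepted combinations, using hypothetical stand-ins rather than the real ggml types:

    #include <cstdio>

    enum class Type { F32, F16 }; // hypothetical stand-in for ggml_type

    // True when a (src, dst) pair would pass the dispatchers' GGML_ASSERTs.
    static bool types_supported(Type src, Type dst, bool f16_enabled) {
        if (src != dst) return false;       // src and dst must match
        if (dst == Type::F32) return true;  // F32 is always allowed
        return f16_enabled;                 // F16 only behind GGML_SYCL_F16
    }

    int main() {
        std::printf("%d\n", types_supported(Type::F32, Type::F32, false)); // 1
        std::printf("%d\n", types_supported(Type::F16, Type::F16, false)); // 0
        std::printf("%d\n", types_supported(Type::F16, Type::F16, true));  // 1
    }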
@@ -1388,3 +1540,20 @@ void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
 
+void ggml_sycl_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    ggml_sycl_op_sgn(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    ggml_sycl_op_abs(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
+    ggml_sycl_op_elu(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
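With the public wrappers in place (and the dispatch/support changes in ggml-sycl.cpp below), the new ops are reachable through the normal ggml graph API. A rough end-to-end sketch, assuming a SYCL device 0 and omitting error handling; treat it as illustrative rather than a tested program:

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"
    #include "ggml-sycl.h"
    #include <cstdio>

    int main() {
        ggml_backend_t backend = ggml_backend_sycl_init(0); // SYCL device 0

        struct ggml_init_params params = {
            /*.mem_size   =*/ ggml_tensor_overhead() * 8 + ggml_graph_overhead(),
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true, // tensor data lives in the backend buffer
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        struct ggml_tensor * y = ggml_elu(ctx, x); // exercises ggml_sycl_elu

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, y);

        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);

        const float in[4] = { -2.0f, -0.5f, 0.0f, 3.0f };
        ggml_backend_tensor_set(x, in, 0, sizeof(in));
        ggml_backend_graph_compute(backend, gf);

        float out[4];
        ggml_backend_tensor_get(y, out, 0, sizeof(out));
        for (int i = 0; i < 4; ++i) std::printf("% .6f\n", out[i]);

        ggml_backend_buffer_free(buf);
        ggml_free(ctx);
        ggml_backend_free(backend);
    }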
ggml/src/ggml-sycl/element_wise.hpp
@@ -66,5 +66,10 @@ void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 
 void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 
+void ggml_sycl_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 #endif // GGML_SYCL_ELEMENTWISE_HPP
 
ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -38,6 +38,7 @@
 
 #include "ggml-sycl/backend.hpp"
 #include "ggml-sycl/common.hpp"
+#include "ggml-sycl/element_wise.hpp"
 #include "ggml-sycl/presets.hpp"
 #include "ggml-sycl/gemm.hpp"
 #include "ggml-sycl/sycl_hw.hpp"
@@ -3355,6 +3356,15 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
                 case GGML_UNARY_OP_EXP:
                     ggml_sycl_exp(ctx, dst);
                     break;
+                case GGML_UNARY_OP_SGN:
+                    ggml_sycl_sgn(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_ABS:
+                    ggml_sycl_abs(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_ELU:
+                    ggml_sycl_elu(ctx, dst);
+                    break;
                 default:
                     return false;
             }
@@ -3837,6 +3847,9 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                 case GGML_UNARY_OP_GELU_QUICK:
                 case GGML_UNARY_OP_TANH:
                 case GGML_UNARY_OP_EXP:
+                case GGML_UNARY_OP_SGN:
+                case GGML_UNARY_OP_ABS:
+                case GGML_UNARY_OP_ELU:
 #if defined (GGML_SYCL_F16)
                     return ggml_is_contiguous(op->src[0]) && (op->type == op->src[0]->type);
 #else
Author: Akarshan Biswas