mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-11-02 09:12:03 +00:00
ggml : implement REGLU/GEGLU/SWIGLU ops (#14158)
* implement unary REGLU/GEGLU/SWIGLU cpu ops
* relax constraints
* duplicate shape of source
* fix ggml_vec_geglu_f16
* special case gated ops
* implement unary REGLU/GEGLU/SWIGLU cuda ops
* tighten constraints again
* refactor into GGML_GLU_OP
* metal : add glu kernels ggml-ci
* add CUDA_GLU_BLOCK_SIZE [no ci]
* more constraints and use 64bit ints ggml-ci
* 64bit multiplication [no ci]
* implement swapped variants (cpu/cuda)
* update comment [no ci] ggml-ci
* Vulkan: Add GLU ops and shaders
* SYCL: Implement fused kernel GEGLU, SWIGLU and REGLU for single up+gate
* ggml : implement GLU for split up/gate (#14181)
* implement GLU for split up/gate
* add tests for ggml_glu_split
* Vulkan: Implement glu_split logic and shader support
* add split to logging [no ci]
* SYCL: refactor element_size ops and add split up and gate support to gated kernels
* SYCL: switch GEGLU to use tanh approximation
---------
Co-authored-by: 0cc4m <picard12@live.de>
Co-authored-by: Akarshan <akarshan@menlo.ai>
* GGML: increase OP count in assertion
* Refactor: Optimize SYCL element-wise operations with unary function inlining
This commit refactors the SYCL element-wise operations to improve performance by:
- Inlining unary operations (sgn, abs, elu, gelu, silu, etc.) to reduce kernel launch overhead.
- Introducing helper functions `op_xxx` for each unary operation to encapsulate the logic.
- Replacing direct kernel calls with calls to these inlined functions.
- Using `__dpct_inline__` to encourage compiler inlining.
- Minor code cleanup and consistency improvements.
The changes aim to reduce kernel launch overhead and improve the overall efficiency of element-wise operations on SYCL devices.
* vulkan: Increase workgroup size for GLU, for performance (#14345)
* vulkan: Increase workgroup size for GLU, for performance
* vulkan: change GLU shaders to do one element per invocation rather than one row per workgroup
* merge fix
* metal : add support for split and swap ggml-ci
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: 0cc4m <picard12@live.de>
Co-authored-by: Akarshan <akarshan@menlo.ai>
Co-authored-by: Jeff Bolz <jbolz@nvidia.com>
This commit is contained in:
138
ggml/src/ggml.c
138
ggml/src/ggml.c
@@ -982,9 +982,11 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||
"CROSS_ENTROPY_LOSS",
|
||||
"CROSS_ENTROPY_LOSS_BACK",
|
||||
"OPT_STEP_ADAMW",
|
||||
|
||||
"GLU",
|
||||
};
|
||||
|
||||
static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");
|
||||
static_assert(GGML_OP_COUNT == 85, "GGML_OP_COUNT != 85");
|
||||
|
||||
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"none",
|
||||
@@ -1079,9 +1081,11 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"cross_entropy_loss(x,y)",
|
||||
"cross_entropy_loss_back(x,y)",
|
||||
"adamw(x)",
|
||||
|
||||
"glu(x)",
|
||||
};
|
||||
|
||||
static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");
|
||||
static_assert(GGML_OP_COUNT == 85, "GGML_OP_COUNT != 85");
|
||||
|
||||
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
||||
|
||||
@@ -1107,6 +1111,15 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
|
||||
static_assert(GGML_UNARY_OP_COUNT == 15, "GGML_UNARY_OP_COUNT != 15");
|
||||
|
||||
|
||||
static const char * GGML_GLU_OP_NAME[GGML_GLU_OP_COUNT] = {
|
||||
"REGLU",
|
||||
"GEGLU",
|
||||
"SWIGLU",
|
||||
};
|
||||
|
||||
static_assert(GGML_GLU_OP_COUNT == 3, "GGML_GLU_OP_COUNT != 3");
|
||||
|
||||
|
||||
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
|
||||
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
|
||||
|
||||
@@ -1209,11 +1222,19 @@ const char * ggml_unary_op_name(enum ggml_unary_op op) {
|
||||
return GGML_UNARY_OP_NAME[op];
|
||||
}
|
||||
|
||||
const char * ggml_glu_op_name(enum ggml_glu_op op) {
|
||||
return GGML_GLU_OP_NAME[op];
|
||||
}
|
||||
|
||||
const char * ggml_op_desc(const struct ggml_tensor * t) {
|
||||
if (t->op == GGML_OP_UNARY) {
|
||||
enum ggml_unary_op uop = ggml_get_unary_op(t);
|
||||
return ggml_unary_op_name(uop);
|
||||
}
|
||||
if (t->op == GGML_OP_GLU) {
|
||||
enum ggml_glu_op gop = ggml_get_glu_op(t);
|
||||
return ggml_glu_op_name(gop);
|
||||
}
|
||||
return ggml_op_name(t->op);
|
||||
}
|
||||
|
||||
@@ -1730,6 +1751,11 @@ enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
|
||||
return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
|
||||
}
|
||||
|
||||
enum ggml_glu_op ggml_get_glu_op(const struct ggml_tensor * tensor) {
|
||||
GGML_ASSERT(tensor->op == GGML_OP_GLU);
|
||||
return (enum ggml_glu_op) ggml_get_op_params_i32(tensor, 0);
|
||||
}
|
||||
|
||||
const char * ggml_get_name(const struct ggml_tensor * tensor) {
|
||||
return tensor->name;
|
||||
}
|
||||
@@ -2609,6 +2635,114 @@ struct ggml_tensor * ggml_exp_inplace(
|
||||
return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_EXP);
|
||||
}
|
||||
|
||||
// ggml_glu
|
||||
|
||||
static struct ggml_tensor * ggml_glu_impl(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
enum ggml_glu_op op,
|
||||
bool swapped) {
|
||||
GGML_ASSERT(ggml_is_contiguous_1(a));
|
||||
|
||||
if (b) {
|
||||
GGML_ASSERT(ggml_is_contiguous_1(b));
|
||||
GGML_ASSERT(ggml_are_same_shape(a, b));
|
||||
GGML_ASSERT(a->type == b->type);
|
||||
}
|
||||
|
||||
int64_t ne[GGML_MAX_DIMS] = { a->ne[0] / 2 }; for (int i = 1; i < GGML_MAX_DIMS; i++) ne[i] = a->ne[i];
|
||||
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b ? a->ne : ne, NULL, 0);
|
||||
|
||||
ggml_set_op_params_i32(result, 0, (int32_t) op);
|
||||
ggml_set_op_params_i32(result, 1, (int32_t) swapped);
|
||||
|
||||
result->op = GGML_OP_GLU;
|
||||
result->src[0] = a;
|
||||
result->src[1] = b;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_glu(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
enum ggml_glu_op op,
|
||||
bool swapped) {
|
||||
return ggml_glu_impl(ctx, a, NULL, op, swapped);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_glu_split(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
enum ggml_glu_op op) {
|
||||
return ggml_glu_impl(ctx, a, b, op, false);
|
||||
}
|
||||
|
||||
// ggml_reglu
|
||||
|
||||
struct ggml_tensor * ggml_reglu(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a) {
|
||||
return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_REGLU, false);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_reglu_swapped(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a) {
|
||||
return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_REGLU, true);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_reglu_split(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b) {
|
||||
return ggml_glu_impl(ctx, a, b, GGML_GLU_OP_REGLU, false);
|
||||
}
|
||||
|
||||
// ggml_geglu
|
||||
|
||||
struct ggml_tensor * ggml_geglu(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a) {
|
||||
return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_GEGLU, false);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_geglu_swapped(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a) {
|
||||
return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_GEGLU, true);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_geglu_split(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b) {
|
||||
return ggml_glu_impl(ctx, a, b, GGML_GLU_OP_GEGLU, false);
|
||||
}
|
||||
|
||||
// ggml_swiglu
|
||||
|
||||
struct ggml_tensor * ggml_swiglu(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a) {
|
||||
return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_SWIGLU, false);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_swiglu_swapped(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a) {
|
||||
return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_SWIGLU, true);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_swiglu_split(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b) {
|
||||
return ggml_glu_impl(ctx, a, b, GGML_GLU_OP_SWIGLU, false);
|
||||
}
|
||||
|
||||
// ggml_norm
|
||||
|
||||
static struct ggml_tensor * ggml_norm_impl(
|
||||
|
||||
Reference in New Issue
Block a user