model : Apertus model implementation (#15852)

* First attempt

* No permute during convert (fixes qk tensors), proper norm application.

* RoPE = NeoX

* Coherence!

* Migrate xielu params from tensors to hyperparameters

* Simple CUDA kernel

* Revert stupid LLM refactorings

* Chat template support

* Fix configchecker / flake8 errors

* Reorder unary.cu

* I do conclude that LLMs are, in fact, stupid.

* Fix after merge

* Final newline

* Make xIELU an UNARY_OP

* Final newline

* Correctly account for parameter shift

* Argh.

* Update ggml/src/ggml-cpu/unary-ops.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Refactor: remove unused methods, inline and factorize softplus, add const modifiers

* Revert CUDA changes, implement xIELU as a separate OP

* Pesky newline

* Add float2half / half2float for F16 inputs/outputs

* CUDA variants, attempt 2

* Actually, attempt 3

* Update ggml/src/ggml-cuda/unary.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Missing convert header

* Proper formula and reference for xIELU in the comments.

* Modify unary-ops.cpp to add the functor-based logic alongside the template system, to retain optimizations

* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Add tensor mappings for Apertus to global list instead

* Fix lazy on scalars

* Update ggml/src/ggml-cuda/unary.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Add comment about the constraints on positive/negative alpha

* Change `softplus` to `ggml_softplus`

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
commit 34fcc5a4ac (parent 91a2a56556)
Author: Piotr Wilkin (ilintar)
Date:   2025-10-02 19:43:22 +02:00
Committed by: GitHub

27 changed files with 1082 additions and 7 deletions

@@ -2187,6 +2187,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_UNARY_OP_GELU_ERF:
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_XIELU:
{
n_tasks = n_threads;
} break;

@@ -8637,7 +8637,7 @@ static void ggml_compute_forward_ssm_scan_f32(
// n_head
for (int h = ih0; h < ih1; ++h) {
// ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
-                const float dt_soft_plus = dt[h] <= 20.0f ? log1pf(expf(dt[h])) : dt[h];
+                const float dt_soft_plus = ggml_softplus(dt[h]);
const float dA = expf(dt_soft_plus * A[h]);
const int g = h / (nh / ng); // repeat_interleave
@@ -8734,7 +8734,7 @@ static void ggml_compute_forward_ssm_scan_f32(
// n_head
for (int h = ih0; h < ih1; ++h) {
// ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
-                const float dt_soft_plus = dt[h] <= 20.0f ? log1pf(expf(dt[h])) : dt[h];
+                const float dt_soft_plus = ggml_softplus(dt[h]);
const int g = h / (nh / ng); // repeat_interleave
// dim
@@ -8997,6 +8997,10 @@ void ggml_compute_forward_unary(
{
ggml_compute_forward_exp(params, dst);
} break;
case GGML_UNARY_OP_XIELU:
{
ggml_compute_forward_xielu(params, dst);
} break;
default:
{
GGML_ABORT("fatal error");
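
The two ggml_compute_forward_ssm_scan_f32 call sites above drop the open-coded softplus in favour of a shared ggml_softplus helper (the rename noted in the last commit bullet). Judging from the expression being replaced, the helper presumably keeps the same numerically stable form; a sketch under that assumption:

#include <math.h>

// Presumed shape of ggml_softplus, inferred from the replaced expression; the
// actual helper in the ggml tree may differ. softplus(x) = log(1 + e^x), and for
// large x the result is ~x, so the threshold avoids overflow in expf().
static inline float softplus_ref(float x) {
    return (x <= 20.0f) ? log1pf(expf(x)) : x;
}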

@@ -52,6 +52,15 @@ static inline float op_sqrt(float x) {
return sqrtf(x);
}
static inline float op_xielu(float x, float alpha_n, float alpha_p, float beta, float eps) {
if (x > 0.0f) {
return alpha_p * x * x + beta * x;
} else {
const float min_x_eps = fminf(x, eps);
return (expm1f(min_x_eps) - x) * alpha_n + beta * x;
}
}
static inline float op_sin(float x) {
return sinf(x);
}
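
Written out, the activation that op_xielu computes above is:

    xIELU(x) = alpha_p * x^2 + beta * x                           for x > 0
    xIELU(x) = alpha_n * (exp(min(x, eps)) - 1 - x) + beta * x    for x <= 0

so alpha_p scales the quadratic positive branch, alpha_n scales the saturating negative branch, and min(x, eps) clamps the argument of the exponential.
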
@@ -121,6 +130,86 @@ static void unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
}
}
template <float (*op)(float, ggml_tensor *)>
static void unary_op_params(const ggml_compute_params * params, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
/* */ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { // all f32
apply_unary_op<op, float, float>(params, dst);
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { // all f16
apply_unary_op<op, ggml_fp16_t, ggml_fp16_t>(params, dst);
} else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
apply_unary_op<op, ggml_bf16_t, ggml_bf16_t>(params, dst);
} else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_F32) {
apply_unary_op<op, ggml_bf16_t, float>(params, dst);
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
apply_unary_op<op, ggml_fp16_t, float>(params, dst);
} else {
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__,
ggml_type_name(dst->type), ggml_type_name(src0->type));
GGML_ABORT("fatal error");
}
}
// Extend vec_unary_op to support functors
template <typename Op, typename src0_t, typename dst_t>
static inline void vec_unary_op_functor(int64_t n, dst_t * y, const src0_t * x, Op op) {
constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
constexpr auto f32_to_dst = type_conversion_table<dst_t >::from_f32;
for (int i = 0; i < n; i++) {
y[i] = f32_to_dst(op(src0_to_f32(x[i])));
}
}
// Extend apply_unary_op to support functors
template <typename Op, typename src0_t, typename dst_t>
static void apply_unary_op_functor(const ggml_compute_params * params, ggml_tensor * dst, Op op) {
const ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(ggml_is_contiguous_1(src0) && ggml_is_contiguous_1(dst) && ggml_are_same_shape(src0, dst));
GGML_TENSOR_UNARY_OP_LOCALS
GGML_ASSERT( nb0 == sizeof(dst_t));
GGML_ASSERT(nb00 == sizeof(src0_t));
const auto [ir0, ir1] = get_thread_range(params, src0);
for (int64_t ir = ir0; ir < ir1; ++ir) {
const int64_t i03 = ir/(ne02*ne01);
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
dst_t * dst_ptr = (dst_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
vec_unary_op_functor(ne0, dst_ptr, src0_ptr, op);
}
}
// Generic dispatcher for functors
template <typename Op>
static void unary_op_functor(const ggml_compute_params * params, ggml_tensor * dst, Op op) {
const ggml_tensor * src0 = dst->src[0];
/* */ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { // all f32
apply_unary_op_functor<Op, float, float>(params, dst, op);
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { // all f16
apply_unary_op_functor<Op, ggml_fp16_t, ggml_fp16_t>(params, dst, op);
} else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
apply_unary_op_functor<Op, ggml_bf16_t, ggml_bf16_t>(params, dst, op);
} else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_F32) {
apply_unary_op_functor<Op, ggml_bf16_t, float>(params, dst, op);
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
apply_unary_op_functor<Op, ggml_fp16_t, float>(params, dst, op);
} else {
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__,
ggml_type_name(dst->type), ggml_type_name(src0->type));
GGML_ABORT("fatal error");
}
}
void ggml_compute_forward_abs(const ggml_compute_params * params, ggml_tensor * dst) {
unary_op<op_abs>(params, dst);
}
@@ -184,3 +273,17 @@ void ggml_compute_forward_cos(const ggml_compute_params * params, ggml_tensor *
void ggml_compute_forward_log(const ggml_compute_params * params, ggml_tensor * dst) {
unary_op<op_log>(params, dst);
}
void ggml_compute_forward_xielu(const ggml_compute_params * params, ggml_tensor * dst) {
const float alpha_n = ggml_get_op_params_f32(dst, 1);
const float alpha_p = ggml_get_op_params_f32(dst, 2);
const float beta = ggml_get_op_params_f32(dst, 3);
const float eps = ggml_get_op_params_f32(dst, 4);
const auto xielu_op_params = [alpha_n, alpha_p, beta, eps](float f) {
return op_xielu(f, alpha_n, alpha_p, beta, eps);
};
unary_op_functor(params, dst, xielu_op_params);
}
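
ggml_compute_forward_xielu reads its four floats starting at op-param index 1; slot 0 presumably carries the unary-op id, which is the "parameter shift" mentioned in the commit bullets. A hedged sketch of what the graph-building counterpart might look like — the real builder added elsewhere in this commit (presumably a ggml_xielu() API) may differ:

// Hypothetical builder, shown only to illustrate the op-param layout consumed above.
// Requires ggml.h and the internal ggml-impl.h helpers.
static struct ggml_tensor * build_xielu(struct ggml_context * ctx, struct ggml_tensor * a,
        float alpha_n, float alpha_p, float beta, float eps) {
    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);            // same shape/type as input
    ggml_set_op_params_i32(result, 0, (int32_t) GGML_UNARY_OP_XIELU); // slot 0: unary-op id
    ggml_set_op_params_f32(result, 1, alpha_n);                       // slots 1..4: xIELU params
    ggml_set_op_params_f32(result, 2, alpha_p);
    ggml_set_op_params_f32(result, 3, beta);
    ggml_set_op_params_f32(result, 4, eps);
    result->op     = GGML_OP_UNARY;
    result->src[0] = a;
    return result;
}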

@@ -22,6 +22,7 @@ void ggml_compute_forward_sqrt(const struct ggml_compute_params * params, struct
void ggml_compute_forward_sin(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_cos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_log(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_xielu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
#ifdef __cplusplus
}