model : Apertus model implementation (#15852)

* First attempt

* No permute during convert (fixes qk tensors), proper norm application.

* RoPE = NeoX

* Coherence!

* Migrate xielu params from tensors to hyperparameters

* Simple CUDA kernel

* Revert stupid LLM refactorings

* Chat template support

* configchecker / flake8 errors

* Reorder unary.cu

* I do conclude that LLMs are, in fact, stupid.

* Fix after merge

* Final newline

* Make xIELU an UNARY_OP

* Final newline

* Correctly account for parameter shift

* Argh.

* Update ggml/src/ggml-cpu/unary-ops.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Refactor: remove unused methods, inline and factorize softplus, add const modifiers

* Revert CUDA changes, implement xIELU as a separate OP

* Pesky newline

* Add float2half / half2float for F16 inputs/outputs

* CUDA variants, attempt 2

* Actually, attempt 3

* Update ggml/src/ggml-cuda/unary.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Missing convert header

* Proper formula and reference for xIELU in the comments.

* Modify unary-ops.cpp to add the functor-based logic besides the template system to retain optimizations

* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Add tensor mappings for Apertus to global list instead

* Fix lazy on scalars

* Update ggml/src/ggml-cuda/unary.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Add comment about the constraints on positive/negative alpha

* Change `softplus` to `ggml_softplus`

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
Piotr Wilkin (ilintar)
2025-10-02 19:43:22 +02:00
committed by GitHub
parent 91a2a56556
commit 34fcc5a4ac
27 changed files with 1082 additions and 7 deletions

View File

@@ -2334,6 +2334,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_UNARY_OP_ELU:
ggml_cuda_op_elu(ctx, dst);
break;
case GGML_UNARY_OP_XIELU:
ggml_cuda_op_xielu(ctx, dst);
break;
default:
return false;
}

View File

@@ -1,4 +1,5 @@
#include "unary.cuh"
#include "convert.cuh"
static __device__ __forceinline__ float op_abs(float x) {
return fabsf(x);
@@ -375,6 +376,59 @@ void ggml_cuda_op_swiglu_oai(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
swiglu_oai_cuda(src0_p, src1_p, (float *)dst_d, ggml_nelements(dst), nc, src0_o / sizeof(float), src1_o / sizeof(float), alpha, limit, stream);
}
/* CUDA kernel + launcher for xIELU */
template <typename T>
static __global__ void xielu_kernel(const T * x, T * dst, const int k, float alpha_n, float alpha_p, float beta, float eps) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
const float xi = ggml_cuda_cast<float>(x[i]);
const float gate_pos = (xi > 0.0f);
const float y_pos = alpha_p * xi * xi + beta * xi;
const float min_v_eps = fminf(xi, eps);
const float y_neg = (expm1f(min_v_eps) - xi) * alpha_n + beta * xi;
const float out = gate_pos * y_pos + (1.0f - gate_pos) * y_neg;
dst[i] = ggml_cuda_cast<T>(out);
}
template <typename T>
static void xielu_cuda(const T * x, T * dst, const int k, float alpha_n, float alpha_p, float beta, float eps, cudaStream_t stream) {
const int num_blocks = (k + CUDA_XIELU_BLOCK_SIZE) / CUDA_XIELU_BLOCK_SIZE;
xielu_kernel<<<num_blocks, CUDA_XIELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, alpha_n, alpha_p, beta, eps);
}
void ggml_cuda_op_xielu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const void * src0_d = src0->data;
void * dst_d = dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
GGML_ASSERT(src0->type == dst->type);
const float alpha_n = ggml_get_op_params_f32(dst, 1);
const float alpha_p = ggml_get_op_params_f32(dst, 2);
const float beta = ggml_get_op_params_f32(dst, 3);
const float eps = ggml_get_op_params_f32(dst, 4);
if (src0->type == GGML_TYPE_F16) {
xielu_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), alpha_n, alpha_p, beta, eps, stream);
} else {
xielu_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), alpha_n, alpha_p, beta, eps, stream);
}
}
/* silu_back */
static __device__ __forceinline__ float op_silu_back(float grad, float x) {

View File

@@ -16,6 +16,7 @@
#define CUDA_SIN_BLOCK_SIZE 256
#define CUDA_COS_BLOCK_SIZE 256
#define CUDA_GLU_BLOCK_SIZE 256
#define CUDA_XIELU_BLOCK_SIZE 256
void ggml_cuda_op_abs(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -72,3 +73,5 @@ void ggml_cuda_op_swiglu_oai(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
void ggml_cuda_op_geglu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_geglu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_xielu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);