Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-11-12 10:47:01 +00:00)
model : Apertus model implementation (#15852)
* First attempt
* No permute during convert (fixes qk tensors), proper norm application.
* RoPE = NeoX
* Coherence!
* Migrate xielu params from tensors to hyperparameters
* Simple CUDA kernel
* Revert stupid LLM refactorings
* Chat template support
* configchecker / flake8 errors
* Reorder unary.cu
* I do conclude that LLMs are, in fact, stupid.
* Fix after merge
* Final newline
* Make xIELU an UNARY_OP
* Final newline
* Correctly account for parameter shift
* Argh.
* Update ggml/src/ggml-cpu/unary-ops.cpp

  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Refactor: remove unused methods, inline and factorize softplus, add const modifiers
* Revert CUDA changes, implement xIELU as a separate OP
* Pesky newline
* Add float2half / half2float for F16 inputs/outputs
* CUDA variants, attempt 2
* Actually, attempt 3
* Update ggml/src/ggml-cuda/unary.cu

  Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
* Missing convert header
* Proper formula and reference for xIELU in the comments.
* Modify unary-ops.cpp to add the functor-based logic besides the template system to retain optimizations
* Apply suggestions from code review

  Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
* Add tensor mappings for Apertus to global list instead
* Fix lazy on scalars
* Update ggml/src/ggml-cuda/unary.cu

  Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
* Add comment about the constraints on positive/negative alpha
* Change `softplus` to `ggml_softplus`

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
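For reference, the activation the new kernel computes (a direct transcription of the CUDA code in this diff; alpha_n, alpha_p, beta, and eps are the kernel arguments, not the Apertus paper's notation) is the piecewise xIELU function:

    f(x) = \begin{cases}
        \alpha_p \, x^2 + \beta x                                        & x > 0 \\
        \alpha_n \left( e^{\min(x,\, \varepsilon)} - 1 - x \right) + \beta x & x \le 0
    \end{cases}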
commit 34fcc5a4ac
parent 91a2a56556
committed by GitHub
@@ -2334,6 +2334,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_UNARY_OP_ELU:
            ggml_cuda_op_elu(ctx, dst);
            break;
        case GGML_UNARY_OP_XIELU:
            ggml_cuda_op_xielu(ctx, dst);
            break;
        default:
            return false;
    }
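The dispatch above routes GGML_UNARY_OP_XIELU to ggml_cuda_op_xielu. As a sanity check for that path, a minimal scalar reference of the same formula (a hypothetical helper, not part of this commit) mirrors the kernel arithmetic that follows later in the diff:

#include <math.h>

// Hypothetical scalar reference for xIELU, mirroring xielu_kernel:
//   x > 0 : alpha_p * x^2 + beta * x
//   x <= 0: alpha_n * (expm1(min(x, eps)) - x) + beta * x
static float xielu_ref(float x, float alpha_n, float alpha_p, float beta, float eps) {
    if (x > 0.0f) {
        return alpha_p * x * x + beta * x;
    }
    return (expm1f(fminf(x, eps)) - x) * alpha_n + beta * x;
}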
@@ -1,4 +1,5 @@
#include "unary.cuh"
#include "convert.cuh"

static __device__ __forceinline__ float op_abs(float x) {
    return fabsf(x);
@@ -375,6 +376,59 @@ void ggml_cuda_op_swiglu_oai(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
    swiglu_oai_cuda(src0_p, src1_p, (float *)dst_d, ggml_nelements(dst), nc, src0_o / sizeof(float), src1_o / sizeof(float), alpha, limit, stream);
}

/* CUDA kernel + launcher for xIELU */

template <typename T>
static __global__ void xielu_kernel(const T * x, T * dst, const int k, float alpha_n, float alpha_p, float beta, float eps) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }

    const float xi = ggml_cuda_cast<float>(x[i]);

    const float gate_pos = (xi > 0.0f);
    const float y_pos = alpha_p * xi * xi + beta * xi;
    const float min_v_eps = fminf(xi, eps);
    const float y_neg = (expm1f(min_v_eps) - xi) * alpha_n + beta * xi;
    const float out = gate_pos * y_pos + (1.0f - gate_pos) * y_neg;

    dst[i] = ggml_cuda_cast<T>(out);
}

template <typename T>
static void xielu_cuda(const T * x, T * dst, const int k, float alpha_n, float alpha_p, float beta, float eps, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_XIELU_BLOCK_SIZE - 1) / CUDA_XIELU_BLOCK_SIZE;
    xielu_kernel<<<num_blocks, CUDA_XIELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, alpha_n, alpha_p, beta, eps);
}

void ggml_cuda_op_xielu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const void * src0_d = src0->data;
    void * dst_d = dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(ggml_is_contiguous(src0));

    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
    GGML_ASSERT(src0->type == dst->type);

    const float alpha_n = ggml_get_op_params_f32(dst, 1);
    const float alpha_p = ggml_get_op_params_f32(dst, 2);
    const float beta    = ggml_get_op_params_f32(dst, 3);
    const float eps     = ggml_get_op_params_f32(dst, 4);

    if (src0->type == GGML_TYPE_F16) {
        xielu_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), alpha_n, alpha_p, beta, eps, stream);
    } else {
        xielu_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), alpha_n, alpha_p, beta, eps, stream);
    }
}

/* silu_back */

static __device__ __forceinline__ float op_silu_back(float grad, float x) {
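To exercise the kernel outside of ggml, a minimal standalone harness can launch a float-only copy of it and compare against the scalar reference above. A sketch, assuming nvcc and eliding error handling; the hyperparameter values are arbitrary, not taken from the Apertus checkpoints:

// standalone_xielu_test.cu -- hypothetical sketch; compile with: nvcc standalone_xielu_test.cu
#include <cstdio>
#include <cmath>
#include <cuda_runtime.h>

// Float-only copy of xielu_kernel from the diff (no ggml_cuda_cast needed for f32).
static __global__ void xielu_kernel_f32(const float * x, float * dst, const int k,
                                        float alpha_n, float alpha_p, float beta, float eps) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    const float xi       = x[i];
    const float gate_pos = (xi > 0.0f);
    const float y_pos    = alpha_p * xi * xi + beta * xi;
    const float y_neg    = (expm1f(fminf(xi, eps)) - xi) * alpha_n + beta * xi;
    dst[i] = gate_pos * y_pos + (1.0f - gate_pos) * y_neg;
}

// Same scalar reference as sketched earlier.
static float xielu_ref(float x, float alpha_n, float alpha_p, float beta, float eps) {
    return x > 0.0f ? alpha_p * x * x + beta * x
                    : (expm1f(fminf(x, eps)) - x) * alpha_n + beta * x;
}

int main() {
    // Arbitrary example hyperparameters -- illustration only.
    const float alpha_n = 0.8f, alpha_p = 0.8f, beta = 0.5f, eps = 1e-6f;
    const int   k       = 5;
    float h_x[k] = { -2.0f, -0.5f, 0.0f, 0.5f, 2.0f }, h_y[k];

    float *d_x, *d_y;
    cudaMalloc(&d_x, k * sizeof(float));
    cudaMalloc(&d_y, k * sizeof(float));
    cudaMemcpy(d_x, h_x, k * sizeof(float), cudaMemcpyHostToDevice);

    const int block = 256; // matches CUDA_XIELU_BLOCK_SIZE in the diff
    xielu_kernel_f32<<<(k + block - 1) / block, block>>>(d_x, d_y, k, alpha_n, alpha_p, beta, eps);
    cudaMemcpy(h_y, d_y, k * sizeof(float), cudaMemcpyDeviceToHost);

    for (int i = 0; i < k; ++i) {
        printf("x=%6.2f  gpu=%9.6f  ref=%9.6f\n", h_x[i], h_y[i],
               xielu_ref(h_x[i], alpha_n, alpha_p, beta, eps));
    }
    cudaFree(d_x);
    cudaFree(d_y);
    return 0;
}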
@@ -16,6 +16,7 @@
#define CUDA_SIN_BLOCK_SIZE 256
#define CUDA_COS_BLOCK_SIZE 256
#define CUDA_GLU_BLOCK_SIZE 256
#define CUDA_XIELU_BLOCK_SIZE 256

void ggml_cuda_op_abs(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -72,3 +73,5 @@ void ggml_cuda_op_swiglu_oai(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
void ggml_cuda_op_geglu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void ggml_cuda_op_geglu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void ggml_cuda_op_xielu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
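At the graph level, the launcher above reads the four hyperparameters from op_params indices 1-4 (index 0 carries the unary-op id, the "parameter shift" mentioned in the commit message), so they are set when the op is built. A sketch of a CPU-side smoke test, assuming the builder this PR adds is ggml_xielu(ctx, a, alpha_n, alpha_p, beta, eps) and the classic single-context compute path; that file is not shown in this excerpt, so treat the exact signature as an assumption, and the values as arbitrary:

#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params ip = { /*mem_size =*/ 16*1024*1024, /*mem_buffer =*/ NULL, /*no_alloc =*/ false };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    float * xd = (float *) x->data;
    xd[0] = -2.0f; xd[1] = -0.5f; xd[2] = 0.5f; xd[3] = 2.0f;

    // Assumed builder signature -- alpha_n, alpha_p, beta, eps as read by the CUDA launcher.
    struct ggml_tensor * y = ggml_xielu(ctx, x, 0.8f, 0.8f, 0.5f, 1e-6f);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, y);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

    for (int i = 0; i < 4; ++i) {
        printf("xielu(%g) = %g\n", xd[i], ((float *) y->data)[i]);
    }
    ggml_free(ctx);
    return 0;
}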