mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-11-07 09:57:00 +00:00
Merge branch 'master' into compilade/mamba2
This commit is contained in:
@@ -198,7 +198,7 @@
|
||||
|
||||
#ifndef __GNUC__
|
||||
# define GGML_ATTRIBUTE_FORMAT(...)
|
||||
#elif defined(__MINGW32__)
|
||||
#elif defined(__MINGW32__) && !defined(__clang__)
|
||||
# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
||||
#else
|
||||
# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
||||
@@ -237,13 +237,9 @@
|
||||
#define GGML_EXIT_SUCCESS 0
|
||||
#define GGML_EXIT_ABORTED 1
|
||||
|
||||
#define GGML_ROPE_TYPE_NEOX 2
|
||||
|
||||
#define GGUF_MAGIC "GGUF"
|
||||
|
||||
#define GGUF_VERSION 3
|
||||
|
||||
#define GGUF_DEFAULT_ALIGNMENT 32
|
||||
#define GGML_ROPE_TYPE_NEOX 2
|
||||
#define GGML_ROPE_TYPE_MROPE 8
|
||||
#define GGML_ROPE_TYPE_VISION 24
|
||||
|
||||
#define GGML_UNUSED(x) (void)(x)
|
||||
|
||||
@@ -384,12 +380,15 @@ extern "C" {
|
||||
GGML_TYPE_F64 = 28,
|
||||
GGML_TYPE_IQ1_M = 29,
|
||||
GGML_TYPE_BF16 = 30,
|
||||
GGML_TYPE_Q4_0_4_4 = 31,
|
||||
GGML_TYPE_Q4_0_4_8 = 32,
|
||||
GGML_TYPE_Q4_0_8_8 = 33,
|
||||
// GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
|
||||
// GGML_TYPE_Q4_0_4_8 = 32,
|
||||
// GGML_TYPE_Q4_0_8_8 = 33,
|
||||
GGML_TYPE_TQ1_0 = 34,
|
||||
GGML_TYPE_TQ2_0 = 35,
|
||||
GGML_TYPE_COUNT,
|
||||
// GGML_TYPE_IQ4_NL_4_4 = 36,
|
||||
// GGML_TYPE_IQ4_NL_4_8 = 37,
|
||||
// GGML_TYPE_IQ4_NL_8_8 = 38,
|
||||
GGML_TYPE_COUNT = 39,
|
||||
};
|
||||
|
||||
// precision
|
||||
@@ -398,12 +397,6 @@ extern "C" {
|
||||
GGML_PREC_F32,
|
||||
};
|
||||
|
||||
enum ggml_backend_type {
|
||||
GGML_BACKEND_TYPE_CPU = 0,
|
||||
GGML_BACKEND_TYPE_GPU = 10,
|
||||
GGML_BACKEND_TYPE_GPU_SPLIT = 20,
|
||||
};
|
||||
|
||||
// model file types
|
||||
enum ggml_ftype {
|
||||
GGML_FTYPE_UNKNOWN = -1,
|
||||
@@ -430,9 +423,6 @@ extern "C" {
|
||||
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
|
||||
};
|
||||
|
||||
// available tensor operations:
|
||||
@@ -496,6 +486,7 @@ extern "C" {
|
||||
GGML_OP_POOL_2D_BACK,
|
||||
GGML_OP_UPSCALE, // nearest interpolate
|
||||
GGML_OP_PAD,
|
||||
GGML_OP_PAD_REFLECT_1D,
|
||||
GGML_OP_ARANGE,
|
||||
GGML_OP_TIMESTEP_EMBEDDING,
|
||||
GGML_OP_ARGSORT,
|
||||
@@ -510,6 +501,7 @@ extern "C" {
|
||||
GGML_OP_GET_REL_POS,
|
||||
GGML_OP_ADD_REL_POS,
|
||||
GGML_OP_RWKV_WKV6,
|
||||
GGML_OP_GATED_LINEAR_ATTN,
|
||||
|
||||
GGML_OP_UNARY,
|
||||
|
||||
@@ -584,8 +576,6 @@ extern "C" {
|
||||
struct ggml_tensor {
|
||||
enum ggml_type type;
|
||||
|
||||
GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
|
||||
|
||||
struct ggml_backend_buffer * buffer;
|
||||
|
||||
int64_t ne[GGML_MAX_DIMS]; // number of elements
|
||||
@@ -1394,16 +1384,20 @@ extern "C" {
|
||||
float scale,
|
||||
float max_bias);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_soft_max_back(
|
||||
GGML_API struct ggml_tensor * ggml_soft_max_ext_back(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
struct ggml_tensor * b,
|
||||
float scale,
|
||||
float max_bias);
|
||||
|
||||
// in-place, returns view(a)
|
||||
GGML_API struct ggml_tensor * ggml_soft_max_back_inplace(
|
||||
GGML_API struct ggml_tensor * ggml_soft_max_ext_back_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
struct ggml_tensor * b,
|
||||
float scale,
|
||||
float max_bias);
|
||||
|
||||
// rotary position embedding
|
||||
// if (mode & 1) - skip n_past elements (NOT SUPPORTED)
|
||||
@@ -1442,6 +1436,22 @@ extern "C" {
|
||||
float beta_fast,
|
||||
float beta_slow);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_rope_multi(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
struct ggml_tensor * c,
|
||||
int n_dims,
|
||||
int sections[4],
|
||||
int mode,
|
||||
int n_ctx_orig,
|
||||
float freq_base,
|
||||
float freq_scale,
|
||||
float ext_factor,
|
||||
float attn_factor,
|
||||
float beta_fast,
|
||||
float beta_slow);
|
||||
|
||||
// in-place, returns view(a)
|
||||
GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
|
||||
struct ggml_context * ctx,
|
||||
@@ -1494,7 +1504,7 @@ extern "C" {
|
||||
|
||||
// rotary position embedding backward, i.e compute dx from dy
|
||||
// a - dy
|
||||
GGML_API struct ggml_tensor * ggml_rope_back(
|
||||
GGML_API struct ggml_tensor * ggml_rope_ext_back(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a, // gradients of ggml_rope result
|
||||
struct ggml_tensor * b, // positions
|
||||
@@ -1509,6 +1519,23 @@ extern "C" {
|
||||
float beta_fast,
|
||||
float beta_slow);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_rope_multi_back(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
struct ggml_tensor * c,
|
||||
int n_dims,
|
||||
int sections[4],
|
||||
int mode,
|
||||
int n_ctx_orig,
|
||||
float freq_base,
|
||||
float freq_scale,
|
||||
float ext_factor,
|
||||
float attn_factor,
|
||||
float beta_fast,
|
||||
float beta_slow);
|
||||
|
||||
|
||||
// clamp
|
||||
// in-place, returns view(a)
|
||||
GGML_API struct ggml_tensor * ggml_clamp(
|
||||
@@ -1545,17 +1572,6 @@ extern "C" {
|
||||
int d1, // dilation dimension 1
|
||||
bool is_2D);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a, // convolution kernel
|
||||
struct ggml_tensor * b, // data
|
||||
int s0, // stride dimension 0
|
||||
int s1, // stride dimension 1
|
||||
int p0, // padding dimension 0
|
||||
int p1, // padding dimension 1
|
||||
int d0, // dilation dimension 0
|
||||
int d1); // dilation dimension 1
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_conv_1d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a, // convolution kernel
|
||||
@@ -1573,6 +1589,23 @@ extern "C" {
|
||||
int s, // stride
|
||||
int d); // dilation
|
||||
|
||||
// depthwise
|
||||
// TODO: this is very likely wrong for some cases! - needs more testing
|
||||
GGML_API struct ggml_tensor * ggml_conv_1d_dw(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a, // convolution kernel
|
||||
struct ggml_tensor * b, // data
|
||||
int s0, // stride
|
||||
int p0, // padding
|
||||
int d0); // dilation
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_conv_1d_dw_ph(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a, // convolution kernel
|
||||
struct ggml_tensor * b, // data
|
||||
int s0, // stride
|
||||
int d0); // dilation
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a, // convolution kernel
|
||||
@@ -1592,7 +1625,6 @@ extern "C" {
|
||||
int d0, // dilation dimension 0
|
||||
int d1); // dilation dimension 1
|
||||
|
||||
|
||||
// kernel size is a->ne[0] x a->ne[1]
|
||||
// stride is equal to kernel size
|
||||
// padding is zero
|
||||
@@ -1619,6 +1651,18 @@ extern "C" {
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// depthwise
|
||||
GGML_API struct ggml_tensor * ggml_conv_2d_dw(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a, // convolution kernel
|
||||
struct ggml_tensor * b, // data
|
||||
int s0, // stride dimension 0
|
||||
int s1, // stride dimension 1
|
||||
int p0, // padding dimension 0
|
||||
int p1, // padding dimension 1
|
||||
int d0, // dilation dimension 0
|
||||
int d1); // dilation dimension 1
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
@@ -1692,6 +1736,13 @@ extern "C" {
|
||||
int p2,
|
||||
int p3);
|
||||
|
||||
// pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
|
||||
GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int p0,
|
||||
int p1);
|
||||
|
||||
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
|
||||
// timesteps: [N,]
|
||||
// return: [N, dim]
|
||||
@@ -1724,7 +1775,7 @@ extern "C" {
|
||||
struct ggml_tensor * a,
|
||||
int k);
|
||||
|
||||
#define GGML_KQ_MASK_PAD 32
|
||||
#define GGML_KQ_MASK_PAD 64
|
||||
|
||||
// q: [n_embd, n_batch, n_head, 1]
|
||||
// k: [n_embd, n_kv, n_head_kv, 1]
|
||||
@@ -1831,6 +1882,15 @@ extern "C" {
|
||||
struct ggml_tensor * td,
|
||||
struct ggml_tensor * state);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_gated_linear_attn(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * k,
|
||||
struct ggml_tensor * v,
|
||||
struct ggml_tensor * q,
|
||||
struct ggml_tensor * g,
|
||||
struct ggml_tensor * state,
|
||||
float scale);
|
||||
|
||||
// custom operators
|
||||
|
||||
typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
|
||||
@@ -2069,137 +2129,19 @@ extern "C" {
|
||||
int64_t n_per_row,
|
||||
const float * imatrix);
|
||||
|
||||
//
|
||||
// gguf
|
||||
//
|
||||
|
||||
enum gguf_type {
|
||||
GGUF_TYPE_UINT8 = 0,
|
||||
GGUF_TYPE_INT8 = 1,
|
||||
GGUF_TYPE_UINT16 = 2,
|
||||
GGUF_TYPE_INT16 = 3,
|
||||
GGUF_TYPE_UINT32 = 4,
|
||||
GGUF_TYPE_INT32 = 5,
|
||||
GGUF_TYPE_FLOAT32 = 6,
|
||||
GGUF_TYPE_BOOL = 7,
|
||||
GGUF_TYPE_STRING = 8,
|
||||
GGUF_TYPE_ARRAY = 9,
|
||||
GGUF_TYPE_UINT64 = 10,
|
||||
GGUF_TYPE_INT64 = 11,
|
||||
GGUF_TYPE_FLOAT64 = 12,
|
||||
GGUF_TYPE_COUNT, // marks the end of the enum
|
||||
};
|
||||
|
||||
struct gguf_context;
|
||||
|
||||
struct gguf_init_params {
|
||||
bool no_alloc;
|
||||
|
||||
// if not NULL, create a ggml_context and allocate the tensor data in it
|
||||
struct ggml_context ** ctx;
|
||||
};
|
||||
|
||||
GGML_API struct gguf_context * gguf_init_empty(void);
|
||||
GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
|
||||
//GGML_API struct gguf_context * gguf_init_from_buffer(..);
|
||||
|
||||
GGML_API void gguf_free(struct gguf_context * ctx);
|
||||
|
||||
GGML_API const char * gguf_type_name(enum gguf_type type);
|
||||
|
||||
GGML_API int gguf_get_version (const struct gguf_context * ctx);
|
||||
GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx);
|
||||
GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx);
|
||||
GGML_API void * gguf_get_data (const struct gguf_context * ctx);
|
||||
|
||||
GGML_API int gguf_get_n_kv(const struct gguf_context * ctx);
|
||||
GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key);
|
||||
GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id);
|
||||
|
||||
GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id);
|
||||
|
||||
// will abort if the wrong type is used for the key
|
||||
GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
|
||||
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id);
|
||||
GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
|
||||
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
|
||||
|
||||
GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
|
||||
GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
|
||||
GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
|
||||
GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
|
||||
GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i);
|
||||
|
||||
// removes key if it exists
|
||||
GGML_API void gguf_remove_key(struct gguf_context * ctx, const char * key);
|
||||
|
||||
// overrides existing values or adds a new one
|
||||
GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
|
||||
GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);
|
||||
GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
|
||||
GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val);
|
||||
GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
|
||||
GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
|
||||
GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
|
||||
GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
|
||||
GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val);
|
||||
GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val);
|
||||
GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
|
||||
GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
|
||||
GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
|
||||
GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
|
||||
|
||||
// set or add KV pairs from another context
|
||||
GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
|
||||
|
||||
// manage tensor info
|
||||
GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
|
||||
GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
|
||||
GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
|
||||
|
||||
// writing gguf files can be done in 2 ways:
|
||||
//
|
||||
// - write the entire gguf_context to a binary file in a single pass:
|
||||
//
|
||||
// gguf_write_to_file(ctx, fname);
|
||||
//
|
||||
// - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
|
||||
//
|
||||
// FILE * f = fopen(fname, "wb");
|
||||
// fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
|
||||
// fwrite(f, ...);
|
||||
// void * data = gguf_meta_get_meta_data(ctx);
|
||||
// fseek(f, 0, SEEK_SET);
|
||||
// fwrite(f, data, gguf_get_meta_size(ctx));
|
||||
// free(data);
|
||||
// fclose(f);
|
||||
//
|
||||
|
||||
// write the entire context to a binary file
|
||||
GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
|
||||
|
||||
// get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
|
||||
GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
|
||||
GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
|
||||
|
||||
#ifdef __cplusplus
|
||||
// restrict not standard in C++
|
||||
#define GGML_RESTRICT
|
||||
#ifdef __cplusplus
|
||||
// restrict not standard in C++
|
||||
# if defined(__GNUC__)
|
||||
# define GGML_RESTRICT __restrict__
|
||||
# elif defined(__clang__)
|
||||
# define GGML_RESTRICT __restrict
|
||||
# elif defined(_MSC_VER)
|
||||
# define GGML_RESTRICT __restrict
|
||||
# else
|
||||
# define GGML_RESTRICT
|
||||
# endif
|
||||
#else
|
||||
#define GGML_RESTRICT restrict
|
||||
# define GGML_RESTRICT restrict
|
||||
#endif
|
||||
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
Reference in New Issue
Block a user