mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-11-15 11:17:31 +00:00
ggml : add IQ2 to test-backend-ops + refactoring (#4990)
* ggml : add IQ2 to test-backend-ops + refactoring
  ggml-ci
* cuda : update supports_op for IQ2
  ggml-ci
* ci : enable LLAMA_CUBLAS=1 for CUDA nodes
  ggml-ci
* cuda : fix out-of-bounds-access in `mul_mat_vec_q`
  ggml-ci
* tests : avoid creating RNGs for each Q tensor
  ggml-ci
* tests : avoid creating RNGs for each tensor
  ggml-ci
This commit is contained in:
20
ggml.h
20
ggml.h
@@ -2065,6 +2065,18 @@ extern "C" {
|
||||
// quantization
|
||||
//
|
||||
|
||||
// - ggml_quantize_init can be called multiple times with the same type
|
||||
// it will only initialize the quantization tables for the first call or after ggml_quantize_free
|
||||
// automatically called by ggml_quantize_chunk for convenience
|
||||
//
|
||||
// - ggml_quantize_free will free any memory allocated by ggml_quantize_init
|
||||
// call this at the end of the program to avoid memory leaks
|
||||
//
|
||||
// note: these are thread-safe
|
||||
//
|
||||
GGML_API void ggml_quantize_init(enum ggml_type type);
|
||||
GGML_API void ggml_quantize_free(void);
|
||||
|
||||
// TODO: these will probably be removed in favor of the more general ggml_quantize_chunk
|
||||
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
@@ -2078,13 +2090,13 @@ extern "C" {
|
||||
GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
|
||||
// some quantization types cannot be used without an importance matrix
|
||||
GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
|
||||
|
||||
// calls ggml_quantize_init internally (i.e. can allocate memory)
|
||||
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
|
||||
int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
|
||||
// These are needed for IQ2_XS and IQ2_XXS quantizations
|
||||
GGML_API void ggml_init_iq2_quantization(enum ggml_type type);
|
||||
GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type);
|
||||
|
||||
//
|
||||
// gguf
|
||||
//
|
||||
|
||||
Reference in New Issue
Block a user