Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)

	llama : sync gguf-llama.cpp with latest llama.cpp (#2608)
* llama : sync gguf-llama.cpp with latest llama.cpp
* minor : indentation + assert
* llama : refactor gguf_buffer and gguf_ctx_buffer
* llama : minor
@@ -8,14 +8,19 @@
#include <sstream>
#include <fstream>
#include <vector>
/*

#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

template<typename T>
static std::string to_string(const T & val) {
    std::stringstream ss;
    ss << val;
    return ss.str();
}
*/

void gguf_ex_write_str(std::ofstream & fout, const std::string & val) {
    const int32_t n = val.size();
    fout.write((const char *) &n, sizeof(n));
@@ -377,28 +382,28 @@ bool gguf_ex_read_2(const std::string & fname) {

    struct gguf_file file(fname.c_str(), "rb");
    gguf_mmap data_mmap(&file, 0, false);

    const int n_tensors = gguf_get_n_tensors(ctx);

    for (int i = 0; i < n_tensors; ++i) {
        const char * name             = gguf_get_tensor_name(ctx, i);
        const size_t offset      = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
        const char * name   = gguf_get_tensor_name(ctx, i);
        const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);

        struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);

        cur->data = static_cast<char *>(data_mmap.addr) + offset;

        // print first 10 elements
    const float * data = (const float *) cur->data;
        const float * data = (const float *) cur->data;

        printf("%s data[:10] : ", name);

        for (int j = 0; j < 10; ++j) {
        for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
            printf("%f ", data[j]);
        }

        printf("\n\n");
    }

fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
    fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));

    ggml_free(ctx_data);
    gguf_free(ctx);
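gguf_ex_write_str above stores each string as a 32-bit length followed by the raw bytes, with no terminator. A minimal sketch of the matching read side could look like the following; the helper name gguf_ex_read_str and the use of std::ifstream are illustrative and not part of this commit:

#include <cstdint>
#include <fstream>
#include <string>

// hypothetical counterpart to gguf_ex_write_str: read the int32_t length,
// then that many raw bytes (no terminator is stored in the file)
static std::string gguf_ex_read_str(std::ifstream & fin) {
    int32_t n = 0;
    fin.read((char *) &n, sizeof(n));
    std::string val(n, '\0');
    fin.read(&val[0], n);
    return val;
}
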
@@ -38,6 +38,9 @@ struct ggml_metal_context;
struct ggml_metal_context * ggml_metal_init(int n_cb);
void ggml_metal_free(struct ggml_metal_context * ctx);

void * ggml_metal_host_malloc(size_t n);
void   ggml_metal_host_free  (void * data);

// set the number of command buffers to use
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
							
								
								
									
ggml-metal.m (15 changed lines)
@@ -224,6 +224,21 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    free(ctx);
}

void * ggml_metal_host_malloc(size_t n) {
    void * data = NULL;
    const int result = posix_memalign((void **) &data, getpagesize(), n);
    if (result != 0) {
        fprintf(stderr, "%s: error: posix_memalign failed\n", __func__);
        return NULL;
    }

    return data;
}

void ggml_metal_host_free(void * data) {
    free(data);
}

void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
    ctx->n_cb = n_cb;
}
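The new host allocator returns page-aligned memory from posix_memalign and NULL on failure, and it should be released through the matching ggml_metal_host_free. A minimal usage sketch, assuming a GGML_USE_METAL build; the 16 MiB size is arbitrary:

#include <stdio.h>

#include "ggml-metal.h"

int main(void) {
    const size_t n = 16u*1024u*1024u;        // arbitrary 16 MiB buffer
    void * buf = ggml_metal_host_malloc(n);  // page-aligned, NULL on failure
    if (buf == NULL) {
        fprintf(stderr, "host malloc of %zu bytes failed\n", n);
        return 1;
    }
    // ... fill the buffer / register it with the Metal context ...
    ggml_metal_host_free(buf);               // release with the matching free
    return 0;
}
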
							
								
								
									
gguf-llama.cpp (989 changed lines)
File diff suppressed because it is too large
							
								
								
									
gguf-llama.h (28 changed lines)
@@ -41,10 +41,6 @@
#define LLAMA_SUPPORTS_GPU_OFFLOAD
#endif

#ifndef LLAMA_DEFAULT_RMS_EPS
#define LLAMA_DEFAULT_RMS_EPS 5e-6f
#endif

#ifdef __cplusplus
extern "C" {
#endif
@@ -74,12 +70,23 @@ extern "C" {

    typedef void (*llama_progress_callback)(float progress, void *ctx);

   struct llama_context_params {
    enum llama_log_level {
        LLAMA_LOG_LEVEL_ERROR = 2,
        LLAMA_LOG_LEVEL_WARN  = 3,
        LLAMA_LOG_LEVEL_INFO  = 4
    };

    // Signature for logging events
    // Note that text includes the new line character at the end for most events.
    // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
    // if it exists.
    // It might not exist for progress report where '.' is output repeatedly.
    typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);

    struct llama_context_params {
        uint32_t seed;         // RNG seed, -1 for random
        int32_t  n_ctx;        // text context
        int32_t  n_batch;      // prompt processing batch size
        int32_t  n_gqa;        // grouped-query attention (TEMP - will be moved to model hparams)
        float    rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
        int32_t  n_gpu_layers; // number of layers to store in VRAM
        int32_t  main_gpu;     // the GPU that is used for scratch and small tensors

@@ -96,6 +103,7 @@ extern "C" {

        // Keep the booleans together to avoid misalignment during copy-by-value.
        bool low_vram;   // if true, reduce VRAM usage at the cost of performance
        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
        bool f16_kv;     // use fp16 for KV cache
        bool logits_all; // the llama_eval() call computes all logits, not just the last one
        bool vocab_only; // only load the vocabulary, no weights
@@ -129,7 +137,7 @@ extern "C" {
    // model quantization parameters
    typedef struct llama_model_quantize_params {
        int nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
        enum llama_ftype   ftype;    // quantize to this llama_ftype
        enum llama_ftype ftype;      // quantize to this llama_ftype
        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
        bool quantize_output_tensor; // quantize output.weight
    } llama_model_quantize_params;
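The quantization parameters above form a small configuration struct; a minimal sketch of filling it in follows. It assumes the llama_model_quantize_default_params() and llama_model_quantize() entry points declared elsewhere in this header, and the file names are illustrative:

#include "gguf-llama.h"

// hypothetical helper, not part of this diff: configure and run the quantizer
static int quantize_example(void) {
    llama_model_quantize_params qparams = llama_model_quantize_default_params();

    qparams.nthread                = 0;                       // <= 0: use std::thread::hardware_concurrency()
    qparams.ftype                  = LLAMA_FTYPE_MOSTLY_Q4_0; // target quantization type
    qparams.allow_requantize       = false;                   // refuse already-quantized inputs
    qparams.quantize_output_tensor = true;                    // also quantize output.weight

    return llama_model_quantize("model-f16.gguf", "model-q4_0.gguf", &qparams);
}
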
@@ -182,6 +190,10 @@ extern "C" {
        int32_t n_eval;
    };

    // Set callback for all future logging events.
    // If this is not called, or NULL is supplied, everything is output on stderr.
    LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);

    LLAMA_API int llama_max_devices();

    LLAMA_API struct llama_context_params llama_context_default_params();
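The new logging hooks let callers route llama.cpp output to a custom sink. A minimal sketch of a user-side callback, with an illustrative name, that handles the trailing newline described in the llama_log_callback comment above:

#include <stdio.h>
#include <string.h>

#include "gguf-llama.h"

// hypothetical user-side logger: strip the trailing '\n' that most events
// carry, then tag the message with its level before writing to stderr
static void my_log_callback(enum llama_log_level level, const char * text, void * user_data) {
    (void) user_data;
    size_t len = strlen(text);
    if (len > 0 && text[len - 1] == '\n') {
        len--; // progress reports ('.') may arrive without a trailing newline
    }
    fprintf(stderr, "[llama %d] %.*s\n", (int) level, (int) len, text);
}

// during initialization; passing NULL instead keeps the default stderr output:
//     llama_log_set(my_log_callback, NULL);
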
							
								
								
									
gguf-util.h (97 changed lines)
@@ -64,13 +64,6 @@ static std::string format(const char * fmt, ...) {
    return std::string(buf.data(), size);
}

template<typename T>
static std::string to_string(const T & val) {
    std::stringstream ss;
    ss << val;
    return ss.str();
}

// TODO: can we merge this one and gguf_context?
struct gguf_file {
    // use FILE * so we don't have to re-open the file to mmap
@@ -474,94 +467,4 @@ struct gguf_mlock {
#endif
};

// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
struct gguf_buffer {
    uint8_t * addr = NULL;
    size_t size = 0;

    gguf_buffer() = default;

    void resize(size_t len) {
#ifdef GGML_USE_METAL
        free(addr);
        int result = posix_memalign((void **) &addr, getpagesize(), len);
        if (result == 0) {
            memset(addr, 0, len);
        }
        else {
            addr = NULL;
        }
#else
        delete[] addr;
        addr = new uint8_t[len];
#endif
        size = len;
    }

    ~gguf_buffer() {
#ifdef GGML_USE_METAL
        free(addr);
#else
        delete[] addr;
#endif
        addr = NULL;
    }

    // disable copy and move
    gguf_buffer(const gguf_buffer&) = delete;
    gguf_buffer(gguf_buffer&&) = delete;
    gguf_buffer& operator=(const gguf_buffer&) = delete;
    gguf_buffer& operator=(gguf_buffer&&) = delete;
};

#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
struct gguf_ctx_buffer {
    uint8_t * addr = NULL;
    bool is_cuda;
    size_t size = 0;

    gguf_ctx_buffer() = default;

    void resize(size_t size) {
        free();

        addr = (uint8_t *) ggml_cuda_host_malloc(size);
        if (addr) {
            is_cuda = true;
        }
        else {
            // fall back to pageable memory
            addr = new uint8_t[size];
            is_cuda = false;
        }
        this->size = size;
    }

    void free() {
        if (addr) {
            if (is_cuda) {
                ggml_cuda_host_free(addr);
            }
            else {
                delete[] addr;
            }
        }
        addr = NULL;
    }

    ~gguf_ctx_buffer() {
        free();
    }

    // disable copy and move
    gguf_ctx_buffer(const gguf_ctx_buffer&) = delete;
    gguf_ctx_buffer(gguf_ctx_buffer&&) = delete;
    gguf_ctx_buffer& operator=(const gguf_ctx_buffer&) = delete;
    gguf_ctx_buffer& operator=(gguf_ctx_buffer&&) = delete;
};
#else
typedef gguf_buffer gguf_ctx_buffer;
#endif

#endif
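The removed gguf_buffer/gguf_ctx_buffer (refactored into gguf-llama.cpp by this commit, per the commit message) are small RAII host buffers that prefer pinned memory when CUDA is available and fall back to a plain heap allocation otherwise. A minimal usage sketch, assuming a GGML_USE_CUBLAS build; the size is arbitrary:

// illustrative only: the type now lives in gguf-llama.cpp, not this header
static void ctx_buffer_example(void) {
    gguf_ctx_buffer buf;
    buf.resize(8 * 1024 * 1024);   // tries pinned ggml_cuda_host_malloc first
    if (buf.is_cuda) {
        // buf.addr is page-locked memory -> faster host-to-device copies
    } else {
        // fell back to a pageable new[] allocation
    }
    // ... stage tensor data in buf.addr ...
}   // the destructor releases the memory with the matching deallocator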