Make WIN32 mmap() improvements (#341)

Work in progress: the WIN32 mmap() path is not fully working yet. Closes #341
2025-11-06 09:46:50 +00:00 · 2023-03-21 01:46:44 +04:00
parent 0b5448a3a4
commit e4881686b4
5 changed files with 565 additions and 22 deletions
--- a/main.cpp
+++ b/main.cpp
@@ -1,3 +1,7 @@
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#define NOMINMAX
+#endif
+
 #include "ggml.h"

 #include "utils.h"
@@ -19,6 +23,10 @@
 #include <unistd.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
+#else
+#include <errno.h>
+#define msync(addr, len_bytes, flag) (winMSync((addr), (len_bytes)), 0) /* actually calls the flush helper; flag is unused; always yields 0 because winMSync returns void */
+#define MS_ASYNC 0
 #endif

 #define ROUNDUP(X, K) (((X) + (K)-1) & -(K))
@@ -96,6 +104,7 @@ struct llama_model {
    std::map<std::string, struct ggml_tensor *> tensors;
 };

+
 struct magic {
    uint32_t magic;
    std::atomic<unsigned> lock;
@@ -103,10 +112,37 @@ struct magic {
    size_t commit;
    size_t offset;
    size_t capacity;
-    gpt_vocab *vocab;
-    llama_model *model;
+    gpt_vocab* vocab;
+    llama_model* model;
 };

+static void winMSync(magic* addr, size_t len_bytes) {
+    bool success = FlushViewOfFile((void*)addr, len_bytes);
+    if (!success) {
+        LPVOID lpMsgBuf;
+        LPVOID lpDisplayBuf;
+        DWORD error_code = GetLastError();
+        FormatMessage(
+            FORMAT_MESSAGE_ALLOCATE_BUFFER |
+            FORMAT_MESSAGE_FROM_SYSTEM |
+            FORMAT_MESSAGE_IGNORE_INSERTS,
+            NULL,
+            error_code,
+            MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+            (LPTSTR)&lpMsgBuf,
+            0, NULL);
+        lpDisplayBuf = (LPVOID)LocalAlloc(LMEM_ZEROINIT,
+            (lstrlen((LPCTSTR)lpMsgBuf) + 40) * sizeof(TCHAR));
+        StringCchPrintf((LPTSTR)lpDisplayBuf,
+            LocalSize(lpDisplayBuf) / sizeof(TCHAR),
+            TEXT("failed with error %d: %s"),
+            error_code, lpMsgBuf);
+    }
+    HANDLE hFile = (HANDLE)_get_osfhandle(addr->fd);
+    FlushFileBuffers(hFile);
+}
+
+
 static struct magic *mag;

 static inline void spin_lock(std::atomic<unsigned> &lock) {
@@ -129,17 +165,26 @@ static void magic_commit(void) {
    mag->offset = mag->capacity;
    mag->commit = mag->capacity;
    mag->magic = 0xFEEDABEE;
-    msync(mag, mag->commit, MS_ASYNC);
+    bool success = msync(mag, mag->commit, MS_ASYNC);   
 }

 static void magic_init(void) {
    int fd;
    size_t n;
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
    struct stat st;
+#else
+    struct _stat64 st;
+#endif
    if (mag) return;
    n = ROUNDUP(sizeof(struct magic), MAGIC_GRAN);
    if ((fd = open(MAGIC_PATH, O_RDWR)) != -1) {
-        fstat(fd, &st);
+        int result = fstat(fd, &st);
+        int error = errno;
+        if (errno == EBADF)
+            fprintf(stderr, "Bad file descriptor.\n");
+        else if (errno == EINVAL)
+            fprintf(stderr, "Invalid argument to _fstat.\n");
        if (st.st_size >= n) {
            mag = (struct magic *)Mmap(MAGIC_ADDR, n,
                                       PROT_READ | PROT_WRITE,
@@ -182,9 +227,9 @@ void *memalign(size_t a, size_t n) {
    i = i + sizeof(size_t);
    i = ROUNDUP(i, a);
    j = ROUNDUP(i + m, MAGIC_GRAN);
-    if (j > mag->capacity) {
+    //if (j > mag->capacity) {
        if (!mag->magic) {
-            ftruncate(mag->fd, j);
+            int result = ftruncate(mag->fd, j);
            p = mmap(MAGIC_ADDR + mag->capacity,
                     j - mag->capacity, PROT_READ | PROT_WRITE,
                     MAP_SHARED | MAP_FIXED, mag->fd, mag->capacity);
@@ -199,7 +244,7 @@ void *memalign(size_t a, size_t n) {
            spin_unlock(mag->lock);
            return 0;
        }
-    }
+    //}
    mag->offset = i + m;
    spin_unlock(mag->lock);
    p = MAGIC_ADDR + i;
@@ -207,7 +252,7 @@ void *memalign(size_t a, size_t n) {
    return p;
 }

-void *malloc(size_t n) {
+void *_malloc(size_t n) {  // allocates n bytes by forwarding to memalign with MAGIC_ALGN alignment
    return memalign(MAGIC_ALGN, n);
 }

@@ -215,33 +260,53 @@ size_t malloc_usable_size(const void *p) {
    return ((const size_t *)p)[-1];
 }

-void *calloc(size_t n, size_t z) {
+void *_calloc(size_t n, size_t z) {  // zero-filled block of n*z bytes, or null on failure
-    void *p;
+    void *p = 0;  // stays null when n*z would overflow size_t
-    if ((p = malloc((n *= z)))) {
+    if (!(z && n > (size_t)-1 / z) && (p = _malloc((n *= z)))) {  // n becomes the byte count
        memset(p, 0, n);
    }
    return p;
 }

-void free(void *p) {
+void _free(void *p) {  // deliberate no-op: p is ignored and nothing is released
    // do nothing
 }

-void *realloc(void *p, size_t n) {
+void *_realloc(void *p, size_t n) {  // null p: plain allocation; n == 0: returns null; otherwise copies into a new n-byte block
    void *q;
    if (!p) {
-        return malloc(n);
+        return _malloc(n);
    }
    if (!n) {
-        free(p);
+        _free(p);
        return 0;
    }
-    if ((q = malloc(n))) {
+    if ((q = _malloc(n))) {  // the old block is left in place; only the contents move
-        memcpy(q, p, ((const size_t *)p)[-1]);
+        memcpy(q, p, n < ((const size_t *)p)[-1] ? n : ((const size_t *)p)[-1]);  // copy min(new, old) so a shrink cannot overrun q
    }
    return q;
 }

+// Redirect the allocator calls that follow in this file to the _malloc,
+// _calloc, _realloc and _free definitions above. #undef is harmless when
+// the name is not currently a macro, so no defined() guard is required.
+// Caveat: a function-like macro also rewrites qualified calls, e.g.
+// std::malloc(n) becomes std::_malloc(n); use unqualified calls below.
+#undef malloc
+#define malloc(n) _malloc(n)
+
+// calloc takes (count, size), so the macro must forward two arguments.
+#undef calloc
+#define calloc(n, z) _calloc((n), (z))
+
+// realloc takes (pointer, new size): two arguments as well.
+#undef realloc
+#define realloc(p, n) _realloc((p), (n))
+
+// free maps to _free, which leaves the block untouched.
+#undef free
+#define free(p) _free(p)
+
 // load the model's weights from a file
 bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
    fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
@@ -707,7 +772,7 @@ bool llama_eval(
    const int d_key = n_embd/n_head;

    static size_t buf_size = 512u*1024*1024;
-    static void * buf = malloc(buf_size);
+    static void * buf = _malloc(buf_size);

    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
@@ -715,7 +780,7 @@ bool llama_eval(

        // reallocate
        buf_size = buf_size_new;
-        buf = realloc(buf, buf_size);
+        buf = _realloc(buf, buf_size);
        if (buf == nullptr) {
            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
            return false;