Merge branch 'master' into gguf

2025-11-04 09:32:00 +00:00 · 2023-08-18 12:38:05 +03:00
parent aa3efe87c8 604b8bdfa6
commit 856afff746
6 changed files with 1717 additions and 979 deletions
--- a/README.md
+++ b/README.md
@@ -238,12 +238,17 @@ In order to build llama.cpp you have three different options.
    cmake --build . --config Release
    ```
- Using `Zig`:
+- Using `Zig` (version 0.11 or later):
    Building for optimization levels and CPU features can be accomplished using standard build arguments, for example AVX2, FMA, F16C,
    it's also possible to cross compile for other operating systems and architectures:
    ```bash
-    zig build -Doptimize=ReleaseFast
+    zig build -Doptimize=ReleaseFast -Dtarget=x86_64-windows-gnu -Dcpu=x86_64+avx2+fma+f16c
    ```
    The `zig targets` command will give you valid options to use.
 -   Using `gmake` (FreeBSD):
    1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
@@ -408,7 +413,7 @@ Building the program with BLAS support may lead to some performance improvements
  |-------------------------|------------------------|---------|-------------|
  | LLAMA_CUDA_FORCE_DMMV   | Boolean                |   false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
  | LLAMA_CUDA_DMMV_X       | Positive integer >= 32 |      32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-  | LLAMA_CUDA_MMV_Y        | Positive integer       |       1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
+  | LLAMA_CUDA_MMV_Y        | Positive integer       |       1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
  | LLAMA_CUDA_F16          | Boolean                |   false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
  | LLAMA_CUDA_KQUANTS_ITER | 1 or 2                 |       2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
--- a/build.zig
+++ b/build.zig
@@ -1,5 +1,6 @@
 // Compatible with Zig Version 0.11.0
 const std = @import("std");
 const ArrayList = std.ArrayList;
 const Compile = std.Build.Step.Compile;
 const ConfigHeader = std.Build.Step.ConfigHeader;
 const Mode = std.builtin.Mode;
@@ -10,11 +11,31 @@ const Maker = struct {
    target: CrossTarget,
    optimize: Mode,
    config_header: *ConfigHeader,
    enable_lto: bool,
-    const cflags = .{"-std=c11"};
+    include_dirs: ArrayList([]const u8),
-    const cxxflags = .{"-std=c++11"};
+    cflags: ArrayList([]const u8),
    cxxflags: ArrayList([]const u8),
    objs: ArrayList(*Compile),
-    fn init(builder: *std.build.Builder) Maker {
+    fn addInclude(m: *Maker, dir: []const u8) !void {
        try m.include_dirs.append(dir);
    }
    fn addProjectInclude(m: *Maker, path: []const []const u8) !void {
        try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path));
    }
    fn addCFlag(m: *Maker, flag: []const u8) !void {
        try m.cflags.append(flag);
    }
    fn addCxxFlag(m: *Maker, flag: []const u8) !void {
        try m.cxxflags.append(flag);
    }
    fn addFlag(m: *Maker, flag: []const u8) !void {
        try m.addCFlag(flag);
        try m.addCxxFlag(flag);
    }
    fn init(builder: *std.build.Builder) !Maker {
        const commit_hash = @embedFile(".git/refs/heads/master");
        const config_header = builder.addConfigHeader(
            .{ .style = .blank, .include_path = "build-info.h" },
@@ -23,58 +44,71 @@ const Maker = struct {
                .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline
            },
        );
-        return Maker{
+        var m = Maker{
            .builder = builder,
            .target = builder.standardTargetOptions(.{}),
            .optimize = builder.standardOptimizeOption(.{}),
            .config_header = config_header,
            .enable_lto = false,
            .include_dirs = ArrayList([]const u8).init(builder.allocator),
            .cflags = ArrayList([]const u8).init(builder.allocator),
            .cxxflags = ArrayList([]const u8).init(builder.allocator),
            .objs = ArrayList(*Compile).init(builder.allocator),
        };
        try m.addCFlag("-std=c11");
        try m.addCxxFlag("-std=c++11");
        try m.addProjectInclude(&.{});
        try m.addProjectInclude(&.{"examples"});
        return m;
    }
    fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
        const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
        if (std.mem.endsWith(u8, src, ".c")) {
-            o.addCSourceFiles(&.{src}, &cflags);
+            o.addCSourceFiles(&.{src}, m.cflags.items);
            o.linkLibC();
        } else {
-            o.addCSourceFiles(&.{src}, &cxxflags);
+            o.addCSourceFiles(&.{src}, m.cxxflags.items);
            o.linkLibCpp();
        }
-        o.addIncludePath(.{ .path = "." });
+        for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
-        o.addIncludePath(.{ .path = "./examples" });
+        o.want_lto = m.enable_lto;
        return o;
    }
    fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
        const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
-        e.addIncludePath(.{ .path = "." });
+        e.addCSourceFiles(&.{src}, m.cxxflags.items);
        e.addIncludePath(.{ .path = "./examples" });
        e.addCSourceFiles(&.{src}, &cxxflags);
        for (deps) |d| e.addObject(d);
        for (m.objs.items) |o| e.addObject(o);
        for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
        e.linkLibC();
        e.linkLibCpp();
        e.addConfigHeader(m.config_header);
        m.builder.installArtifact(e);
-
+        e.want_lto = m.enable_lto;
        // Currently a bug is preventing correct linking for optimized builds for Windows:
        // https://github.com/ziglang/zig/issues/15958
        if (e.target.isWindows()) {
            e.want_lto = false;
        }
        return e;
    }
 };
-pub fn build(b: *std.build.Builder) void {
+pub fn build(b: *std.build.Builder) !void {
-    const make = Maker.init(b);
+    var make = try Maker.init(b);
    make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
    if (b.option(bool, "k-quants", "Enable K-quants, (default: true)") orelse true) {
        try make.addFlag("-DGGML_USE_K_QUANTS");
        const k_quants = make.obj("k_quants", "k_quants.c");
        try make.objs.append(k_quants);
    }
    const ggml = make.obj("ggml", "ggml.c");
    const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
    const llama = make.obj("llama", "llama.cpp");
    const common = make.obj("common", "examples/common.cpp");
    const console = make.obj("common", "examples/console.cpp");
    const grammar_parser = make.obj("grammar-parser", "examples/grammar-parser.cpp");
-    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser });
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, console, grammar_parser });
    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama });
    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common });
    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common });
--- a/examples/server/index.html.hpp
+++ b/examples/server/index.html.hpp
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -170,6 +170,136 @@
      grammar: '',
    })
    /* START: Support for storing prompt templates and parameters in borwser LocalStorage */
    const local_storage_storageKey = "llamacpp_server_local_storage";
    function local_storage_setDataFromObject(tag, content) {
      localStorage.setItem(local_storage_storageKey + '/' + tag, JSON.stringify(content));
    }
    function local_storage_setDataFromRawText(tag, content) {
      localStorage.setItem(local_storage_storageKey + '/' + tag, content);
    }
    function local_storage_getDataAsObject(tag) {
      const item = localStorage.getItem(local_storage_storageKey + '/' + tag);
      if (!item) {
        return null;
      } else {
        return JSON.parse(item);
      }
    }
    function local_storage_getDataAsRawText(tag) {
      const item = localStorage.getItem(local_storage_storageKey + '/' + tag);
      if (!item) {
        return null;
      } else {
        return item;
      }
    }
    // create a container for user templates and settings
    const savedUserTemplates = signal({})
    const selectedUserTemplate = signal({ name: '', template: { session: {}, params: {} } })
    // let's import locally saved templates and settings if there are any
    // user templates and settings are stored in one object
    // in form of { "templatename": "templatedata" } and { "settingstemplatename":"settingsdata" }
    console.log('Importing saved templates')
    let importedTemplates = local_storage_getDataAsObject('user_templates')
    if (importedTemplates) {
      // saved templates were successfuly imported.
      console.log('Processing saved templates and updating default template')
      //console.log(importedTemplates);
      savedUserTemplates.value = importedTemplates;
      //override default template
      savedUserTemplates.value.default = { session: session.value, params: params.value }
      local_storage_setDataFromObject('user_templates', savedUserTemplates.value)
    } else {
      // no saved templates detected.
      console.log('Initializing LocalStorage and saving default template')
      savedUserTemplates.value = { "default": { session: session.value, params: params.value } }
      local_storage_setDataFromObject('user_templates', savedUserTemplates.value)
    }
    function userTemplateResetToDefault() {
      console.log('Reseting themplate to default')
      selectedUserTemplate.value.name = 'default';
      selectedUserTemplate.value.data = savedUserTemplates.value['default'];
    }
    function userTemplateApply(t) {
      session.value = t.data.session;
      params.value = t.data.params;
    }
    function userTemplateResetToDefaultAndApply() {
      userTemplateResetToDefault()
      userTemplateApply(selectedUserTemplate.value)
    }
    function userTemplateLoadAndApplyAutosaved() {
      // get autosaved last used template
      let lastUsedTemplate = local_storage_getDataAsObject('user_templates_last')
      if (lastUsedTemplate) {
        console.log('Autosaved template found, restoring')
        selectedUserTemplate.value = lastUsedTemplate
      }
      else {
        console.log('No autosaved template found, using default template')
        // no autosaved last used template was found, so load from default.
        userTemplateResetToDefault()
      }
      console.log('Applying template')
      // and update internal data from templates
      userTemplateApply(selectedUserTemplate.value)
    }
    //console.log(savedUserTemplates.value)
    //console.log(selectedUserTemplate.value)
    function userTemplateAutosave() {
      console.log('Template Autosave...')
      if (selectedUserTemplate.value.name == 'default') {
        // we don't want to save over default template, so let's create a new one
        let newTemplateName = 'UserTemplate-' + Date.now().toString()
        let newTemplate = { 'name': newTemplateName, 'data': { 'session': session.value, 'params': params.value } }
        console.log('Saving as ' + newTemplateName)
        // save in the autosave slot
        local_storage_setDataFromObject('user_templates_last', newTemplate)
        // and load it back and apply
        userTemplateLoadAndApplyAutosaved()
      } else {
        local_storage_setDataFromObject('user_templates_last', { 'name': selectedUserTemplate.value.name, 'data': { 'session': session.value, 'params': params.value } })
      }
    }
    console.log('Checking for autosaved last used template')
    userTemplateLoadAndApplyAutosaved()
    /* END: Support for storing prompt templates and parameters in browsers LocalStorage */
    const llamaStats = signal(null)
    const controller = signal(null)
@@ -346,8 +476,34 @@
        `
      };
      const userTemplateReset = (e) => {
        e.preventDefault();
        userTemplateResetToDefaultAndApply()
      }
      const UserTemplateResetButton = () => {
        if (selectedUserTemplate.value.name == 'default') {
          return html`
            <button disabled>Using default template</button>
          `
        }
        return html`
          <button onclick=${userTemplateReset}>Reset all to default</button>
        `
      };
      useEffect(() => {
        // autosave template on every change
        userTemplateAutosave()
      }, [session.value, params.value])
      return html`
        <form>
          <fieldset>
            <${UserTemplateResetButton}/>
          </fieldset>
          <fieldset>
            <div>
              <label for="prompt">Prompt</label>
--- a/llama.cpp
+++ b/llama.cpp
@@ -2574,37 +2574,81 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
 // grammar - internal
 //
 struct llama_partial_utf8 {
    uint32_t value;    // bit value so far (unshifted)
    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
 };
 struct llama_grammar {
    const std::vector<std::vector<llama_grammar_element>>   rules;
    std::vector<std::vector<const llama_grammar_element *>> stacks;
    // buffer for partially generated UTF-8 sequence from accepted tokens
    llama_partial_utf8                                      partial_utf8;
 };
 struct llama_grammar_candidate {
    size_t               index;
    const uint32_t     * code_points;
    llama_partial_utf8   partial_utf8;
 };
-// NOTE: assumes valid utf8 (but checks for overrun)
+// Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
-// adds a terminating 0 for use as pointer
+// pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
-std::vector<uint32_t> decode_utf8(const char * src) {
+std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
-    static const int      lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+        const char         * src,
        llama_partial_utf8   partial_start) {
    static const int      lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
    const char          * pos      = src;
    std::vector<uint32_t> code_points;
    uint32_t              value    = partial_start.value;
    int                   n_remain = partial_start.n_remain;
    // continue previous decode, if applicable
    while (*pos != 0 && n_remain > 0) {
        uint8_t next_byte = static_cast<uint8_t>(*pos);
        if ((next_byte >> 6) != 2) {
            // invalid sequence, abort
            code_points.push_back(0);
            return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, -1 });
        }
        value = (value << 6) + (next_byte & 0x3F);
        ++pos;
        --n_remain;
    }
    if (partial_start.n_remain > 0 && n_remain == 0) {
        code_points.push_back(value);
    }
    // decode any subsequent utf-8 sequences, which may end in an incomplete one
    while (*pos != 0) {
        uint8_t  first_byte = static_cast<uint8_t>(*pos);
        uint8_t  highbits   = first_byte >> 4;
-        int      len        = lookup[highbits];
+                 n_remain   = lookup[highbits] - 1;
-        uint8_t  mask       = (1 << (8 - len)) - 1;
+
-        uint32_t value      = first_byte & mask;
+        if (n_remain < 0) {
-        const char * end    = pos + len; // may overrun!
+            // invalid sequence, abort
-        ++pos;
+            code_points.clear();
-        for ( ; pos < end && *pos != 0; ++pos) {
+            code_points.push_back(0);
-            value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
+            return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, n_remain });
        }
        uint8_t  mask       = (1 << (7 - n_remain)) - 1;
                 value      = first_byte & mask;
        ++pos;
        while (*pos != 0 && n_remain > 0) {
            value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
            ++pos;
            --n_remain;
        }
        if (n_remain == 0) {
            code_points.push_back(value);
        }
    }
    code_points.push_back(0);
-    return code_points;
+
    return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
 }
 // returns true iff pos points to the end of one of the definitions of a rule
@@ -2641,6 +2685,56 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
    return std::make_pair(found == is_positive_char, pos);
 }
 // returns true iff some continuation of the given partial UTF-8 sequence could satisfy the char
 // range at pos (regular or inverse range)
 // asserts that pos is pointing to a char range element
 static bool llama_grammar_match_partial_char(
        const llama_grammar_element * pos,
        const llama_partial_utf8      partial_utf8) {
    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
    GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
    uint32_t partial_value = partial_utf8.value;
    int      n_remain      = partial_utf8.n_remain;
    // invalid sequence or 7-bit char split across 2 bytes (overlong)
    if (n_remain < 0 || (n_remain == 1 && partial_value < 2)) {
        return false;
    }
    // range of possible code points this partial UTF-8 sequence could complete to
    uint32_t low  = partial_value << (n_remain * 6);
    uint32_t high = low | ((1 << (n_remain * 6)) - 1);
    if (low == 0) {
        if (n_remain == 2) {
            low = 1 << 11;
        } else if (n_remain == 3) {
            low = 1 << 16;
        }
    }
    do {
        if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
            // inclusive range, e.g. [a-z]
            if (pos->value <= high && low <= pos[1].value) {
                return is_positive_char;
            }
            pos += 2;
        } else {
            // exact char match, e.g. [a] or "a"
            if (low <= pos->value && pos->value <= high) {
                return is_positive_char;
            }
            pos += 1;
        }
    } while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
    return !is_positive_char;
 }
 // transforms a grammar pushdown stack into N possible stacks, all ending
 // at a character range (terminal element)
 static void llama_grammar_advance_stack(
@@ -2741,8 +2835,11 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
    std::vector<llama_grammar_candidate> rejects;
    if (stack.empty()) {
-        // accept nothing; EOS is handled elsewhere
+        for (auto tok : candidates) {
-        rejects.insert(rejects.end(), candidates.begin(), candidates.end());
+            if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
                rejects.push_back(tok);
            }
        }
        return rejects;
    }
@@ -2750,10 +2847,15 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
    std::vector<llama_grammar_candidate> next_candidates;
    for (auto tok : candidates) {
-        if (llama_grammar_match_char(stack_pos, tok.code_points[0]).first) {
+        if (*tok.code_points == 0) {
-            if (tok.code_points[1] != 0) {
+            // reached end of full codepoints in token, reject iff it ended in a partial sequence
-                next_candidates.push_back({ tok.index, tok.code_points + 1 });
+            // that cannot satisfy this position in grammar
            if (tok.partial_utf8.n_remain != 0 &&
                    !llama_grammar_match_partial_char(stack_pos, tok.partial_utf8)) {
                rejects.push_back(tok);
            }
        } else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
            next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 });
        } else {
            rejects.push_back(tok);
        }
@@ -2771,7 +2873,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
    auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
    for (auto tok : next_rejects) {
-        rejects.push_back({ tok.index, tok.code_points - 1 });
+        rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
    }
    return rejects;
@@ -2836,7 +2938,7 @@ struct llama_grammar * llama_grammar_init(
        }
    } while (true);
-    return new llama_grammar{ std::move(vec_rules), std::move(stacks) };
+    return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
 }
 void llama_grammar_free(struct llama_grammar * grammar) {
@@ -3141,7 +3243,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
    const llama_token eos = llama_token_eos();
-    std::vector<std::vector<uint32_t>>   candidates_decoded;
+    std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
    std::vector<llama_grammar_candidate>                              candidates_grammar;
    for (size_t i = 0; i < candidates->size; ++i) {
@@ -3154,8 +3256,8 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
        } else if (str.empty()) {
            candidates->data[i].logit = -INFINITY;
        } else {
-            candidates_decoded.push_back(decode_utf8(str.c_str()));
+            candidates_decoded.push_back(decode_utf8(str.c_str(), grammar->partial_utf8));
-            candidates_grammar.push_back({ i, candidates_decoded.back().data() });
+            candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
        }
    }
@@ -3354,12 +3456,15 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
        GGML_ASSERT(false);
    }
-    std::string str = llama_token_to_str(ctx, token);
+    const std::string str = llama_token_to_str(ctx, token);
    // Note terminating 0 in decoded string
-    auto code_points = decode_utf8(str.c_str());
+    const auto   decoded     = decode_utf8(str.c_str(), grammar->partial_utf8);
    const auto & code_points = decoded.first;
    for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
        grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
    }
    grammar->partial_utf8 = decoded.second;
    GGML_ASSERT(!grammar->stacks.empty());
    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
--- a/tests/test-llama-grammar.cpp
+++ b/tests/test-llama-grammar.cpp
@@ -199,7 +199,7 @@ int main()
        uint32_t *cp = new uint32_t[2]; // dynamically allocate memory for code_point
        cp[0] = 37 + i;
        cp[1] = 0;
-        next_candidates[i] = {i, cp};
+        next_candidates[i] = {i, cp, {}};
    }
    std::vector<std::vector<std::pair<uint32_t, uint16_t>>> expected_reject = {