llama : tokenizer fixes (#2549)

* Merge tokenizer fixes into the gguf branch. * Add test vocabularies
2025-11-06 09:46:50 +00:00 · 2023-08-14 18:30:28 +02:00
parent 8af3a99ff1
commit ec1b100720
17 changed files with 612 additions and 147 deletions
--- a/llama.cpp
+++ b/llama.cpp
@@ -7,6 +7,7 @@
 #endif

 #include "llama-util.h"
+#define LLAMA_API_CPP // TODO: eliminate me
 #include "llama.h"

 #include "ggml.h"
@@ -575,6 +576,7 @@ struct llama_file_loader {
            float score = 0.0f;
            file.read_raw(&score, sizeof(score));

+            GGML_ASSERT(vocab.token_to_id.find(word) == vocab.token_to_id.end());
            vocab.token_to_id[word] = i;

            auto & tok_score = vocab.id_to_token[i];
@@ -1060,6 +1062,11 @@ static void llama_model_load_internal(
    std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));

    vocab = std::move(ml->file_loader->vocab);
+
+    if (vocab_only) {
+        return;
+    }
+
    model.hparams = ml->file_loader->hparams;
    model.n_gpu_layers = n_gpu_layers;
    llama_file_version file_version = ml->file_loader->file_version;
@@ -1141,10 +1148,6 @@ static void llama_model_load_internal(
        }
    }

-    if (vocab_only) {
-        return;
-    }
-
    auto & ctx = model.ctx;

    size_t ctx_size;
@@ -1940,6 +1943,105 @@ static bool llama_eval_internal(
 // tokenizer
 //

+static std::string llama_vocab_type(const llama_vocab& vocab) {
+    return vocab.token_to_id.size() == 32000 ? "spm": "bpe";
+}
+
+static bool llama_is_normal_token(const llama_vocab& vocab, llama_token token) {
+    if(llama_vocab_type(vocab) == "spm")
+        return token >= 259;
+    else if(llama_vocab_type(vocab) == "bpe")
+        return token >= 95;
+    else
+        return false;
+}
+
+static bool llama_is_unknown_token(const llama_vocab& vocab, llama_token token) {
+    if(llama_vocab_type(vocab) == "spm")
+        return token == 0;
+    else
+        // TODO: improve?
+        return false;
+}
+
+static bool llama_is_control_token(const llama_vocab& vocab, llama_token token) {
+    if(llama_vocab_type(vocab) == "spm")
+        return token == 1 || token == 2;
+    else
+        // TODO: improve?
+        return false;
+}
+
+static bool llama_is_bos_token(const llama_vocab& vocab, llama_token token) {
+    if(llama_vocab_type(vocab) == "spm")
+        return token == 1;
+    else
+        // TODO: improve?
+        return false;
+}
+
+static bool llama_is_eos_token(const llama_vocab& vocab, llama_token token) {
+    if(llama_vocab_type(vocab) == "spm")
+        return token == 2;
+    else
+        // TODO: improve?
+        return false;
+}
+
+static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token token) {
+    // TODO: improve?
+    return false;
+}
+
+static bool llama_is_unused_token(const llama_vocab& vocab, llama_token token) {
+    // TODO: improve?
+    return false;
+}
+
+static bool llama_is_byte_token(const llama_vocab& vocab, llama_token token) {
+    if(llama_vocab_type(vocab) == "spm")
+        return 3 <= token && token < 259;
+    else if(llama_vocab_type(vocab) == "bpe")
+        return 1 <= token && token < 95;
+    else
+        return false;
+}
+
+static uint8_t llama_byte_to_char(const llama_vocab& vocab, uint8_t byte) {
+    if(llama_vocab_type(vocab) == "spm")
+        return byte + 3;
+    else if(llama_vocab_type(vocab) == "bpe")
+        return byte + 32;
+    else
+        return false;
+}
+
+static std::string llama_escape_whitespace(const std::string& text) {
+    std::string result;
+    bool escaping = false;
+    result += "\xe2\x96\x81";
+    for (size_t offs = 0; offs < text.length(); ++offs) {
+        if (text[offs] == ' ') {
+            if (!escaping) {
+                result += "\xe2\x96\x81";
+                escaping = true;
+            }
+        }
+        else {
+            escaping = false;
+            result += text[offs];
+        }
+    }
+    return result;
+}
+
+static std::string llama_unescape_whitespace(const std::string& word) {
+    if (word.length() >= 3 && word.substr(0, 3) == "\xe2\x96\x81") {
+        return std::string(" ") + word.substr(3);
+    }
+    return word;
+}
+
 static size_t utf8_len(char src) {
    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
@@ -1981,10 +2083,11 @@ struct llama_tokenizer {
        size_t offs = 0;
        while (offs < text.size()) {
            llama_sp_symbol sym;
-            size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
+            size_t len = utf8_len(text[offs]);
+            GGML_ASSERT(offs + len <= text.size());
            sym.text = text.c_str() + offs;
-            sym.n = char_len;
-            offs += char_len;
+            sym.n = len;
+            offs += len;
            sym.prev = index - 1;
            sym.next = offs == text.size() ? -1 : index + 1;
            index++;
@@ -2029,23 +2132,36 @@ struct llama_tokenizer {

        for (int i = 0; i != -1; i = symbols_[i].next) {
            auto & symbol = symbols_[i];
-            auto token = vocab_.token_to_id.find(std::string(symbol.text, symbol.n));
-
-            if (token == vocab_.token_to_id.end()) {
-                // output any symbols that did not form tokens as bytes.
-                for (int j = 0; j < (int) symbol.n; ++j) {
-                    // NOTE: old version, before #2420 - not sure what are the implications of this
-                    //llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
-                    llama_vocab::id token_id = vocab_.token_to_id.at(std::string(1, symbol.text[j]));
-                    output.push_back(token_id);
-                }
-            } else {
-                output.push_back((*token).second);
-            }
+            resegment(symbol, output);
        }
    }

 private:
+    void resegment(llama_sp_symbol &symbol, std::vector<llama_vocab::id> &output) {
+        auto text = std::string(symbol.text, symbol.n);
+        auto token = vocab_.token_to_id.find(text);
+
+        // Do we need to support is_unused?
+        if (token != vocab_.token_to_id.end()) {
+            output.push_back((*token).second);
+            return;
+        }
+
+        const auto p = rev_merge.find(text);
+
+        if (p == rev_merge.end()) {
+            // output any symbols that did not form tokens as bytes.
+            for (int j = 0; j < (int)symbol.n; ++j) {
+                llama_vocab::id token_id = llama_byte_to_char(vocab_, symbol.text[j]);
+                output.push_back(token_id);
+            }
+            return;
+        }
+
+        resegment(symbols_[p->second.first], output);
+        resegment(symbols_[p->second.second], output);
+    }
+
    void try_add_bigram(int left, int right) {
        if (left == -1 || right == -1) {
            return;
@@ -2070,18 +2186,22 @@ private:
        bigram.score = tok_score.score;
        bigram.size = text.size();
        work_queue_.push(bigram);
+
+        // Do we need to support is_unused?
+        rev_merge[text] = std::make_pair(left, right);
    }

    const llama_vocab & vocab_;
    std::vector<llama_sp_symbol> symbols_;
    llama_sp_bigram::queue work_queue_;
+    std::map<std::string, std::pair<int, int> > rev_merge;
 };

-static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos) {
+static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape) {
    llama_tokenizer tokenizer(vocab);
    std::vector<llama_vocab::id> output;

-    if (text.empty()) {
+    if (raw_text.empty()) {
        return output;
    }

@@ -2089,6 +2209,13 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
        output.push_back(llama_token_bos());
    }

+    std::string text;
+    if (escape) {
+        text = llama_escape_whitespace(raw_text);
+    } else {
+        text = raw_text;
+    }
+
    tokenizer.tokenize(text, output);
    return output;
 }
@@ -2670,15 +2797,15 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c

    for (size_t i = 0; i < candidates->size; ++i) {
        const llama_token id  = candidates->data[i].id;
-        const char *      str = llama_token_to_str(ctx, id);
+        std::string       str = llama_token_to_str(ctx, id);
        if (id == eos) {
            if (!allow_eos) {
                candidates->data[i].logit = -INFINITY;
            }
-        } else if (*str == 0) {
+        } else if (str.empty()) {
            candidates->data[i].logit = -INFINITY;
        } else {
-            candidates_decoded.push_back(decode_utf8(str));
+            candidates_decoded.push_back(decode_utf8(str.c_str()));
            candidates_grammar.push_back({ i, candidates_decoded.back().data() });
        }
    }
@@ -2879,9 +3006,9 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
        LLAMA_ASSERT(false);
    }

-    const char * str = llama_token_to_str(ctx, token);
+    std::string str = llama_token_to_str(ctx, token);
    // Note terminating 0 in decoded string
-    auto code_points = decode_utf8(str);
+    auto code_points = decode_utf8(str.c_str());
    for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
        grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
    }
@@ -4132,7 +4259,8 @@ int llama_tokenize_with_model(
                 llama_token * tokens,
                         int   n_max_tokens,
                        bool   add_bos) {
-    auto res = llama_tokenize(model->vocab, text, add_bos);
+    auto escape = llama_vocab_type(model->vocab) == "spm";
+    auto res = llama_tokenize(model->vocab, text, add_bos, escape);

    if (n_max_tokens < (int) res.size()) {
        LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
@@ -4155,6 +4283,62 @@ int llama_tokenize(
    return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
 }

+std::vector<llama_token> llama_tokenize(
+        struct llama_context * ctx,
+           const std::string & text,
+                        bool   add_bos) {
+    int length = text.length() + add_bos;
+    std::vector<llama_token> result(length);
+    length = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+    if (length < 0) {
+        result.resize(-length);
+        int check = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+        assert(check == -length);
+        GGML_UNUSED(check);
+    } else {
+        result.resize(length);
+    }
+    return result;
+}
+
+int llama_tokenize_bpe(
+        struct llama_context * ctx,
+                  const char * text,
+                 llama_token * tokens,
+                         int   n_max_tokens,
+                        bool   add_bos) {
+    auto res = llama_tokenize(ctx->model.vocab, text, add_bos, false);
+
+    if (n_max_tokens < (int) res.size()) {
+        LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
+        return -((int) res.size());
+    }
+
+    for (size_t i = 0; i < res.size(); i++) {
+        tokens[i] = res[i];
+    }
+
+    return res.size();
+}
+
+std::vector<llama_token> llama_tokenize_bpe(
+        struct llama_context * ctx,
+           const std::string & text,
+                        bool   add_bos) {
+    int length = text.length() + add_bos;
+    std::vector<llama_token> result(length);
+    length = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
+    if (length < 0) {
+        result.resize(-length);
+        int check = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
+        assert(check == -length);
+        GGML_UNUSED(check);
+    } else {
+        result.resize(length);
+    }
+    return result;
+}
+
 int llama_n_vocab_from_model(const struct llama_model * model) {
    return model->vocab.id_to_token.size();
 }
@@ -4208,16 +4392,88 @@ float * llama_get_embeddings(struct llama_context * ctx) {
    return ctx->embedding.data();
 }

-const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) {
-    if (token >= llama_n_vocab_from_model(model)) {
-        return nullptr;
+int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * str, int length) {
+    if (0 <= token && token < llama_n_vocab_from_model(model)) {
+        if (llama_is_normal_token(model->vocab, token)) {
+            std::string result = model->vocab.id_to_token[token].tok;
+            if(llama_vocab_type(model->vocab) == "spm") {
+                result = llama_unescape_whitespace(result);
+            }
+            if(result.length() > length) {
+                return - result.length();
+            }
+            strcpy(str, result.c_str());
+            return result.length();
+        } else if (llama_is_unknown_token(model->vocab, token)) {
+            if(3 > length) {
+                return -3;
+            }
+            strcpy(str, "\xe2\x96\x85");
+            return 3;
+        } else if (llama_is_control_token(model->vocab, token)) {
+            ;
+        } else if (llama_is_byte_token(model->vocab, token)) {
+            if(1 > length) {
+                return -1;
+            }
+            str[0] = llama_byte_to_char(model->vocab, token);
+            str[1] = 0x00;
+            return 1;
+        }
    }
-
-    return model->vocab.id_to_token[token].tok.c_str();
+    return 0;
 }

-const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
-    return llama_token_to_str_with_model(&ctx->model, token);
+int llama_token_to_str(const struct llama_context * ctx, llama_token token, char * str, int length) {
+    return llama_token_to_str_with_model(&ctx->model, token, str, length);
+}
+
+std::string llama_token_to_str(
+        const struct llama_context * ctx,
+                       llama_token   token) {
+    std::string result;
+    int length = 8;
+    result.resize(length);
+    length = llama_token_to_str(ctx, token, (char *)result.data(), result.length());
+    if (length < 0) {
+        result.resize(-length);
+        int check = llama_token_to_str(ctx, token, (char *)result.data(), result.length());
+        assert(check == -length);
+        GGML_UNUSED(check);
+    } else {
+        result.resize(length);
+    }
+    return result;
+}
+
+int llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token, char * str, int length) {
+    if (0 <= token && token < llama_n_vocab_from_model(&ctx->model)) {
+        std::string result = ctx->model.vocab.id_to_token[token].tok;
+        if (result.length() > length) {
+            return - result.length();
+        }
+        strcpy(str, result.c_str());
+        return result.length();
+    }
+    return 0;
+}
+
+std::string llama_token_to_str_bpe(
+    const struct llama_context * ctx,
+                   llama_token   token) {
+    std::string result;
+    int length = 8;
+    result.resize(length);
+    length = llama_token_to_str_bpe(ctx, token, (char*)result.data(), result.length());
+    if (length < 0) {
+        result.resize(-length);
+        int check = llama_token_to_str_bpe(ctx, token, (char*)result.data(), result.length());
+        assert(check == -length);
+        GGML_UNUSED(check);
+    } else {
+        result.resize(length);
+    }
+    return result;
 }

 llama_token llama_token_bos() {