llama : improve token type support (#2668)

* Merge tokenizer fixes into the gguf branch.

* Add test vocabularies

* Adapt convert-new.py (and fix a clang-cl compiler error on Windows)

* Improved tokenizer test

But does it work on macOS?

* Improve token type support

- Added @klosax code to convert.py
- Improved token type support in vocabulary

* Exclude platform-dependent tests

* More sentencepiece compatibility by eliminating magic numbers (see the token-type sketch after the commit details)

* Restored accidentally removed comment
commit 8d177eddeb (parent e06cbcee73)
Author: goerch
Date: 2023-08-21 17:56:02 +02:00
Committed by: GitHub
4 changed files with 94 additions and 98 deletions
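For context on what "token type" means here: sentencepiece classifies every vocabulary entry (normal piece, unknown, control, user-defined, unused, or byte), and the gguf work carries that classification into the model file and the C API. As a rough sketch, the gguf-era llama.h ends up exposing an enum along these lines (shown for orientation only; the exact names and values are not part of the diff below):

    // Sketch: sentencepiece-style token types, modeled on the
    // llama_token_type enum in gguf-era llama.h (illustrative,
    // not copied from this commit's diff).
    enum llama_token_type {
        LLAMA_TOKEN_TYPE_UNDEFINED    = 0,
        LLAMA_TOKEN_TYPE_NORMAL       = 1, // ordinary vocabulary piece
        LLAMA_TOKEN_TYPE_UNKNOWN      = 2, // e.g. <unk>
        LLAMA_TOKEN_TYPE_CONTROL      = 3, // e.g. <s>, </s>
        LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
        LLAMA_TOKEN_TYPE_UNUSED       = 5,
        LLAMA_TOKEN_TYPE_BYTE         = 6, // byte fallback, e.g. <0x0A>
    };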


@@ -10,10 +10,6 @@
 #include <vector>
 #include <locale>
 
-static std::string vocab_type(llama_context * ctx) {
-    return llama_n_vocab(ctx) == 32000 ? "spm": "bpe";
-}
-
 static std::string escape_whitespace(const std::string& text) {
     std::string result;
     bool escaping = false;
@@ -91,8 +87,8 @@ int main(int argc, char **argv) {
             return 2;
         }
     } else {
-        if ((vocab_type(ctx) == "spm" && i <= 258) ||
-            (vocab_type(ctx) == "bpe" && (i == 0 || i >= 100000))) {
+        // TODO: needs access to token types
+        if (0 <= i && i < 259) {
             fprintf(stderr, "%s : info: token %d is string %s and bpe returns tokens %s\n",
                 __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
         } else {
@@ -103,20 +99,28 @@ int main(int argc, char **argv) {
         }
     }
 
-    std::wstring_convert<typename std::codecvt_utf8<wchar_t>, wchar_t> converter;
-    for (wchar_t ch = 0x0000; ch < 0xffff; ++ch) {
-        std::wstring wstr(1, ch);
-        std::string str;
-        try {
-            str = converter.to_bytes(wstr);
-        } catch (std::exception & e) {
-            continue;
-        }
-        std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str), false);
-        if (tokens.size() == 1) {
-            fprintf(stderr, "%s : info: %s tokenized to %d \n", __func__, str.c_str(), tokens[0]);
-        }
-    }
+#ifdef _WIN32
+    std::wstring_convert<typename std::codecvt_utf8<char16_t>, char16_t> u16converter;
+    for (char16_t ch = 0x0000; ch < 0xffff; ++ch) {
+        std::u16string u16str(1, ch);
+        std::string str = u16converter.to_bytes(u16str);
+        std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
+        if (tokens.size() == 1) {
+            fprintf(stderr, "%s : info: %s tokenized to %d \n",
+                __func__, str.c_str(), tokens[0]);
+        }
+    }
+#else
+    std::wstring_convert<typename std::codecvt_utf8<char32_t>, char32_t> u32converter;
+    for (char32_t ch = 0x0000; ch < 0x0010ffff; ++ch) {
+        std::u32string u32str(1, ch);
+        std::string str = u32converter.to_bytes(u32str);
+        std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
+        if (tokens.size() == 1) {
+            fprintf(stderr, "%s : info: %s tokenized to %d \n", __func__, str.c_str(), tokens[0]);
+        }
+    }
+#endif
 
     llama_free_model(model);
     llama_free(ctx);
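The "TODO: needs access to token types" comment above marks the magic number this series is trying to eliminate: in a sentencepiece llama vocabulary, tokens 0..258 are exactly the 3 special tokens (<unk>, <s>, </s>) plus the 256 byte-fallback tokens, none of which round-trip through detokenize/tokenize as plain text. Once per-token types are available, the check can be written against the type instead of the index. A minimal sketch, assuming an accessor in the spirit of the llama_token_get_type() that later gguf-era llama.h provides (the name and signature are an assumption, not part of this commit):

    // Hypothetical: llama_token_get_type() stands in for whatever accessor
    // eventually exposes per-token types; it is not defined by this commit.
    static bool expect_exact_round_trip(llama_context * ctx, llama_token token) {
        switch (llama_token_get_type(ctx, token)) {
            case LLAMA_TOKEN_TYPE_UNKNOWN: // <unk>
            case LLAMA_TOKEN_TYPE_CONTROL: // <s>, </s>
            case LLAMA_TOKEN_TYPE_BYTE:    // <0x00>..<0xFF>
                return false;              // these do not re-tokenize to themselves
            default:
                return true;
        }
    }

For the stock 32000-entry llama vocabulary this is the same predicate as `0 <= i && i < 259`, but it no longer hard-codes the vocabulary layout.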