tests : use new tokenizer type API (#2692)

* Merge tokenizer fixes into the gguf branch.

* Add test vocabularies

* Adapt convert-new.py (and fix a clang-cl compiler error on Windows)

* Improved tokenizer test

But does it work on macOS?

* Improve token type support

- Added @klosax code to convert.py
- Improved token type support in vocabulary

* Exclude platform dependent tests

* More sentencepiece compatibility by eliminating magic numbers

* Restored accidentally removed comment

* Improve commentary

* Use token type API in test-tokenizer-1.cpp
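One of the commits above eliminates magic numbers in favor of named token types. A minimal sketch of that idea, assuming gguf-style token-type numbering (NORMAL=1, UNKNOWN=2, CONTROL=3, USER_DEFINED=4, UNUSED=5, BYTE=6); the `classify` helper is a hypothetical toy, not the actual convert.py logic, which queries the sentencepiece model:

```python
# Instead of writing raw integers like 2 or 3 for token types,
# use named constants from an enum.
from enum import IntEnum

class TokenType(IntEnum):
    # values follow gguf-style token-type numbering (an assumption here)
    NORMAL = 1
    UNKNOWN = 2
    CONTROL = 3
    USER_DEFINED = 4
    UNUSED = 5
    BYTE = 6

def classify(piece: str) -> TokenType:
    # toy classifier for illustration; a real converter asks the
    # sentencepiece model proto which category each piece belongs to
    if piece == "<unk>":
        return TokenType.UNKNOWN
    if piece in ("<s>", "</s>"):
        return TokenType.CONTROL
    if piece.startswith("<0x") and piece.endswith(">"):
        return TokenType.BYTE
    return TokenType.NORMAL
```

Named types make the written vocabulary self-describing and keep the converter and the C++ tokenizer tests in agreement about what each integer means.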
Author: goerch
Date: 2023-08-21 19:11:14 +02:00
Committed by: GitHub
Parent: 0b53b8b08d
Commit: 49c25cce19
2 changed files with 4 additions and 4 deletions

convert.py Executable file → Normal file

@@ -741,6 +741,8 @@ class OutputFile:
tokens = []
scores = []
toktypes = []
# NOTE: `all_tokens` returns the base vocabulary and added tokens
# TODO: add special tokens?
for text, score, toktype in vocab.all_tokens():
tokens.append(text)
scores.append(score)
@@ -751,8 +753,6 @@ class OutputFile:
self.gguf.add_token_scores(scores)
self.gguf.add_token_types(toktypes)
# TODO: added / special tokens
def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
n_elements = 1
for dim in tensor.shape:
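The first hunk above collects tokens, scores, and token types in parallel lists before handing them to the GGUF writer. A minimal sketch of that pattern, with a stand-in `Vocab` class and plain integer token types (the real `vocab.all_tokens()` and the `gguf` writer live in convert.py and are not reproduced here):

```python
# Sketch of the parallel-list pattern from the diff above.
# `Vocab` is a hypothetical stand-in: in convert.py, all_tokens()
# yields (text, score, toktype) for the base vocabulary plus added tokens.

class Vocab:
    def __init__(self, entries):
        self.entries = entries

    def all_tokens(self):
        # NOTE: yields the base vocabulary and added tokens
        yield from self.entries

def collect_vocab(vocab):
    tokens = []
    scores = []
    toktypes = []
    for text, score, toktype in vocab.all_tokens():
        tokens.append(text)
        scores.append(score)
        toktypes.append(toktype)
    return tokens, scores, toktypes

# toy data; 2 = UNKNOWN, 1 = NORMAL in gguf-style numbering (an assumption)
vocab = Vocab([("<unk>", 0.0, 2), ("hello", -1.0, 1)])
tokens, scores, toktypes = collect_vocab(vocab)
```

The three lists stay index-aligned, which is what lets the writer emit them as three parallel arrays (`add_tokens_list`-style calls) in a single pass.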