llama : improve token type support (#2668)

* Merge tokenizer fixes into the gguf branch.

* Add test vocabularies

* Adapt convert-new.py (and fix a clang-cl compiler error on Windows)

* Improved tokenizer test

But does it work on macOS?

* Improve token type support

- Added @klosax code to convert.py
- Improved token type support in vocabulary

* Exclude platform-dependent tests

* More sentencepiece compatibility by eliminating magic numbers (see the token-type sketch after the commit details)

* Restored accidentally removed comment
commit 8d177eddeb (parent e06cbcee73)
Author: goerch
Date: 2023-08-21 17:56:02 +02:00
Committed by: GitHub
4 changed files with 94 additions and 98 deletions
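For context on what "token type" means here: sentencepiece classifies every vocabulary entry (normal piece, unknown, control, user-defined, unused, or byte), and the gguf work carries that classification into the model file and the C API. As a rough sketch, the gguf-era llama.h ends up exposing an enum along these lines (shown for orientation only; the exact names and values are not part of the diff below):

    // Sketch: sentencepiece-style token types, modeled on the
    // llama_token_type enum in gguf-era llama.h (illustrative,
    // not copied from this commit's diff).
    enum llama_token_type {
        LLAMA_TOKEN_TYPE_UNDEFINED    = 0,
        LLAMA_TOKEN_TYPE_NORMAL       = 1, // ordinary vocabulary piece
        LLAMA_TOKEN_TYPE_UNKNOWN      = 2, // e.g. <unk>
        LLAMA_TOKEN_TYPE_CONTROL      = 3, // e.g. <s>, </s>
        LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
        LLAMA_TOKEN_TYPE_UNUSED       = 5,
        LLAMA_TOKEN_TYPE_BYTE         = 6, // byte fallback, e.g. <0x0A>
    };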


@@ -10,10 +10,6 @@
 #include <vector>
 #include <locale>
 
-static std::string vocab_type(llama_context * ctx) {
-    return llama_n_vocab(ctx) == 32000 ? "spm": "bpe";
-}
-
 static std::string escape_whitespace(const std::string& text) {
     std::string result;
     bool escaping = false;
@@ -91,8 +87,8 @@ int main(int argc, char **argv) {
             return 2;
         }
     } else {
-        if ((vocab_type(ctx) == "spm" && i <= 258) ||
-            (vocab_type(ctx) == "bpe" && (i == 0 || i >= 100000))) {
+        // TODO: needs access to token types
+        if (0 <= i && i < 259) {
             fprintf(stderr, "%s : info: token %d is string %s and bpe returns tokens %s\n",
                 __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
         } else {
@@ -103,20 +99,28 @@ int main(int argc, char **argv) {
         }
     }
 
-    std::wstring_convert<typename std::codecvt_utf8<wchar_t>, wchar_t> converter;
-    for (wchar_t ch = 0x0000; ch < 0xffff; ++ch) {
-        std::wstring wstr(1, ch);
-        std::string str;
-        try {
-            str = converter.to_bytes(wstr);
-        } catch (std::exception & e) {
-            continue;
-        }
-        std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str), false);
-        if (tokens.size() == 1) {
-            fprintf(stderr, "%s : info: %s tokenized to %d \n", __func__, str.c_str(), tokens[0]);
-        }
-    }
+#ifdef _WIN32
+    std::wstring_convert<typename std::codecvt_utf8<char16_t>, char16_t> u16converter;
+    for (char16_t ch = 0x0000; ch < 0xffff; ++ch) {
+        std::u16string u16str(1, ch);
+        std::string str = u16converter.to_bytes(u16str);
+        std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
+        if (tokens.size() == 1) {
+            fprintf(stderr, "%s : info: %s tokenized to %d \n",
+                __func__, str.c_str(), tokens[0]);
+        }
+    }
+#else
+    std::wstring_convert<typename std::codecvt_utf8<char32_t>, char32_t> u32converter;
+    for (char32_t ch = 0x0000; ch < 0x0010ffff; ++ch) {
+        std::u32string u32str(1, ch);
+        std::string str = u32converter.to_bytes(u32str);
+        std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
+        if (tokens.size() == 1) {
+            fprintf(stderr, "%s : info: %s tokenized to %d \n", __func__, str.c_str(), tokens[0]);
+        }
+    }
+#endif
 
     llama_free_model(model);
     llama_free(ctx);
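The "TODO: needs access to token types" comment above marks the magic number this series is trying to eliminate: in a sentencepiece llama vocabulary, tokens 0..258 are exactly the 3 special tokens (<unk>, <s>, </s>) plus the 256 byte-fallback tokens, none of which round-trip through detokenize/tokenize as plain text. Once per-token types are available, the check can be written against the type instead of the index. A minimal sketch, assuming an accessor in the spirit of the llama_token_get_type() that later gguf-era llama.h provides (the name and signature are an assumption, not part of this commit):

    // Hypothetical: llama_token_get_type() stands in for whatever accessor
    // eventually exposes per-token types; it is not defined by this commit.
    static bool expect_exact_round_trip(llama_context * ctx, llama_token token) {
        switch (llama_token_get_type(ctx, token)) {
            case LLAMA_TOKEN_TYPE_UNKNOWN: // <unk>
            case LLAMA_TOKEN_TYPE_CONTROL: // <s>, </s>
            case LLAMA_TOKEN_TYPE_BYTE:    // <0x00>..<0xFF>
                return false;              // these do not re-tokenize to themselves
            default:
                return true;
        }
    }

For the stock 32000-entry llama vocabulary this is the same predicate as `0 <= i && i < 259`, but it no longer hard-codes the vocabulary layout.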