mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-31 08:51:55 +00:00 
			
		
		
		
	llama : tokenizer fixes (#2549)
* Merge tokenizer fixes into the gguf branch.
* Add test vocabularies.
This commit is contained in:
		| @@ -1,3 +1,4 @@ | ||||
| #define LLAMA_API_CPP // TODO: eliminate me | ||||
| #include "llama.h" | ||||
|  | ||||
| #include <cstdio> | ||||
| @@ -5,16 +6,40 @@ | ||||
| #include <map> | ||||
| #include <vector> | ||||
|  | ||||
| static std::string unescape_whitespace(llama_context* ctx, const std::vector<llama_token>& tokens) { | ||||
|     std::string result; | ||||
|     for (int i = 0; i < tokens.size(); ++i) { | ||||
|         result += llama_token_to_str(ctx, tokens[i]); | ||||
|     } | ||||
|     return result; | ||||
| } | ||||
|  | ||||
| static const std::map<std::string, std::vector<llama_token>> & k_tests() | ||||
| { | ||||
|     static std::map<std::string, std::vector<llama_token>> _k_tests = { | ||||
|         { "Hello World",        { 1,  10994,   2787, }, }, | ||||
|         { " Hello World",       { 1,  15043,   2787, }, }, | ||||
|         { " Hello World!",      { 1,  15043,   2787,  29991, }, }, | ||||
|         { " this is 🦙.cpp",    { 1,    445,    338,  29871,    243,    162,    169,    156,  29889,   8223, }, }, | ||||
|         { "w048 7tuijk dsdfhu", { 1,  29893,  29900,  29946,  29947,  29871,  29955,   9161,  13535,  18031,   2176,   6905, }, }, | ||||
|         { "нещо на Български",  { 1,    821,   4851,    665,   1386,  29713,   1305, }, }, | ||||
|     }; | ||||
|         { " ",                      {1,    259, }, }, | ||||
|         { "\t",                     { 1,    29871,   12, }, }, | ||||
|         { "\n",                     { 1,    29871,   13, }, }, | ||||
|         { "\t\n",                   { 1,    29871,   12,     13, }, }, | ||||
|         { "Hello world",            { 1,  15043,   3186, }, }, | ||||
|         { " Hello world",           { 1,  29871,  15043,   3186, }, }, | ||||
|         { "Hello World",            { 1,  15043,   2787, }, }, | ||||
|         { " Hello World",           { 1,  29871,  15043,   2787, }, }, | ||||
|         { " Hello World!",          { 1,  29871,  15043,   2787,  29991, }, }, | ||||
|         { " this is 🦙.cpp",        { 1,  29871,    445,    338,  29871,    243,    162,    169,    156,  29889,   8223, }, }, | ||||
|         { "w048 7tuijk dsdfhu",     { 1,    281,  29900,  29946,  29947,  29871,  29955,   9161,  13535,  18031,   2176,   6905, }, }, | ||||
|         { "нещо на Български",      { 1,   1538,   4851,    665,   1386,  29713,   1305, }, }, | ||||
|         { "កាន់តែពិសេសអាចខលចេញ",   { 1,  29871,  31849,  31324,  31934,    228,    162,    142,    228,    161,     | ||||
|                                         146,    228,    162,    133,    228,    161,    153,    228,    161,    186,   | ||||
|                                         31708,    228,    162,    132,  31708,    228,    161,    165,  31324,    228,     | ||||
|                                         161,    136,    228,    161,    132,    228,    161,    158,    228,    161,     | ||||
|                                         136,    228,    162,    132,    228,    161,    140, }, }, | ||||
|         { "🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", | ||||
|             { 1,  29871,    243,    162,    157,    131,    313,   8945,  29897,  29871,     | ||||
|               243,    162,    155,    185,  30722,    243,    162,    143,    174,  30598,     | ||||
|               313,  20787,    953,   3848,    275,  16125,    630,  29897,  29871,  31681,     | ||||
|               313,   6194,    953,  29877,   2397,    393,    756,    967,   1914,   5993,  29897, }, }, | ||||
|      }; | ||||
|     return _k_tests; | ||||
| }; | ||||
|  | ||||
| @@ -65,9 +90,9 @@ int main(int argc, char **argv) { | ||||
|     } | ||||
|  | ||||
|     for (const auto & test_kv : k_tests()) { | ||||
|         std::vector<llama_token> res(test_kv.first.size()); | ||||
|         const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), int(res.size()), true); | ||||
|         res.resize(n); | ||||
|         std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first.c_str(), true); | ||||
|         fprintf(stderr, "%s : '%s' tokenized to '%s'\n",  | ||||
|             __func__, test_kv.first.c_str(), unescape_whitespace(ctx, res).c_str()); | ||||
|  | ||||
|         bool correct = res.size() == test_kv.second.size(); | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 goerch
					goerch