mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-10-28 08:31:25 +00:00
Detokenizer fixes (#8039)
* Add llama_detokenize():
- Update header files location
- UNKNOWN and CONTROL are 'special pieces'
- Remove space after UNKNOWN and CONTROL
- Refactor llama_token_to_piece()
- Add flag: clean_up_tokenization_spaces
- Symmetric params for llama_tokenize() and llama_detokenize()
* Update and fix tokenizer tests:
- Using llama_detokenize()
- Unexpected vocab type as test fail instead of error
- Useful when automating tests:
- If you don't know in advance the vocab type
- Differenciate other loading errors
- Skip unicode surrogaes and undefined
- Gracefully exit threads
- Using exit() is throwing random exceptions
- Clean old known problematic codepoints
- Minor: confusing hexadecimal codepoint
* Update bruteforce random tests
- Add detokenizer checks
- New generator: ascii_lr_strip
- New generator: apostrophe
- Add more vocabs files
- Detokenize special tokens.
- Replace errors with '\uFFFD' when detokenizing to 'utf-8'
- More edge cases
- Better detokenization results check
* Fix add_space_prefix, set false by default
* Better leading space removal
* Do not remove space when decoding special tokens
* Bugfix: custom regexs splits undefined unicode codepoints
* 'viking' detokenizer clean spaces
This commit is contained in:
@@ -2592,51 +2592,35 @@ std::vector<llama_token> llama_tokenize(
|
||||
}
|
||||
|
||||
std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
|
||||
std::vector<char> result(8, 0);
|
||||
const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
|
||||
if (n_tokens < 0) {
|
||||
result.resize(-n_tokens);
|
||||
int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
|
||||
GGML_ASSERT(check == -n_tokens);
|
||||
} else {
|
||||
result.resize(n_tokens);
|
||||
std::string piece;
|
||||
piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
|
||||
const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
|
||||
if (n_chars < 0) {
|
||||
piece.resize(-n_chars);
|
||||
int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
|
||||
GGML_ASSERT(check == -n_chars);
|
||||
}
|
||||
else {
|
||||
piece.resize(n_chars);
|
||||
}
|
||||
|
||||
return std::string(result.data(), result.size());
|
||||
return piece;
|
||||
}
|
||||
|
||||
std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
|
||||
const llama_token bos_id = llama_token_bos(llama_get_model(ctx));
|
||||
|
||||
std::string piece;
|
||||
std::string result;
|
||||
|
||||
for (size_t i = 0; i < tokens.size(); ++i) {
|
||||
piece = llama_token_to_piece(ctx, tokens[i]);
|
||||
|
||||
// remove the leading space of the first non-BOS token
|
||||
if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
|
||||
piece = piece.substr(1);
|
||||
}
|
||||
|
||||
result += piece;
|
||||
std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
|
||||
std::string text;
|
||||
text.resize(std::max(text.capacity(), tokens.size()));
|
||||
int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
|
||||
if (n_chars < 0) {
|
||||
text.resize(-n_chars);
|
||||
n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
|
||||
GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
|
||||
std::string piece;
|
||||
std::string result;
|
||||
|
||||
for (size_t i = 0; i < tokens.size(); ++i) {
|
||||
piece = llama_token_to_piece(ctx, tokens[i]);
|
||||
|
||||
result += piece;
|
||||
}
|
||||
text.resize(n_chars);
|
||||
|
||||
// NOTE: the original tokenizer decodes bytes after collecting the pieces.
|
||||
return result;
|
||||
return text;
|
||||
}
|
||||
|
||||
bool llama_should_add_bos_token(const llama_model * model) {
|
||||
|
||||
Reference in New Issue
Block a user