llama : tokenizer fixes (#2549)

* Merge tokenizer fixes into the gguf branch.

* Add test vocabularies
This commit is contained in:
goerch
2023-08-14 18:30:28 +02:00
committed by GitHub
parent 8af3a99ff1
commit ec1b100720
17 changed files with 612 additions and 147 deletions

View File

@@ -633,17 +633,6 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
return "The";
}
// TODO: not great allocating this every time
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
// initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
std::vector<llama_token> res(text.size() + (int) add_bos);
const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
assert(n >= 0);
res.resize(n);
return res;
}
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
auto lparams = llama_context_default_params();