mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-10-27 08:21:30 +00:00
llama : add support for Tekken pre-tokenizer (#8579)
* llama : Added support for Tekken pre-tokenizer (#8577) Removed uneeded `vocab.tokenizer_clean_spaces` assignment * llama : fix order of pre-tokenizers * * Tekken pre-tokenizer no longer uses clean_up_tokenization_spaces * Updated chkhsh for Tekken tokenizer --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
@@ -91,6 +91,7 @@ models = [
|
||||
{"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
|
||||
{"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
|
||||
{"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
|
||||
{"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
|
||||
]
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user