	vocab : JetBrains Mellum pre-tokenizer (#15045)
@@ -852,6 +852,9 @@ class TextModel(ModelBase):
         if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb":
             # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B
             res = "exaone4"
+        if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756":
+            # ref: https://huggingface.co/JetBrains/Mellum-4b-base
+            res = "mellum"
 
         if res is None:
             logger.warning("\n")
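For context (not part of the commit itself): the chkhsh strings compared above are fingerprints of a tokenizer's behaviour, obtained by tokenizing a fixed probe text and hashing the resulting token IDs. The sketch below shows the general idea in Python; it is a hedged illustration, not the project's exact code, and it uses a short placeholder probe string rather than the real probe text from the update script, so it will not reproduce the "a1e163ec…" hash registered for Mellum.

    # Hedged sketch of how a tokenizer fingerprint like chkhsh is derived.
    # Assumptions: the 'transformers' package is installed, and probe_text is
    # only a placeholder (the real probe text lives in the update script).
    from hashlib import sha256
    from transformers import AutoTokenizer

    probe_text = "Hello world! 123 éà"  # placeholder probe, not the real one
    tokenizer = AutoTokenizer.from_pretrained("JetBrains/Mellum-4b-base")

    token_ids = tokenizer.encode(probe_text)                    # tokenize the probe
    fingerprint = sha256(str(token_ids).encode()).hexdigest()   # hash the ID list
    print(fingerprint)

Tokenizers that split text the same way produce the same fingerprint, which is why a single hash is enough to pick the right pre-tokenizer preset.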
@@ -138,6 +138,7 @@ models = [
     {"name": "midm-2.0",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
     {"name": "lfm2",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
     {"name": "exaone4",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
+    {"name": "mellum",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
 ]
 
 # some models are known to be broken upstream, so we will skip them as exceptions
@@ -1856,7 +1856,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     tokenizer_pre == "gigachat"   ||
                     tokenizer_pre == "jina-v2-es" ||
                     tokenizer_pre == "jina-v2-de" ||
-                    tokenizer_pre == "a.x-4.0") {
+                    tokenizer_pre == "a.x-4.0"    ||
+                    tokenizer_pre == "mellum") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
             } else if (
                     tokenizer_pre == "jina-v1-en" ||
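The C++ hunk maps the new "mellum" pre-tokenizer name onto the existing GPT-2 handling (LLAMA_VOCAB_PRE_TYPE_GPT2), i.e. text is split with a GPT-2-style pattern before BPE merges are applied. As a rough illustration only (the authoritative pattern is the one in llama.cpp's C++ sources, not this snippet), the widely published GPT-2 split pattern behaves like this in Python:

    # Rough illustration of GPT-2-style pre-tokenization: the regex split that
    # runs before BPE merges. Hedged: this is the well-known GPT-2 pattern,
    # not copied from llama.cpp; it needs the third-party 'regex' module
    # because the standard 're' lacks \p{...} character classes.
    import regex

    GPT2_SPLIT = r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""

    print(regex.findall(GPT2_SPLIT, "Hello world, it's 2024!"))
    # ['Hello', ' world', ',', ' it', "'s", ' 2024', '!']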
Csaba Kecskemeti