Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)
			
		
		
		
	llama : add pre-tokenizer regexes for BLOOM and gpt3-finnish (#8850)
This commit is contained in:
		| @@ -410,6 +410,8 @@ struct llm_tokenizer_bpe { | ||||
|                 }; | ||||
|                 break; | ||||
|             case LLAMA_VOCAB_PRE_TYPE_PORO: | ||||
|             case LLAMA_VOCAB_PRE_TYPE_BLOOM: | ||||
|             case LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH: | ||||
|                 regex_exprs = { | ||||
|                     " ?[^(\\s|.,!?…。,、।۔،)]+", | ||||
|                 }; | ||||
|   | ||||
| @@ -5467,6 +5467,12 @@ static void llm_load_vocab( | ||||
|             } else if ( | ||||
|                 tokenizer_pre == "codeshell") { | ||||
|                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL; | ||||
|             } else if ( | ||||
|                 tokenizer_pre == "bloom") { | ||||
|                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_BLOOM; | ||||
|             } else if ( | ||||
|                 tokenizer_pre == "gpt3-finnish") { | ||||
|                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH; | ||||
|             } else { | ||||
|                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); | ||||
|             } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Esko Toivonen
					Esko Toivonen