Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-30 08:42:00 +00:00)
Add test for MPT tokenization (#3728)

* Add test for MPT tokenization
* Revert code motion
* Remove unnecessary restriction in test case
* Clarify logic in conversion
Conversion script (filename lost in this extract; presumably convert-mpt-hf-to-gguf.py):

@@ -128,15 +128,22 @@ vocab_size = hparams["vocab_size"]
 # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
 
+added_vocab = tokenizer.get_added_vocab()
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 
 for i in range(vocab_size):
-    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
-    scores.append(0.0) # dummy
-    toktypes.append(gguf.TokenType.NORMAL)
+    if i not in reverse_vocab:
+        tokens.append(f"[PAD{i}]")
+        toktypes.append(gguf.TokenType.USER_DEFINED)
+    elif reverse_vocab[i] in added_vocab:
+        # NOTE: wouldn't we like to distinguish CONTROL tokens here?
+        tokens.append(reverse_vocab[i])
+        toktypes.append(gguf.TokenType.USER_DEFINED)
+    else:
+        tokens.append(reverse_vocab[i])
+        toktypes.append(gguf.TokenType.NORMAL)
 
 gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
 
 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
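The rewritten loop classifies each id three ways: ids with no vocab entry become [PADi] fillers, tokens reported by get_added_vocab() are tagged USER_DEFINED, and everything else stays NORMAL; the dummy per-token scores are no longer written at all. Below is a standalone, runnable sketch of the same logic. AutoTokenizer, get_added_vocab() and gguf.TokenType are real transformers/gguf APIs; the model id "mosaicml/mpt-7b" is only an illustrative placeholder, and deriving vocab_size from the vocab (rather than from hparams, as the script does) is an assumption for self-containedness.

    # Sketch of the token-classification logic introduced above.
    from transformers import AutoTokenizer
    import gguf

    tokenizer     = AutoTokenizer.from_pretrained("mosaicml/mpt-7b")  # placeholder model id
    vocab_size    = max(tokenizer.vocab.values()) + 1                 # script reads this from hparams
    added_vocab   = tokenizer.get_added_vocab()   # tokens layered on top of the base vocab
    reverse_vocab = {tok_id: tok for tok, tok_id in tokenizer.vocab.items()}

    tokens, toktypes = [], []
    for i in range(vocab_size):
        if i not in reverse_vocab:
            # Gap in the id space: emit a filler so ids stay dense.
            tokens.append(f"[PAD{i}]")
            toktypes.append(gguf.TokenType.USER_DEFINED)
        elif reverse_vocab[i] in added_vocab:
            # Added (typically special) tokens are tagged USER_DEFINED.
            tokens.append(reverse_vocab[i])
            toktypes.append(gguf.TokenType.USER_DEFINED)
        else:
            tokens.append(reverse_vocab[i])
            toktypes.append(gguf.TokenType.NORMAL)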
llama.cpp (17 changes):
@@ -975,14 +975,15 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }
 
-static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
     const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
     if (n_tokens < 0) {
         result.resize(-n_tokens);
         int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
         GGML_ASSERT(check == -n_tokens);
-    } else {
+    }
+    else {
         result.resize(n_tokens);
     }
 
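Besides the rename (the static helper now overloads the C-API name it wraps), the body shows the C API's buffer-sizing convention: a negative return value is the negated length the piece actually needs, so the caller resizes and calls once more. A minimal sketch of that convention in Python; raw_token_to_piece is a hypothetical stand-in for the C call, not a real binding:

    # Toy illustration of the negative-return sizing convention.
    TOKEN_TABLE = {0: b"Hi", 1: b"a-piece-longer-than-eight-bytes"}

    def raw_token_to_piece(token: int, buf_size: int) -> tuple[int, bytes]:
        piece = TOKEN_TABLE[token]
        if len(piece) > buf_size:
            return -len(piece), b""           # buffer too small: report needed size
        return len(piece), piece

    def token_to_piece(token: int) -> bytes:
        n, piece = raw_token_to_piece(token, 8)       # optimistic 8-byte buffer
        if n < 0:
            n, piece = raw_token_to_piece(token, -n)  # retry with the exact size
        assert n == len(piece)
        return piece

    print(token_to_piece(1))  # b'a-piece-longer-than-eight-bytes'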
@@ -1202,10 +1203,10 @@ struct llama_vocab {
     id special_eot_id    = 32010;
 
     int find_bpe_rank(std::string token_left, std::string token_right) const {
-        replace_all(token_left,  " ",  "\u0120");
-        replace_all(token_left,  "\n", "\u010A");
-        replace_all(token_right, " ",  "\u0120");
-        replace_all(token_right, "\n", "\u010A");
+        GGML_ASSERT(token_left.find(" ") == std::string::npos);
+        GGML_ASSERT(token_left.find("\n") == std::string::npos);
+        GGML_ASSERT(token_right.find(" ") == std::string::npos);
+        GGML_ASSERT(token_right.find("\n") == std::string::npos);
 
         auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
         if (it == bpe_ranks.end()) {
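find_bpe_rank previously normalized its arguments itself; now it only asserts that they are already byte-level encoded. In GPT-2 style byte-level BPE the merge table never contains a raw space or newline, because those bytes are remapped to printable code points (space to U+0120 "Ġ", newline to U+010A "Ċ") before ranks are looked up; after this change that mapping is the caller's responsibility. A sketch of the contract, with a toy merge table and a deliberately partial byte mapping:

    # Callers byte-encode tokens before asking for a merge rank;
    # the lookup itself only asserts the encoding was done.
    bpe_ranks = {("\u0120hello", "world"): 42}   # toy merge table

    def to_byte_level(tok: str) -> str:
        # Partial mapping, enough for this example.
        return tok.replace(" ", "\u0120").replace("\n", "\u010A")

    def find_bpe_rank(left: str, right: str) -> int:
        assert " " not in left and "\n" not in left, "caller must byte-encode"
        assert " " not in right and "\n" not in right, "caller must byte-encode"
        return bpe_ranks.get((left, right), -1)

    print(find_bpe_rank(to_byte_level(" hello"), "world"))  # 42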
@@ -7499,7 +7500,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
 
     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id    = candidates->data[i].id;
-        const std::string piece = llama_token_to_str(ctx, id);
+        const std::string piece = llama_token_to_piece(ctx, id);
         if (id == eos) {
             if (!allow_eos) {
                 candidates->data[i].logit = -INFINITY;
@@ -7711,7 +7712,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }
 
-    const std::string piece = llama_token_to_str(ctx, token);
+    const std::string piece = llama_token_to_piece(ctx, token);
 
     // Note terminating 0 in decoded string
     const auto   decoded     = decode_utf8(piece.c_str(), grammar->partial_utf8);
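Both grammar call sites feed the piece through decode_utf8 with grammar->partial_utf8, which carries an incomplete multi-byte sequence over to the next token. The same idea in a few lines of Python, using the standard library's incremental decoder (illustrative only; not llama.cpp's actual decode_utf8 API):

    # A multi-byte code point split across two token pieces still decodes
    # correctly when the decoder keeps state between calls.
    import codecs

    decoder = codecs.getincrementaldecoder("utf-8")()
    pieces  = [b"Hi \xe2\x82", b"\xac!"]   # "€" (E2 82 AC) split across pieces
    print("".join(decoder.decode(p) for p in pieces))  # Hi €!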
models/ggml-vocab-mpt.gguf (BIN, new file; binary file not shown)
Test registration (presumably tests/CMakeLists.txt):

@@ -31,6 +31,7 @@ llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE
 llama_build_executable(test-tokenizer-1-bpe.cpp)
 llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
 llama_test_executable(test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
+llama_test_executable(test-tokenizer-1-mpt test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
 llama_build_and_test_executable(test-grammar-parser.cpp)
 llama_build_and_test_executable(test-llama-grammar.cpp)
 llama_build_and_test_executable(test-grad0.cpp) # SLOW
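test-tokenizer-1-bpe is the generic per-vocab BPE tokenizer test; the added line simply instantiates it for the new MPT vocab file. Its rough shape, sketched in Python (assumed behavior, not the actual C++ test):

    # Assumed shape of a tokenizer-1 style test: every vocab id should
    # survive a detokenize -> tokenize round trip.
    def roundtrip_ok(tokenize, detokenize, vocab_size: int) -> bool:
        for tok_id in range(vocab_size):
            piece = detokenize([tok_id])
            if tokenize(piece) != [tok_id]:
                print(f"round trip failed for id {tok_id}: {piece!r}")
                return False
        return True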
Author: goerch