mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-11-03 09:22:01 +00:00 
			
		
		
		
	
							
								
								
									
										18
									
								
								llama.cpp
									
									
									
									
									
								
							
							
						
						
									
										18
									
								
								llama.cpp
									
									
									
									
									
								
							@@ -2164,8 +2164,7 @@ struct llama_vocab {
 | 
			
		||||
    std::vector<token_data>       id_to_token;
 | 
			
		||||
 | 
			
		||||
    std::vector<id>    cache_special_tokens;
 | 
			
		||||
    std::vector<token> cache_token_to_piece;         // llama_token_to_piece(special = false);
 | 
			
		||||
    std::vector<token> cache_token_to_piece_special; // llama_token_to_piece(special = true);
 | 
			
		||||
    std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
 | 
			
		||||
 | 
			
		||||
    std::map<std::pair<std::string, std::string>, int> bpe_ranks;
 | 
			
		||||
 | 
			
		||||
@@ -4845,23 +4844,19 @@ static void llm_load_vocab(
 | 
			
		||||
        LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // build token to piece caches
 | 
			
		||||
    // build token to piece cache
 | 
			
		||||
    {
 | 
			
		||||
        size_t size_cache = 0;
 | 
			
		||||
 | 
			
		||||
        std::vector<llama_vocab::token> cache_token_to_piece(n_vocab);
 | 
			
		||||
        std::vector<llama_vocab::token> cache_token_to_piece_special(n_vocab);
 | 
			
		||||
 | 
			
		||||
        for (uint32_t id = 0; id < n_vocab; ++id) {
 | 
			
		||||
            cache_token_to_piece[id]         = llama_token_to_piece(&model, id, false);
 | 
			
		||||
            cache_token_to_piece_special[id] = llama_token_to_piece(&model, id, true);
 | 
			
		||||
            cache_token_to_piece[id] = llama_token_to_piece(&model, id, true);
 | 
			
		||||
 | 
			
		||||
            size_cache += cache_token_to_piece[id].size();
 | 
			
		||||
            size_cache += cache_token_to_piece_special[id].size();
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
 | 
			
		||||
        std::swap(vocab.cache_token_to_piece_special, cache_token_to_piece_special);
 | 
			
		||||
 | 
			
		||||
        LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
 | 
			
		||||
    }
 | 
			
		||||
@@ -18318,9 +18313,14 @@ static std::string llama_decode_text(const std::string & text) {
 | 
			
		||||
 | 
			
		||||
// does not write null-terminator to buf
 | 
			
		||||
int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
 | 
			
		||||
    // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
 | 
			
		||||
    if (!special && llama_is_control_token(model->vocab, token)) {
 | 
			
		||||
        return 0;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // if we have a cache - use it
 | 
			
		||||
    {
 | 
			
		||||
        const auto & cache = special ? model->vocab.cache_token_to_piece_special : model->vocab.cache_token_to_piece;
 | 
			
		||||
        const auto & cache = model->vocab.cache_token_to_piece;
 | 
			
		||||
 | 
			
		||||
        if (!cache.empty()) {
 | 
			
		||||
            const auto & res = cache.at(token);
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user