	Merge branch 'master' into compilade/refactor-kv-cache
@@ -255,7 +255,9 @@ int main(int argc, char ** argv) {
     }
 
     const bool add_bos = llama_should_add_bos_token(model);
-    GGML_ASSERT(llama_add_eos_token(model) != 1);
+    if (!llama_model_has_encoder(model)) {
+        GGML_ASSERT(llama_add_eos_token(model) != 1);
+    }
     LOG("add_bos: %d\n", add_bos);
 
     std::vector<llama_token> embd_inp;
@@ -520,6 +522,24 @@ int main(int argc, char ** argv) {
         exit(1);
     }
 
+    if (llama_model_has_encoder(model)) {
+        int enc_input_size = embd_inp.size();
+        llama_token * enc_input_buf = embd_inp.data();
+
+        if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
+            LOG_TEE("%s : failed to eval\n", __func__);
+            return 1;
+        }
+
+        llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
+        if (decoder_start_token_id == -1) {
+            decoder_start_token_id = llama_token_bos(model);
+        }
+
+        embd_inp.clear();
+        embd_inp.push_back(decoder_start_token_id);
+    }
+
     while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
         // predict
         if (!embd.empty()) {
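For readers skimming the change: the second hunk runs an encoder pass before the usual generation loop, so encoder-decoder models (T5-style) work in this example. A minimal sketch of that flow, using only calls that appear in the diff and assuming `model`, `ctx`, and a tokenized prompt `embd_inp` are already set up as in the surrounding code:

    // Sketch of the encode-then-decode startup the diff introduces.
    // Assumes: llama_model * model, llama_context * ctx,
    //          std::vector<llama_token> embd_inp (tokenized prompt).
    if (llama_model_has_encoder(model)) {
        // Feed the whole prompt through the encoder in one batch.
        int enc_input_size = embd_inp.size();
        llama_token * enc_input_buf = embd_inp.data();
        if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
            LOG_TEE("%s : failed to eval\n", __func__);
            return 1;
        }

        // The decoder starts from its designated start token; models that
        // do not define one fall back to BOS.
        llama_token decoder_start = llama_model_decoder_start_token(model);
        if (decoder_start == -1) {
            decoder_start = llama_token_bos(model);
        }

        // Replace the prompt with the single start token; generation then
        // proceeds through the normal decode loop.
        embd_inp.clear();
        embd_inp.push_back(decoder_start);
    }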