apply llama_batch_ext in various places
@@ -91,8 +91,10 @@ int main(int argc, char ** argv){
 
     const auto t_enc_start = ggml_time_us();
 
-    llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1));
-    llama_decode(ctx, llama_batch_get_one(&inp.back(),           1));
+    llama_batch_ext_ptr batch0(llama_batch_ext_init_from_text( inp.data(), n_input - 1, 0, 0));
+    llama_batch_ext_ptr batch1(llama_batch_ext_init_from_text(&inp.back(),           1, 0, 0));
+    llama_decode_ext(ctx, batch0.get());
+    llama_decode_ext(ctx, batch1.get());
 
     const auto t_enc_end = ggml_time_us();
 
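The two temporary batches from llama_batch_get_one() become named llama_batch_ext objects held in llama_batch_ext_ptr. A minimal sketch of what that smart-pointer alias presumably is (an assumption; the real definition lives in llama.cpp's C++ convenience header and may differ):

    // Sketch only: RAII wrapper that frees the batch when it leaves scope.
    #include <memory>
    #include "llama.h"   // llama_batch_ext, llama_batch_ext_free

    struct llama_batch_ext_deleter {
        void operator()(llama_batch_ext * batch) const { llama_batch_ext_free(batch); }
    };

    using llama_batch_ext_ptr = std::unique_ptr<llama_batch_ext, llama_batch_ext_deleter>;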
@@ -108,7 +110,7 @@ int main(int argc, char ** argv){
 
     std::vector<llama_token> draft;
 
-    llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1);
+    llama_batch_ext * batch_tgt = llama_batch_ext_init(params.n_ctx, 1);
 
     // debug
     struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, 1);
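Compared with llama_batch_init(), the _ext initializer as called here drops the embedding-size argument (the 0 in the old call) and keeps only the token capacity and the maximum number of sequences. Since batch_tgt is now a raw pointer, it still needs an explicit llama_batch_ext_free() at the end of main (see the last hunk). An illustrative side-by-side, with parameter names assumed:

    // Old C API: capacity, embedding size (0 = token-only batch), max sequences.
    llama_batch       batch_old = llama_batch_init(/*n_tokens=*/params.n_ctx, /*embd=*/0, /*n_seq_max=*/1);
    // New _ext API as used in this commit: capacity and max sequences only.
    llama_batch_ext * batch_new = llama_batch_ext_init(/*n_tokens=*/params.n_ctx, /*n_seq_max=*/1);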
@@ -194,8 +196,9 @@ int main(int argc, char ** argv){
         // clean the cache of draft tokens that weren't accepted
         llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
 
-        common_batch_clear(batch_tgt);
-        common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
+        const llama_seq_id seq_id = 0;
+        llama_batch_ext_clear(batch_tgt);
+        llama_batch_ext_add_text(batch_tgt, draft[0], n_past, &seq_id, 1, true);
 
         // Draft already contains a single token sampled from the model:
         GGML_ASSERT(draft.size() == 1);
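The helper common_batch_add() took a brace-initialized list of sequence ids, whereas llama_batch_ext_add_text() takes a pointer plus a count, hence the named seq_id variable whose address can be passed. A hypothetical convenience wrapper (not part of this commit) that keeps the old call shape could look like this:

    // Hypothetical helper: forwards a vector of sequence ids to
    // llama_batch_ext_add_text, mirroring the old
    // common_batch_add(batch, token, pos, { seq_ids... }, output) call shape.
    #include <vector>
    #include "llama.h"

    static void batch_ext_add(llama_batch_ext * batch, llama_token id, llama_pos pos,
                              const std::vector<llama_seq_id> & seq_ids, bool output) {
        llama_batch_ext_add_text(batch, id, pos, seq_ids.data(), seq_ids.size(), output);
    }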
@@ -205,13 +208,13 @@ int main(int argc, char ** argv){
         common_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
 
         for (size_t i = 1; i < draft.size(); ++i) {
-            common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
+            llama_batch_ext_add_text(batch_tgt, draft[i], n_past + i, &seq_id, 1, true);
         }
 
         t_draft_us += ggml_time_us() - t_start_draft_us;
         n_drafted += draft.size() - 1;
 
-        llama_decode(ctx, batch_tgt);
+        llama_decode_ext(ctx, batch_tgt);
         ++n_past;
 
         draft.erase(draft.begin());
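Condensed, the per-iteration target-batch flow with the _ext calls is: draft[0] (already sampled from the model) goes in at position n_past, the remaining drafted tokens at n_past + i, all on sequence 0, and the whole batch is decoded in one call. A sketch, assuming llama_decode_ext() returns 0 on success like llama_decode():

    llama_batch_ext_clear(batch_tgt);
    llama_batch_ext_add_text(batch_tgt, draft[0], n_past, &seq_id, 1, true);
    for (size_t i = 1; i < draft.size(); ++i) {
        llama_batch_ext_add_text(batch_tgt, draft[i], n_past + i, &seq_id, 1, true);
    }
    if (llama_decode_ext(ctx, batch_tgt) != 0) {
        // handle decode failure
    }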
@@ -243,7 +246,7 @@ int main(int argc, char ** argv){
 
     common_sampler_free(smpl);
 
-    llama_batch_free(batch_tgt);
+    llama_batch_ext_free(batch_tgt);
 
     llama_backend_free();
 
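batch_tgt is freed explicitly because it is a raw pointer; batch0 and batch1 from the first hunk are released automatically by their llama_batch_ext_ptr wrappers. An alternative sketch (not what this commit does) would wrap the target batch the same way and drop the manual free:

    // Alternative: hold the target batch in the smart pointer as well.
    llama_batch_ext_ptr batch_tgt(llama_batch_ext_init(params.n_ctx, 1));
    // ... pass batch_tgt.get() wherever a raw llama_batch_ext * is expected ...
    // llama_batch_ext_free() then runs automatically when batch_tgt goes out of scope.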