#pragma once

#include "llama.h"
#include "common.h"

struct common_speculative;

struct common_speculative_params {
    int n_draft = 16;  // max drafted tokens
    int n_reuse = 256; // min number of matching prompt tokens required to reuse the previous drafting context

    float p_min = 0.9f; // min probability required to accept a token in the draft
};
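
// note: a lower p_min lets lower-confidence tokens into the draft, which tends to
// produce longer drafts that the target model is less likely to accept in full;
// a higher p_min yields shorter, safer drafts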

struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);

void common_speculative_free(struct common_speculative * spec);

// check whether the target and draft contexts can be used together for drafting
// (the two models must use compatible vocabularies)
bool common_speculative_are_compatible(
        const struct llama_context * ctx_tgt,
        const struct llama_context * ctx_dft);

// sample up to n_draft tokens and add them to the batch using the draft model
llama_tokens common_speculative_gen_draft(
               struct common_speculative * spec,
        struct common_speculative_params   params,
                      const llama_tokens & prompt,
                             llama_token   id_last);
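
// Example usage (a minimal sketch of the speculative loop; target-side batch
// construction, sampling, and KV-cache management follow the wider llama.cpp
// common API and are elided as comments):
//
//     struct common_speculative * spec = common_speculative_init(ctx_dft);
//
//     if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) {
//         // the draft model cannot be used with this target
//     }
//
//     common_speculative_params params;
//     params.n_draft = 16;
//
//     while (/* generating */) {
//         // draft up to n_draft tokens that are likely to follow id_last
//         llama_tokens draft = common_speculative_gen_draft(spec, params, prompt, id_last);
//
//         // decode [id_last, draft...] with the target model in a single batch,
//         // then keep the longest prefix of `draft` that the target sampler also
//         // produces; append the accepted tokens to `prompt` and repeat
//     }
//
//     common_speculative_free(spec);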