Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-29 08:41:22 +00:00)

Commit abd4d0bc4f
* speculative : update default params
* speculative : do not discard the last drafted token
common/speculative.h · 29 lines · 885 B · C
#pragma once

#include "llama.h"
#include "common.h"

struct common_speculative;

struct common_speculative_params {
    int n_draft = 16;  // max drafted tokens
    int n_reuse = 256;

    float p_min = 0.75f; // min probability required to accept a token in the draft
};

struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);

void common_speculative_free(struct common_speculative * spec);

bool common_speculative_are_compatible(
        const struct llama_context * ctx_tgt,
        const struct llama_context * ctx_dft);

// sample up to n_draft tokens and add them to the batch using the draft model
llama_tokens common_speculative_gen_draft(
               struct common_speculative * spec,
        struct common_speculative_params   params,
                      const llama_tokens & prompt,
                             llama_token   id_last);
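For context, a minimal sketch of how this API fits together. It assumes `ctx_tgt` and `ctx_dft` are already-created llama_context handles for the target and draft models (model loading is elided), the wrapper function `run_speculative_step` and its parameter names are hypothetical, and the target-side verification step is only indicated in comments; include paths may differ depending on the build setup.

// Sketch only, not part of the header above.
#include "speculative.h" // pulls in llama.h and common.h (llama_tokens, llama_token)

static void run_speculative_step(llama_context * ctx_tgt,
                                 llama_context * ctx_dft,
                                 const llama_tokens & prompt,
                                 llama_token id_last) {
    // the draft model must share (enough of) the target model's vocabulary
    if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) {
        return;
    }

    common_speculative * spec = common_speculative_init(ctx_dft);

    common_speculative_params params;
    params.n_draft = 16;    // propose at most 16 tokens per step
    params.p_min   = 0.75f; // drop draft tokens the draft model is unsure about

    // draft a continuation of `prompt` + `id_last` using the draft model
    llama_tokens draft = common_speculative_gen_draft(spec, params, prompt, id_last);

    // ... decode `id_last` plus `draft` with the target model in a single batch
    //     and accept the longest prefix the target model agrees with
    //     (verification elided here) ...

    common_speculative_free(spec);
}

In the header's own words, p_min is the minimum probability required to accept a token in the draft, so raising it trades draft length for a higher expected acceptance rate during verification.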