mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	 d9d54e498d
			
		
	
	d9d54e498d
	
	
	
		
			
* speculative : refactor and add a simpler example (ggml-ci)
* speculative : clean-up and add comments and TODOs [no ci]
* speculative : manage context in common_speculative (ggml-ci)
* speculative : simplify (ggml-ci)
* speculative : simplify (cont) (ggml-ci)
* speculative : add --draft-min CLI arg
* speculative : minor fixup
* make : build fixes
* speculative : do not redraft previous drafts (ggml-ci)
* speculative : fix the draft sampling (ggml-ci)
* speculative : fix compile warning
* common : refactor args (ggml-ci)
* common : change defaults [no ci]
* common : final touches (ggml-ci)
		
			
				
	
	
		
			29 lines
		
	
	
		
			883 B
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			29 lines
		
	
	
		
			883 B
		
	
	
	
		
			C
		
	
	
	
	
	
#pragma once

#include "llama.h"
#include "common.h"

// opaque state for speculative decoding with a draft model (defined in the .cpp)
struct common_speculative;

// tuning parameters for draft generation
struct common_speculative_params {
    int n_draft = 16;  // max drafted tokens
    int n_reuse = 256; // NOTE(review): presumably max prompt tokens reused from the previous draft context — confirm in implementation

    float p_min = 0.9f; // min probability required to accept a token in the draft
};

// create speculative-decoding state around the given draft-model context;
// caller owns the result and must release it with common_speculative_free()
struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);

// release state returned by common_speculative_init()
void common_speculative_free(struct common_speculative * spec);

// check whether the target and draft contexts can be used together for
// speculative decoding (e.g. vocabulary compatibility — see implementation)
bool common_speculative_are_compatible(
        const struct llama_context * ctx_tgt,
        const struct llama_context * ctx_dft);

// sample up to n_draft tokens and add them to the batch using the draft model
llama_tokens common_speculative_gen_draft(
               struct common_speculative * spec,
        struct common_speculative_params   params,
                      const llama_tokens & prompt,
                             llama_token   id_last);