mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-28 08:31:25 +00:00 
			
		
		
		
	 94933c8c2e
			
		
	
	94933c8c2e
	
	
	
		
			
			* llama-server : implement universal assisted decoding * Erase prompt tail for kv-cache * set vocab_dft_compatible in common_speculative * rename ctx_main to ctx_tgt * move vocab_dft_compatible to spec struct * clear mem_dft, remove mem * detokenize id_last for incompatible models * update comment * add --spec-replace flag * accept special tokens when translating between draft/main models * Escape spec-replace * clamp draft result to size to params.n_draft * fix comment * clean up code * restore old example * log common_speculative_are_compatible in speculative example * fix * Update common/speculative.cpp Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Update common/speculative.cpp Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Update common/speculative.cpp Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
		
			
				
	
	
		
			36 lines
		
	
	
		
			1.0 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			36 lines
		
	
	
		
			1.0 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| #pragma once
 | |
| 
 | |
| #include "llama.h"
 | |
| #include "common.h"
 | |
| 
 | |
| struct common_speculative;
 | |
| 
 | |
| struct common_speculative_params { // drafting tunables; passed by value to common_speculative_gen_draft
 | |
|     int n_draft = 16;  // max drafted tokens
 | |
|     int n_reuse = 256; // NOTE(review): not documented in this header - presumably a token-reuse limit for the draft context; confirm in the implementation
 | |
| 
 | |
|     float p_min = 0.75f; // min probability required to accept a token in the draft
 | |
| };
 | |
| 
 | |
| struct common_speculative * common_speculative_init( // returns a pointer to the opaque common_speculative state for the two contexts below
 | |
|         struct llama_context * ctx_tgt, // context of the target (main) model
 | |
|         struct llama_context * ctx_dft // context of the draft model
 | |
| ); // NOTE(review): presumably the result is released with common_speculative_free - confirm ownership in the implementation
 | |
| 
 | |
| void common_speculative_free(struct common_speculative * spec); // presumably releases a state obtained from common_speculative_init; NULL handling is not visible from this header - confirm
 | |
| 
 | |
| bool common_speculative_are_compatible( // reports whether the two contexts are compatible; the exact criteria live in the implementation (not visible here)
 | |
|         const struct llama_context * ctx_tgt, // context of the target (main) model; not modified through this pointer
 | |
|         const struct llama_context * ctx_dft); // context of the draft model; not modified through this pointer
 | |
| 
 | |
| void common_speculative_add_replacement_tgt_dft( // adds a source -> dest string replacement to spec
 | |
|         struct common_speculative * spec, // speculative state that receives the replacement
 | |
|         const char *source, const char *dest); // NOTE(review): presumably target-model text (source) and its draft-model equivalent (dest), cf. the --spec-replace flag - confirm direction and string lifetime
 | |
| 
 | |
| // sample up to n_draft tokens and add them to the batch using the draft model; the result is returned by value as llama_tokens
 | |
| llama_tokens common_speculative_gen_draft(
 | |
|                struct common_speculative * spec, // speculative state (non-const, so it may be updated by the call)
 | |
|         struct common_speculative_params   params, // tunables (n_draft, n_reuse, p_min), taken by value
 | |
|                       const llama_tokens & prompt, // read-only token sequence; NOTE(review): presumably the tokens processed so far by the target model - confirm
 | |
|                              llama_token   id_last); // NOTE(review): presumably the most recent token, from which drafting continues - confirm
 |