Mirror of https://github.com/ggml-org/llama.cpp.git
Synced 2025-10-31 08:51:55 +00:00
			
		
		
		
	speculative : update default params
This commit is contained in:
		| @@ -178,10 +178,10 @@ struct common_params_speculative { | ||||
|  | ||||
|     int32_t n_ctx        =     0; // draft context size | ||||
|     int32_t n_max        =    16; // maximum number of tokens to draft during speculative decoding | ||||
| -   int32_t n_min        =     5; // minimum number of draft tokens to use for speculative decoding | ||||
| +   int32_t n_min        =     0; // minimum number of draft tokens to use for speculative decoding | ||||
|     int32_t n_gpu_layers =    -1; // number of layers to store in VRAM for the draft model (-1 - use default) | ||||
|     float   p_split      =  0.1f; // speculative decoding split probability | ||||
| -   float   p_min        =  0.9f; // minimum speculative decoding probability (greedy) | ||||
| +   float   p_min        = 0.75f; // minimum speculative decoding probability (greedy) | ||||
|  | ||||
|     struct cpu_params cpuparams; | ||||
|     struct cpu_params cpuparams_batch; | ||||
|   | ||||
| @@ -9,7 +9,7 @@ struct common_speculative_params { | ||||
|     int n_draft = 16;  // max drafted tokens | ||||
|     int n_reuse = 256; | ||||
|  | ||||
| -   float p_min = 0.9f; // min probability required to accept a token in the draft | ||||
| +   float p_min = 0.75f; // min probability required to accept a token in the draft | ||||
| }; | ||||
|  | ||||
| struct common_speculative * common_speculative_init(struct llama_context * ctx_dft); | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Georgi Gerganov
					Georgi Gerganov