Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-30 08:42:00 +00:00)
	speculative : do not discard the last drafted token
@@ -252,11 +252,6 @@ llama_tokens common_speculative_gen_draft(
         // add drafted token for each sequence
         const llama_token id = cur_p->data[0].id;

-        // only collect very high-confidence draft tokens
-        if (cur_p->data[0].p < params.p_min) {
-            break;
-        }
-
         common_sampler_accept(smpl, id, true);

         result.push_back(id);
@@ -265,6 +260,11 @@ llama_tokens common_speculative_gen_draft(
             break;
         }

+        // only collect very high-confidence draft tokens
+        if (cur_p->data[0].p < params.p_min) {
+            break;
+        }
+
         common_batch_add(batch, id, n_past + i + 1, { 0 }, true);

         // evaluate the drafted tokens on the draft model
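The two hunks above reorder the body of the drafting loop: the p_min confidence check now runs after the sampled token has been accepted and pushed into result, rather than before. A low-confidence token still ends the draft, but it is now returned as part of the draft instead of being discarded. The self-contained toy below (hypothetical probabilities and threshold, std-only, not llama.cpp code) illustrates the resulting difference in draft length:

#include <cstdio>
#include <vector>

int main() {
    // hypothetical per-step confidence of the top drafted token
    const std::vector<float> p = {0.98f, 0.95f, 0.40f};
    const float p_min = 0.75f; // hypothetical threshold

    std::vector<int> before, after;

    // old order: check confidence before recording the token
    for (size_t i = 0; i < p.size(); ++i) {
        if (p[i] < p_min) break; // token i is dropped
        before.push_back((int) i);
    }

    // new order: record the token first, then stop drafting
    for (size_t i = 0; i < p.size(); ++i) {
        after.push_back((int) i);
        if (p[i] < p_min) break; // token i is kept
    }

    printf("before: %zu drafted tokens, after: %zu\n", before.size(), after.size());
    // prints: before: 2 drafted tokens, after: 3
}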
@@ -274,7 +274,7 @@ struct server_task {
         params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);

         params.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min);
-        params.speculative.n_min = std::max(params.speculative.n_min, 2);
+        params.speculative.n_min = std::max(params.speculative.n_min, 0);
         params.speculative.n_max = std::max(params.speculative.n_max, 0);

         // Use OpenAI API logprobs only if n_probs wasn't provided
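The server-side hunk relaxes the clamp on the requested draft length: presumably because the last drafted token is now kept, even a one-token draft is useful, so n_min is no longer forced up to 2 and is merely kept non-negative and no larger than n_max. A small sketch of the new clamping with hypothetical request values:

#include <algorithm>
#include <cstdio>

int main() {
    // hypothetical values parsed from a request
    int n_max = 16;
    int n_min = 1; // the old clamp would have bumped this to 2

    n_min = std::min(n_max, n_min); // keep n_min <= n_max
    n_min = std::max(n_min, 0);     // previously: std::max(n_min, 2)
    n_max = std::max(n_max, 0);     // n_max itself only has to be non-negative

    printf("n_min = %d, n_max = %d\n", n_min, n_max); // n_min = 1, n_max = 16
}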
Author: Georgi Gerganov