Mirror of https://github.com/ggml-org/llama.cpp.git — synced 2025-10-31 08:51:55 +00:00
			
		
		
		
Commit: speculative : do not discard the last drafted token
This commit is contained in:
@@ -252,11 +252,6 @@ llama_tokens common_speculative_gen_draft(
         // add drafted token for each sequence
         const llama_token id = cur_p->data[0].id;

-        // only collect very high-confidence draft tokens
-        if (cur_p->data[0].p < params.p_min) {
-            break;
-        }
-
         common_sampler_accept(smpl, id, true);

         result.push_back(id);
@@ -265,6 +260,11 @@ llama_tokens common_speculative_gen_draft(
             break;
         }

+        // only collect very high-confidence draft tokens
+        if (cur_p->data[0].p < params.p_min) {
+            break;
+        }
+
         common_batch_add(batch, id, n_past + i + 1, { 0 }, true);

         // evaluate the drafted tokens on the draft model
Reference in new issue · Block a user
Author: Georgi Gerganov