Mirror of https://github.com/ggml-org/llama.cpp.git
	server : fix cache reuse logic (#12161)
The first KV shift offsets the positions of all tokens after head_c. When llama_kv_cache_seq_rm is then called with head_c, it removes valid tokens because their positions have already been offset.
@@ -3003,7 +3003,7 @@ struct server_context {
                                             const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
 
                                             llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c);
-                                            llama_kv_cache_seq_add(ctx, slot.id, head_c, -1,     kv_shift);
+                                            llama_kv_cache_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);
 
                                             for (size_t i = 0; i < n_match; i++) {
                                                 slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
Author: Clauszy