mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-30 08:42:00 +00:00)
	server : fix kv cache management (#3588)
@@ -405,6 +405,7 @@ struct llama_server_context
         // compare the evaluated prompt with the new prompt
         n_past = common_part(embd, prompt_tokens);
         embd = prompt_tokens;
+
         if (n_past == num_prompt_tokens)
         {
             // we have to evaluate at least 1 token to generate logits.
@@ -412,6 +413,9 @@ struct llama_server_context
             n_past--;
         }
 
+        // since #3228 we now have to manually manage the KV cache
+        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+
         LOG_VERBOSE("prompt ingested", {
                                            {"n_past", n_past},
                                            {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
@@ -461,9 +465,6 @@ struct llama_server_context
         // compare the evaluated prompt with the new prompt
         n_past = common_part(embd, prompt_tokens);
 
-        // since #3228 we now have to manually manage the KV cache
-        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
-
         embd = prompt_tokens;
         if (n_past == num_prompt_tokens)
         {
@@ -471,6 +472,9 @@ struct llama_server_context
             n_past--;
         }
 
+        // since #3228 we now have to manually manage the KV cache
+        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+
         LOG_VERBOSE("prompt ingested", {
                                            {"n_past", n_past},
                                            {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
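The change adds the llama_kv_cache_seq_rm call to the first prompt-loading path, which previously never trimmed the cache, and in the second path moves it below the n_past-- adjustment, so the cell at the position about to be re-evaluated is also cleared. Below is a minimal, self-contained sketch of the resulting pattern, not the server's actual code: the helper names count_common_prefix and reuse_prompt_cache are illustrative stand-ins for the server's common_part() and prompt-loading logic.

// Sketch only: trim the KV cache after n_past has been finalized.
#include <cstddef>
#include <vector>

#include "llama.h"

// length of the shared prefix between the cached and the new prompt
// (stand-in for the server's common_part() helper)
static size_t count_common_prefix(const std::vector<llama_token> & a,
                                  const std::vector<llama_token> & b) {
    size_t i = 0;
    while (i < a.size() && i < b.size() && a[i] == b[i]) {
        i++;
    }
    return i;
}

// Reuse the already-evaluated prefix, then clear the stale tail of
// sequence 0 from the KV cache. Returns the number of reusable tokens;
// everything from that position on still has to be decoded.
static size_t reuse_prompt_cache(llama_context * ctx,
                                 std::vector<llama_token> & embd,
                                 const std::vector<llama_token> & prompt_tokens) {
    size_t n_past = count_common_prefix(embd, prompt_tokens);
    embd = prompt_tokens;

    if (n_past == prompt_tokens.size() && n_past > 0) {
        // at least one token must be evaluated to produce logits,
        // so step back and re-evaluate the last prompt token
        n_past--;
    }

    // since #3228 the KV cache is managed manually: drop every cell of
    // sequence 0 from position n_past to the end (-1 means "until the end"),
    // including the cell of the token that is about to be re-evaluated
    llama_kv_cache_seq_rm(ctx, 0, (llama_pos) n_past, -1);

    return n_past;
}

A caller would then decode the remaining tokens embd[n_past..] at positions starting from n_past; the sketch stops before that step because only the cache trimming is affected by this commit.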
Author: Georgi Gerganov