	llama : refactor llama_context, llama_kv_cache, llm_build_context (#12181)
* llama : refactor llama_context, llama_kv_cache, llm_build_context ggml-ci
* graph : don't mutate the KV cache during defrag ggml-ci
* context : reduce virtuals + remove test function ggml-ci
* context : move interface implementation to source file + factory ggml-ci
* graph : move KV cache build functions to llama_context impl ggml-ci
* graph : remove model reference from build_pooling ggml-ci
* graph : remove llama_model reference ggml-ci
* kv_cache : provide rope factors ggml-ci
* graph : rework inputs to use only unique_ptr, remove attn input abstraction ggml-ci
* context : remove llama_context_i abstraction ggml-ci
* context : clean-up ggml-ci
* graph : clean-up ggml-ci
* llama : remove redundant keywords (struct, enum) ggml-ci
* model : adapt gemma3 ggml-ci
* graph : restore same attention ops as on master ggml-ci
* llama : remove TODO + fix indent ggml-ci
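The caller-visible part of this refactor is the rename of the KV-cache sequence helpers from the llama_kv_cache_* prefix to llama_kv_self_*, which the hunks below apply to the example code. As a minimal caller-side migration sketch, assuming the llama.h declarations at this revision (the helper function and its comments are illustrative and not part of the diff):

#include "llama.h"

// Illustrative helper (not from the diff): reset one client sequence while
// keeping the shared system prompt that lives in sequence 0.
static void reset_client_sequence(struct llama_context * ctx, llama_seq_id client_seq) {
    // before: llama_kv_cache_seq_rm(ctx, client_seq, -1, -1);
    llama_kv_self_seq_rm(ctx, client_seq, -1, -1);

    // before: llama_kv_cache_seq_cp(ctx, 0, client_seq, -1, -1);
    llama_kv_self_seq_cp(ctx, 0, client_seq, -1, -1);
}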
@@ -202,7 +202,7 @@ int main(int argc, char ** argv) {
 
         // assign the system KV cache to all parallel sequences
         for (int32_t i = 1; i <= n_clients; ++i) {
-            llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
+            llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
         }
 
         LOG_INF("\n");
@@ -234,9 +234,9 @@ int main(int argc, char ** argv) {
         if (batch.n_tokens == 0) {
             // all sequences have ended - clear the entire KV cache
             for (int i = 1; i <= n_clients; ++i) {
-                llama_kv_cache_seq_rm(ctx, i, -1, -1);
+                llama_kv_self_seq_rm(ctx, i, -1, -1);
                 // but keep the system prompt
-                llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
+                llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
             }
 
             LOG_INF("%s: clearing the KV cache\n", __func__);
@@ -372,8 +372,8 @@ int main(int argc, char ** argv) {
                     }
 
                     // delete only the generated part of the sequence, i.e. keep the system prompt in the cache
-                    llama_kv_cache_seq_rm(ctx,    client.id + 1, -1, -1);
-                    llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1);
+                    llama_kv_self_seq_rm(ctx,    client.id + 1, -1, -1);
+                    llama_kv_self_seq_cp(ctx, 0, client.id + 1, -1, -1);
 
                     const auto t_main_end = ggml_time_us();
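For reference, the (-1, -1) position arguments in these calls select the whole range of a sequence (a negative bound is treated as open-ended in llama.h), so removing with (-1, -1) drops every cached token of that sequence, and the subsequent copy from sequence 0 re-shares the entire system prompt.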
Georgi Gerganov