Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)
	llama : refactor llama_context, llama_kv_cache, llm_build_context (#12181)
* llama : refactor llama_context, llama_kv_cache, llm_build_context
* graph : don't mutate the KV cache during defrag
* context : reduce virtuals + remove test function
* context : move interface implementation to source file + factory
* graph : move KV cache build functions to llama_context impl
* graph : remove model reference from build_pooling
* graph : remove llama_model reference
* kv_cache : provide rope factors
* graph : rework inputs to use only unique_ptr, remove attn input abstraction
* context : remove llama_context_i abstraction
* context : clean-up
* graph : clean-up
* llama : remove redundant keywords (struct, enum)
* model : adapt gemma3
* graph : restore same attention ops as on master
* llama : remove TODO + fix indent

(every step tagged ggml-ci)
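Most of the hunks below come from the "remove redundant keywords (struct, enum)" step: ggml_tensor and ggml_context are already declared types, and C++, unlike C, lets a declared struct name be used directly as a type, so the elaborated "struct" specifier is noise. The same file also tightens llama_adapter_cvec::apply from an int32_t status code to a bool success flag. A minimal sketch of the keyword-removal pattern; the point type and lookup_* helpers are illustrative only, not llama.cpp code:

    #include <string>
    #include <unordered_map>

    // Illustrative type; stands in for ggml_tensor et al.
    struct point {
        int x = 0;
        int y = 0;
    };

    // C requires the elaborated form ('struct point *'); in C++ the declared
    // name is a complete type name on its own, so the keyword is redundant.
    struct point * lookup_c_style (std::unordered_map<std::string, point> & m, const std::string & k) { return &m[k]; }
           point * lookup_cxx_style(std::unordered_map<std::string, point> & m, const std::string & k) { return &m[k]; }

    int main() {
        std::unordered_map<std::string, point> m;
        // Both declarations name the same type; the refactor simply drops 'struct'.
        return lookup_c_style(m, "a") == lookup_cxx_style(m, "a") ? 0 : 1;
    }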
@@ -15,11 +15,11 @@
 //
 
 struct llama_adapter_cvec {
-    struct ggml_tensor * tensor_for(int il) const;
+    ggml_tensor * tensor_for(int il) const;
 
-    struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int  il) const;
+    ggml_tensor * apply_to(ggml_context * ctx, ggml_tensor * cur, int  il) const;
 
-    int32_t apply(
+    bool apply(
             const llama_model & model,
             const float * data,
             size_t len,
@@ -36,7 +36,7 @@ private:
     std::vector<ggml_context_ptr> ctxs;
     std::vector<ggml_backend_buffer_ptr> bufs;
 
-    std::vector<struct ggml_tensor *> tensors; // per layer
+    std::vector<ggml_tensor *> tensors; // per layer
 };
 
 //
@@ -44,8 +44,8 @@ private:
 //
 
 struct llama_adapter_lora_weight {
-    struct ggml_tensor * a = nullptr;
-    struct ggml_tensor * b = nullptr;
+    ggml_tensor * a = nullptr;
+    ggml_tensor * b = nullptr;
 
     // get actual scale based on rank and alpha
     float get_scale(float alpha, float adapter_scale) const {
@@ -55,12 +55,12 @@ struct llama_adapter_lora_weight {
     }
 
     llama_adapter_lora_weight() = default;
-    llama_adapter_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
+    llama_adapter_lora_weight(ggml_tensor * a, ggml_tensor * b) : a(a), b(b) {}
 };
 
 struct llama_adapter_lora {
     // map tensor name to lora_a_b
-    std::unordered_map<std::string, struct llama_adapter_lora_weight> ab_map;
+    std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;
 
     std::vector<ggml_context_ptr> ctxs;
     std::vector<ggml_backend_buffer_ptr> bufs;
@@ -70,5 +70,7 @@ struct llama_adapter_lora {
     llama_adapter_lora() = default;
     ~llama_adapter_lora() = default;
 
-    llama_adapter_lora_weight * get_weight(struct ggml_tensor * w);
+    llama_adapter_lora_weight * get_weight(ggml_tensor * w);
 };
+
+using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;
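The body of get_scale is elided between the hunks above, but its comment ("get actual scale based on rank and alpha") refers to the usual LoRA scaling rule: the low-rank update contributes scale * (B * A) to the weight, with scale = adapter_scale * alpha / rank. A hedged, self-contained sketch of that arithmetic; the rank is passed explicitly here, whereas the real helper would derive it from the shape of the a/b factor tensors:

    #include <cstdio>

    // LoRA update: W' = W + s * (B * A), where conventionally
    // s = adapter_scale * alpha / rank, falling back to adapter_scale
    // alone when alpha is unset (0). A sketch, not the shipped code.
    static float lora_scale(float alpha, float adapter_scale, float rank) {
        return alpha != 0.0f ? adapter_scale * alpha / rank : adapter_scale;
    }

    int main() {
        // e.g. alpha = 16, rank = 8, user-requested scale = 1.0 -> effective 2.0
        std::printf("%.2f\n", lora_scale(16.0f, 1.0f, 8.0f));
        return 0;
    }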
Author: Georgi Gerganov