mirror of https://github.com/ggml-org/llama.cpp.git
	kv-cache : support layer reuse (#15504)
* kv-cache : support layer reuse

ggml-ci

* cont : update comments

[no ci]
@@ -9,32 +9,29 @@
 //
 
 llama_memory_hybrid::llama_memory_hybrid(
-    const llama_model & model,
-                         /* attn */
-            ggml_type    type_k,
-            ggml_type    type_v,
-                 bool    v_trans,
-             uint32_t    kv_size,
-             uint32_t    n_pad,
-             uint32_t    n_swa,
-       llama_swa_type    swa_type,
-                         /* recurrent */
-            ggml_type    type_r,
-            ggml_type    type_s,
-             uint32_t    rs_size,
-                         /* common */
-             uint32_t    n_seq_max,
-                 bool    offload,
-                 bool    unified,
-                         /* layer filters */
-      layer_filter_cb && filter_attn,
-      layer_filter_cb && filter_recr) :
+        const llama_model & model,
+                            /* attn */
+                ggml_type   type_k,
+                ggml_type   type_v,
+                     bool   v_trans,
+                 uint32_t   kv_size,
+                 uint32_t   n_pad,
+                 uint32_t   n_swa,
+           llama_swa_type   swa_type,
+                            /* recurrent */
+                ggml_type   type_r,
+                ggml_type   type_s,
+                 uint32_t   rs_size,
+                            /* common */
+                 uint32_t   n_seq_max,
+                     bool   offload,
+                     bool   unified,
+                            /* layer filters */
+    const layer_filter_cb & filter_attn,
+    const layer_filter_cb & filter_recr) :
     hparams(model.hparams),
     mem_attn(new llama_kv_cache(
         model,
-        filter_attn == nullptr ?
-            [&](int32_t il) { return !hparams.is_recurrent(il); }
-            : filter_attn,
         type_k,
         type_v,
         v_trans,
@@ -44,18 +41,22 @@ llama_memory_hybrid::llama_memory_hybrid(
         n_seq_max,
         n_pad,
         n_swa,
-        swa_type
+        swa_type,
+        filter_attn == nullptr ?
+            [&](int32_t il) { return !hparams.is_recurrent(il); }
+            : filter_attn,
+        nullptr
     )),
     mem_recr(new llama_memory_recurrent(
         model,
-        filter_recr == nullptr ?
-            [&](int32_t il) { return hparams.is_recurrent(il); }
-            : filter_recr,
         type_r,
         type_s,
         offload,
         rs_size,
-        n_seq_max
+        n_seq_max,
+        filter_recr == nullptr ?
+            [&](int32_t il) { return hparams.is_recurrent(il); }
+            : filter_recr
     )) {}
 
 llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
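Taken together, the change moves the layer filters from the front of the hybrid constructor to the tail of the child-cache constructors, and switches them from rvalue references (layer_filter_cb &&) to const references, so the same callback can be read and forwarded without being moved from. llama_kv_cache also gains a trailing argument for which the hybrid memory passes nullptr; given the commit title, this is presumably the new layer-reuse hook, which the hybrid cache simply does not use. The default-filter ternaries are the core routing logic: when the caller supplies no filter, non-recurrent layers go to the attention KV cache and recurrent layers to the recurrent cache. Below is a minimal, self-contained sketch of that pattern, assuming layer_filter_cb is a std::function-style wrapper (its comparison against nullptr in the diff suggests as much); hparams_t and the driver are hypothetical stand-ins, not llama.cpp API.

// Sketch of the default-filter fallback used in the diff above.
// hparams_t and main() are hypothetical; layer_filter_cb mirrors the
// callback shape implied by the diff.
#include <cstdint>
#include <cstdio>
#include <functional>
#include <vector>

// Returns true if the cache should manage layer il.
using layer_filter_cb = std::function<bool(int32_t il)>;

// Stand-in for the model hyperparameters: marks which layers are recurrent.
struct hparams_t {
    std::vector<bool> recurrent;
    bool is_recurrent(int32_t il) const { return recurrent[il]; }
};

int main() {
    const hparams_t hparams = { { false, true, false, true } };

    // No filters supplied by the caller:
    const layer_filter_cb filter_attn = nullptr;
    const layer_filter_cb filter_recr = nullptr;

    // Fall back to the defaults, exactly as the ternaries in the diff do.
    const layer_filter_cb attn = filter_attn == nullptr ?
        [&](int32_t il) { return !hparams.is_recurrent(il); }
        : filter_attn;
    const layer_filter_cb recr = filter_recr == nullptr ?
        [&](int32_t il) { return  hparams.is_recurrent(il); }
        : filter_recr;

    // Each layer lands in exactly one of the two caches.
    for (int32_t il = 0; il < 4; ++il) {
        std::printf("layer %d -> %s cache\n", (int) il,
                attn(il) ? "attn" : recr(il) ? "recurrent" : "none");
    }
    return 0;
}

Note that the ternary compiles even though its operands have different types: the lambda converts implicitly to layer_filter_cb, which therefore becomes the type of the whole conditional expression.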