	llama : pad KV cache size (#4280)
Author: Georgi Gerganov

* llama : pad KV cache size to 32
* metal : try to improve batched decoding
```diff
@@ -5744,8 +5744,7 @@ static int llama_decode_internal(
     // a heuristic, to avoid attending the full cache if it is not yet utilized
     // after enough generations, the benefit from this heuristic disappears
     // if we start defragmenting the cache, the benefit from this will be more important
-    //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32));   // TODO: this might be better for CUDA?
-    kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
+    kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
 
     //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
 
```
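For illustration, here is a minimal standalone sketch of the padding arithmetic in the hunk above. The `pad32()` helper and the sample `n_ctx` / `cell_max` values are hypothetical stand-ins: `pad32()` mimics what `GGML_PAD(x, 32)` is assumed to do (round `x` up to the next multiple of 32), and `cell_max` plays the role of `llama_kv_cache_cell_max(kv_self)`, the extent of the KV cache actually in use. Rounding `kv_self.n` up this way means the attention kernels always see a KV length aligned to 32 rather than an arbitrary value, which is the property the "metal : try to improve batched decoding" part of the commit message is after.

```cpp
// Minimal sketch (not llama.cpp code): pad32() is a hypothetical stand-in
// for GGML_PAD(x, 32), assumed to round x up to the next multiple of 32.
#include <algorithm>
#include <cstdint>
#include <cstdio>

static int32_t pad32(int32_t x) {
    return (x + 31) & ~31; // round up; valid because 32 is a power of two
}

int main() {
    const int32_t n_ctx = 4096; // hypothetical context size (cparams.n_ctx)

    // cell_max stands in for llama_kv_cache_cell_max(kv_self): how far into
    // the KV cache the occupied cells reach.
    for (int32_t cell_max : {1, 33, 100, 4090}) {
        const int32_t n_old = std::min(n_ctx, std::max(32, cell_max));        // before this commit
        const int32_t n_new = std::min(n_ctx, std::max(32, pad32(cell_max))); // after this commit
        printf("cell_max = %4d -> kv_self.n = %4d (old), %4d (new)\n",
               cell_max, n_old, n_new);
    }
    return 0;
}
```

With these sample values, `cell_max = 33` yields `kv_self.n = 33` before the change and `64` after: at most 31 extra cells are attended, in exchange for an aligned KV length, and the `std::min` against `n_ctx` still keeps the result inside the context window.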