Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-30 08:42:00 +00:00)
			
		
		
		
	llama : pad KV cache size (#4280)
* llama : pad KV cache size to 32
* metal : try to improve batched decoding
This commit is contained in:
		| @@ -1083,7 +1083,7 @@ void ggml_metal_graph_compute( | |||||||
|  |  | ||||||
|                             // find the break-even point where the matrix-matrix kernel becomes more efficient compared |                             // find the break-even point where the matrix-matrix kernel becomes more efficient compared | ||||||
|                             // to the matrix-vector kernel |                             // to the matrix-vector kernel | ||||||
|                             int ne11_mm_min = 1; |                             int ne11_mm_min = src0t == GGML_TYPE_F16 ? 1 : 16; | ||||||
|  |  | ||||||
| #if 0 | #if 0 | ||||||
|                             // the numbers below are measured on M2 Ultra for 7B and 13B models |                             // the numbers below are measured on M2 Ultra for 7B and 13B models | ||||||
|   | |||||||
| @@ -5744,8 +5744,7 @@ static int llama_decode_internal( | |||||||
|     // a heuristic, to avoid attending the full cache if it is not yet utilized |     // a heuristic, to avoid attending the full cache if it is not yet utilized | ||||||
|     // after enough generations, the benefit from this heuristic disappears |     // after enough generations, the benefit from this heuristic disappears | ||||||
|     // if we start defragmenting the cache, the benefit from this will be more important |     // if we start defragmenting the cache, the benefit from this will be more important | ||||||
|     //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32));   // TODO: this might be better for CUDA? |     kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32))); | ||||||
|     kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self))); |  | ||||||
|  |  | ||||||
|     //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); |     //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Georgi Gerganov
					Georgi Gerganov