kv-cache : fix out-of-bounds view during reserve graph (#13547)

* kv-cache : fix reserve graph out-of-bounds access

ggml-ci

* cont : add comment

* cont : fix comments [no ci]

* cont : more correct comment [no ci]
@@ -441,6 +441,13 @@ void llama_kv_cache_unified::defrag_sched(float thold) {
 
 void llama_kv_cache_unified::set_full() {
     n = size;
+
+    // when simulating a full KV cache, the specific value of the "head" pointer is not important because it does not
+    //   affect the shapes of the tensors in the compute graph - it only affects the offsets of the K/V views.
+    //   we should only guarantee that the head position won't cause out-of-bounds view of the K, V tensors, so
+    //   setting it to 0 is the simplest way to achieve that
+    // ref: https://github.com/ggml-org/llama.cpp/issues/13359
+    head = 0;
 }
 
 llama_sbatch llama_kv_cache_unified::sbatch_init(
@@ -1712,6 +1719,7 @@ void llama_kv_cache_recurrent::defrag_sched(float thold) {
 
 void llama_kv_cache_recurrent::set_full() {
     n = size;
+    head = 0;
 }
 
 llama_sbatch llama_kv_cache_recurrent::sbatch_init(
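The reasoning in the new comment can be made concrete with a minimal C++ sketch. The member names mirror the diff, but the struct, check_view(), and main() are invented for illustration and are not the actual llama.cpp code: the K/V views cover cells [head, head + n), so the reserve graph, which sets n = size, stays in bounds only if head == 0.

```cpp
#include <cassert>
#include <cstdint>

// Hypothetical sketch of the invariant behind the fix (not llama.cpp itself).
struct kv_cache_sketch {
    uint32_t head = 0; // first cell covered by the K/V views
    uint32_t size = 0; // total number of cells in the cache
    uint32_t n    = 0; // number of cells the current graph views cover

    void set_full() {
        n    = size;
        head = 0; // any other value would push the views past the end
    }

    void check_view() const {
        // the views span cells [head, head + n); this must fit in the cache
        assert(head + n <= size);
    }
};

int main() {
    kv_cache_sketch kv;
    kv.size = 512;
    kv.head = 37;    // leftover head position from a previous decode
    kv.set_full();   // reserve-graph path: n = size, head reset to 0
    kv.check_view(); // holds only because set_full() reset head
}
```

Before this fix, set_full() left head at whatever value the last decode produced, so with n == size the reserved K/V views could extend past the end of the tensors, which is the out-of-bounds access reported in issue #13359.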
@@ -171,11 +171,8 @@ public:
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
     void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) override;
 
-    // Note: The value of head isn't only used to optimize searching
-    // for a free KV slot. llama_decode_impl also uses it, so it
-    // cannot be freely changed after a slot has been allocated.
-    uint32_t head = 0;
-    uint32_t size = 0;
+    uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
+    uint32_t size = 0; // total number of cells, shared across all sequences
     uint32_t used = 0; // used cells (i.e. at least one seq_id)
 
     // computed before each graph build
@@ -343,11 +340,8 @@ public:
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
     void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) override;
 
-    // Note: The value of head isn't only used to optimize searching
-    // for a free KV slot. llama_decode_impl also uses it, so it
-    // cannot be freely changed after a slot has been allocated.
-    uint32_t head = 0;
-    uint32_t size = 0;
+    uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
+    uint32_t size = 0; // total number of cells, shared across all sequences
     uint32_t used = 0; // used cells (i.e. at least one seq_id)
 
     // computed before each graph build
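For context on the new comment on head ("the location where the batch will be placed in the cache, see find_slot()"), here is a hedged, heavily simplified sketch of how such a search might use head as its starting point. The real find_slot() also tracks sequence ids, positions, and the used counter, none of which are modeled here; cells_sketch and find_slot_sketch are invented names.

```cpp
#include <cstdint>
#include <vector>

// Simplified free-slot search (illustrative only, not the llama.cpp code).
struct cells_sketch {
    std::vector<int32_t> seq;  // one entry per cell; -1 means the cell is free
    uint32_t head = 0;         // where the last batch was placed

    // find n_tokens consecutive free cells, starting the search at head
    bool find_slot_sketch(uint32_t n_tokens) {
        const uint32_t size = (uint32_t) seq.size();
        if (n_tokens > size) {
            return false;
        }
        for (uint32_t start = 0; start < size; ++start) {
            const uint32_t pos = (head + start) % size;
            if (pos + n_tokens > size) {
                continue; // the slot must be contiguous, no wrap-around
            }
            bool free = true;
            for (uint32_t i = 0; i < n_tokens; ++i) {
                if (seq[pos + i] != -1) { free = false; break; }
            }
            if (free) {
                head = pos; // the batch goes into cells [head, head + n_tokens)
                return true;
            }
        }
        return false;
    }
};

int main() {
    cells_sketch cells;
    cells.seq.assign(8, -1);      // 8 free cells
    cells.seq[0] = 0;             // cell 0 already holds sequence 0
    cells.find_slot_sketch(3);    // places the batch at head == 1
}
```

This is also why head doubles as the offset of the K/V views during decode: after a successful search it points exactly at the cells the current batch occupies, which is the property the set_full() fix above relies on.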
Author: Georgi Gerganov