Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-11-03 09:22:01 +00:00)
	llama : optimize defrag moves + fix fragmentation calculation (#6037)
* attempt to reduce the impact of a worst-case scenario
* fragmentation calculation fix
* Update llama.cpp
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Committed by GitHub
Parent: 3ca23481dd
Commit: 2c4fb69246
Showing 1 changed file: llama.cpp (30 lines changed: 19 additions, 11 deletions)
@@ -9036,8 +9036,8 @@ static int llama_decode_internal(
     //llama_synchronize(&lctx);
 
     // decide if we need to defrag the kv cache
-    if (cparams.defrag_thold >= 0.0f) {
-        const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used + n_tokens_all)/float(kv_self.n) : 0.0f;
+    if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) {
+        const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f;
 
         // queue defragmentation for next llama_kv_cache_update
         if (fragmentation > cparams.defrag_thold) {
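The fragmentation ratio in the hunk above is the share of unused cells in the active part of the KV cache; the fix measures it from kv_self.used alone instead of adding n_tokens_all on top of it, and the check now also requires causal attention. A minimal standalone sketch of the corrected check with made-up numbers (only n, used and defrag_thold mirror fields from the diff; everything else is illustrative):

#include <cstdint>
#include <cstdio>

// Minimal sketch of the corrected fragmentation check (sample values are made up):
// fragmentation is the fraction of unused cells in the active KV window, and
// windows smaller than 128 cells are never considered fragmented.
int main() {
    const uint32_t n    = 512;        // stands in for kv_self.n (active KV window size)
    const uint32_t used = 384;        // stands in for kv_self.used (occupied cells)
    const float defrag_thold = 0.1f;  // stands in for cparams.defrag_thold

    const float fragmentation = n >= 128 ? 1.0f - float(used)/float(n) : 0.0f;

    if (fragmentation > defrag_thold) {
        // here llama.cpp would queue a defrag for the next llama_kv_cache_update
        printf("fragmentation %.2f > threshold %.2f -> defrag\n", fragmentation, defrag_thold);
    }
    return 0;
}

With these sample values the ratio is 1 - 384/512 = 0.25, which exceeds the 0.10 threshold, so a defrag would be queued.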
@@ -9069,6 +9069,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
     // number of cells moved
     uint32_t n_moves = 0;
 
+    // each move requires 6*n_layer tensors (see build_defrag)
+    //   - source view, destination view, copy operation
+    //   - x2 for keys and values
+    const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
+
     // determine which KV cells to move where
     //
     //  cell i moves to ids[i]
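The new max_moves cap exists because every scheduled cell move is later expanded by build_defrag into 6*n_layer graph nodes (source view, destination view and copy op, times two for K and V), so the per-graph node budget bounds how many moves one defrag pass may schedule. A small sketch of that arithmetic, assuming an illustrative LLAMA_MAX_NODES of 8192 and a 32-layer model:

#include <cstdint>
#include <cstdio>

// Sketch of the move budget: 6 tensors per layer per move (2 views + 1 copy, x2 for K/V).
int main() {
    const uint32_t max_nodes = 8192;                    // assumed stand-in for LLAMA_MAX_NODES
    const uint32_t n_layer   = 32;                      // e.g. a 7B model
    const uint32_t max_moves = max_nodes/(6*n_layer);   // 8192/192 = 42 moves per defrag graph

    printf("up to %u KV cell moves per defrag graph\n", max_moves);
    return 0;
}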
@@ -9095,15 +9100,6 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
             nh++;
         }
 
-        // each move requires 6*n_layer tensors (see build_defrag)
-        //   - source view, destination view, copy operation
-        //   - x2 for keys and values
-        //
-        if (6*(n_moves + nh)*n_layer >= LLAMA_MAX_NODES) {
-            // the graph is too big, we cannot move more cells
-            break;
-        }
-
        uint32_t nf = 0;
        uint32_t is = n_kv - 1;
 
@@ -9133,11 +9129,19 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
         // are we moving a continuous block of memory?
         bool cont = false;
 
+        // should we stop searching for the next move?
+        bool stop = false;
+
         // go back and move the nf cells to the hole
         for (; i1 < n_kv; ++i1) {
             auto & cell1 = kv_self.cells[i1];
 
             if (cell1.is_empty() || ids[i1] != n_kv) {
+                if (n_moves == max_moves) {
+                    stop = true;
+                    break;
+                }
+
                 cont = false;
                 continue;
             }
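The stop flag added here is the usual two-level break: the inner loop over cells cannot break out of the enclosing search loop directly, so it records that the move budget is spent and the outer loop (see the next hunk) checks the flag right after the inner loop and bails out too. A minimal sketch of that control flow with illustrative loop bounds (only n_moves, max_moves and stop mirror names from the diff):

#include <cstdint>
#include <cstdio>

// Sketch of the nested-loop exit: the inner loop sets `stop` once the budget is
// exhausted; the outer loop checks it right after the inner loop and breaks too.
int main() {
    const uint32_t max_moves = 3;   // illustrative budget
    uint32_t n_moves = 0;

    for (uint32_t hole = 0; hole < 10; ++hole) {       // outer: holes to fill
        bool stop = false;
        for (uint32_t cell = 0; cell < 4; ++cell) {    // inner: cells to relocate
            if (n_moves == max_moves) {
                stop = true;                           // budget spent: unwind both loops
                break;
            }
            ++n_moves;                                 // pretend we scheduled a move
        }
        if (stop || n_moves == max_moves) {
            break;                                     // mirrors the check after the inner loop
        }
    }
    printf("scheduled %u moves\n", n_moves);
    return 0;
}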
@@ -9164,6 +9168,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
             }
         }
 
+        if (stop || n_moves == max_moves) {
+            break;
+        }
+
         //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
 
         i0 += nh - 1;
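For completeness, the threshold tested in the first hunk is user-configurable. A short sketch of setting it from the C API, assuming the defrag_thold field of llama_context_params as exposed in llama.h of this period (model loading omitted):

#include "llama.h"

// Sketch: opt in to automatic KV-cache defrag by setting the threshold checked
// in llama_decode_internal above; a negative value leaves defrag disabled.
int main() {
    llama_context_params cparams = llama_context_default_params();
    cparams.defrag_thold = 0.1f;  // defrag once >10% of the active KV cells are unused

    // ... load a model with llama_load_model_from_file() and create the context
    //     with llama_new_context_with_model(model, cparams) as usual ...
    return 0;
}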