	llama : store non-RoPEd K cache (WIP)

 llama.cpp | 42 +++++++++++++++++++++++++++++++-----------
 1 file changed, 31 insertions(+), 11 deletions(-)

--- a/llama.cpp
+++ b/llama.cpp
@@ -2428,16 +2428,25 @@ static struct ggml_cgraph * llm_build_llama(
         }
     }
 
-    // KQ_pos - contains the positions
-    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    ggml_allocr_alloc(lctx.alloc, KQ_pos);
+    // Q_pos - contains the positions
+    struct ggml_tensor * Q_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    ggml_allocr_alloc(lctx.alloc, Q_pos);
     if (!ggml_allocr_is_measure(lctx.alloc)) {
-        int * data = (int *) KQ_pos->data;
+        int * data = (int *) Q_pos->data;
         for (int i = 0; i < N; ++i) {
             data[i] = n_past + i;
         }
     }
 
+    struct ggml_tensor * K_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_past + N);
+    ggml_allocr_alloc(lctx.alloc, K_pos);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        int * data = (int *) K_pos->data;
+        for (int i = 0; i < n_past + N; ++i) {
+            data[i] = i;
+        }
+    }
+
     for (int il = 0; il < n_layer; ++il) {
         ggml_format_name(inpL, "layer_inp_%d", il);
 
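A note on the two position tensors introduced above: Q_pos holds the absolute positions of only the N new tokens (n_past .. n_past+N-1), while K_pos covers every slot that will be live in the K cache (0 .. n_past+N-1), because the whole cache gets RoPE-ed at read time further down. A minimal standalone sketch of the same indexing, with toy values for n_past and N that are not part of the commit:

    #include <cstdio>
    #include <vector>

    int main() {
        const int n_past = 4; // tokens already in the KV cache (toy value)
        const int N      = 2; // new tokens in this batch      (toy value)

        // Q_pos: absolute positions of the new tokens only
        std::vector<int> Q_pos(N);
        for (int i = 0; i < N; ++i) {
            Q_pos[i] = n_past + i;
        }

        // K_pos: absolute positions of every cache entry, old and new
        std::vector<int> K_pos(n_past + N);
        for (int i = 0; i < n_past + N; ++i) {
            K_pos[i] = i;
        }

        for (int p : Q_pos) printf("Q_pos: %d\n", p); // 4 5
        for (int p : K_pos) printf("K_pos: %d\n", p); // 0 1 2 3 4 5
        return 0;
    }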
@@ -2474,14 +2483,18 @@ static struct ggml_cgraph * llm_build_llama(
             offload_func_kq(tmpq);
             ggml_set_name(tmpq, "tmpq");
 
-            struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
+            // Note: we are not RoPE-ing K here
+            struct ggml_tensor * Kcur = tmpk;
             offload_func_kq(Kcur);
             ggml_set_name(Kcur, "Kcur");
 
-            struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N),    KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
+            struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N),    Q_pos, n_embd_head, 0, 0, freq_base, freq_scale);
             offload_func_kq(Qcur);
             ggml_set_name(Qcur, "Qcur");
 
+            struct ggml_tensor * ck;
+            struct ggml_tensor * cv;
+
             // store key and value to memory
             {
                 // compute the transposed [N, n_embd] V matrix
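The point of caching the non-RoPE-ed K (Kcur = tmpk above) is that the rotation can be applied, or re-applied, when the cache is read: rotations at the same frequency compose additively in the position, so a raw key can be rotated to whatever absolute position is current at read time. A self-contained check of that composition property, using the textbook pairwise-rotation form of RoPE rather than ggml's actual ggml_rope_custom kernel (an assumed simplification):

    #include <cassert>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Textbook RoPE on one head vector: rotate each (x[2i], x[2i+1]) pair
    // by the angle pos * theta_base^(-2i/d). Simplified stand-in for ggml's kernel.
    static std::vector<float> rope(const std::vector<float> & x, float pos, float theta_base = 10000.0f) {
        const int d = (int) x.size();
        std::vector<float> out(d);
        for (int i = 0; i < d; i += 2) {
            const float theta = pos * std::pow(theta_base, -(float) i / d);
            const float c = std::cos(theta);
            const float s = std::sin(theta);
            out[i]     = x[i]*c - x[i + 1]*s;
            out[i + 1] = x[i]*s + x[i + 1]*c;
        }
        return out;
    }

    int main() {
        const std::vector<float> k = {0.3f, -1.2f, 0.7f, 2.0f}; // raw (non-RoPE-ed) key

        // Rotations add their angles, so rotating a raw key by position 5
        // equals rotating it by 2 and then by 3 more:
        const std::vector<float> direct  = rope(k, 5.0f);
        const std::vector<float> stepped = rope(rope(k, 2.0f), 3.0f);

        for (size_t i = 0; i < k.size(); ++i) {
            assert(std::fabs(direct[i] - stepped[i]) < 1e-5f);
        }
        printf("RoPE rotations compose: a raw cached K can be rotated at read time\n");
        return 0;
    }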
@@ -2504,9 +2517,11 @@ static struct ggml_cgraph * llm_build_llama(
                 offload_func_v(v);
                 ggml_set_name(v, "v");
 
-                // important: storing RoPE-ed version of K in the KV cache!
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+                ck = ggml_cpy(ctx0, Kcur, k);
+                cv = ggml_cpy(ctx0, Vcur, v);
+
+                ggml_build_forward_expand(gf, ck);
+                ggml_build_forward_expand(gf, cv);
             }
 
             struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
@@ -2515,13 +2530,18 @@ static struct ggml_cgraph * llm_build_llama(
 
             struct ggml_tensor * K =
                 ggml_view_3d(ctx0, kv_self.k,
-                        n_embd_head, n_past + N, n_head_kv,
-                        ggml_element_size(kv_self.k)*n_embd_gqa,
+                        n_embd_head, n_head_kv, n_past + N,
                         ggml_element_size(kv_self.k)*n_embd_head,
+                        ggml_element_size(kv_self.k)*n_embd_gqa,
                         ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
             offload_func_kq(K);
             ggml_set_name(K, "K");
 
+            // RoPE the K cache
+            K->src[1] = ck; // TODO: HACK!!
+            K = ggml_rope_custom(ctx0, K, K_pos, n_embd_head, 0, 0, freq_base, freq_scale);
+            K = ggml_permute(ctx0, K, 0, 2, 1, 3);
+
             // K * Q
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
             offload_func_kq(KQ);
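Reading the last hunk: the cache view is laid out as (n_embd_head, n_head_kv, n_past+N) so the rope over the view can take one entry of K_pos per cached token, the K->src[1] = ck assignment (the acknowledged hack) forces the copy into the cache to be computed before the view is rotated, and the permute restores the (n_embd_head, n_past+N, n_head_kv) layout expected by the K*Q matmul. RoPE-ing Q at write time but K only at read time still yields consistent attention because the RoPE-ed Q·K dot product depends only on the relative offset between the two absolute positions. A quick numeric check of that property, with the same simplified pairwise RoPE as in the earlier sketch (not ggml's kernel):

    #include <cassert>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Same simplified pairwise RoPE as before.
    static std::vector<float> rope(const std::vector<float> & x, float pos, float theta_base = 10000.0f) {
        const int d = (int) x.size();
        std::vector<float> out(d);
        for (int i = 0; i < d; i += 2) {
            const float theta = pos * std::pow(theta_base, -(float) i / d);
            const float c = std::cos(theta);
            const float s = std::sin(theta);
            out[i]     = x[i]*c - x[i + 1]*s;
            out[i + 1] = x[i]*s + x[i + 1]*c;
        }
        return out;
    }

    static float dot(const std::vector<float> & a, const std::vector<float> & b) {
        float sum = 0.0f;
        for (size_t i = 0; i < a.size(); ++i) {
            sum += a[i]*b[i];
        }
        return sum;
    }

    int main() {
        const std::vector<float> q = {0.9f, -0.4f, 1.1f, 0.2f};
        const std::vector<float> k = {0.3f, -1.2f, 0.7f, 2.0f};

        // The attention score depends only on the relative offset m - n,
        // regardless of when each side was rotated:
        const float s1 = dot(rope(q, 5.0f), rope(k, 3.0f)); // offset 2
        const float s2 = dot(rope(q, 9.0f), rope(k, 7.0f)); // offset 2

        assert(std::fabs(s1 - s2) < 1e-4f);
        printf("scores match for equal offsets: %f vs %f\n", s1, s2);
        return 0;
    }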