	llama : fix worst case graph build
@@ -781,7 +781,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 
         std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
         llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0), params.n_threads);
-        llama_kv_cache_keep_seq(lctx, -1);
+        llama_kv_cache_rm_tokens(lctx, -1, -1);
         llama_reset_timings(lctx);
     }
 
@@ -977,7 +977,7 @@ int main(int argc, char ** argv) {
 
         test t(inst, lmodel, ctx);
 
-        llama_kv_cache_keep_seq(ctx, -1);
+        llama_kv_cache_rm_tokens(ctx, -1, -1);
 
         // warmup run
         if (t.n_prompt > 0) {
@@ -988,7 +988,7 @@ int main(int argc, char ** argv) {
         }
 
         for (int i = 0; i < params.reps; i++) {
-            llama_kv_cache_keep_seq(ctx, -1);
+            llama_kv_cache_rm_tokens(ctx, -1, -1);
 
             uint64_t t_start = get_time_ns();
             if (t.n_prompt > 0) {

llama.cpp
@@ -1024,6 +1024,9 @@ struct llama_kv_cache {
     uint32_t head = 0;
     uint32_t size = 0;
 
+    // computed before each graph build
+    uint32_t cell_max = 0;
+
     std::vector<llama_kv_cell> cells;
 
     struct ggml_tensor * k = NULL;
@@ -1314,16 +1317,15 @@ static bool llama_kv_cache_find_slot(
     return true;
 }
 
 // find how many cells are currently in use
 int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
-    int32_t res = 0;
-
-    for (uint32_t i = 0; i < cache.size; i++) {
+    for (uint32_t i = cache.size - 2; i > 0; --i) {
         if (cache.cells[i].pos >= 0 && !cache.cells[i].seq_id.empty()) {
-            res = i + 1;
+            return i + 1;
         }
     }
 
-    return res;
+    return 0;
 }
 
 void llama_kv_cache_rm_tokens(struct llama_kv_cache & cache, int32_t c0, int32_t c1) {
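For illustration only (not part of the commit): a self-contained toy that mirrors the reverse scan above on a simplified cell type, showing that "cell max" means one past the highest cell that still holds a token. The struct and helper names below are invented, and the toy scans all the way down to index 0 for clarity rather than reproducing the exact loop bounds used in the patch.

// standalone toy, not llama.cpp code
#include <cstdio>
#include <set>
#include <vector>

struct toy_kv_cell {
    int pos = -1;           // -1 means the cell is empty
    std::set<int> seq_id;   // sequences that reference this cell
};

// same idea as llama_kv_cache_cell_max above: one past the highest occupied cell
static int toy_cell_max(const std::vector<toy_kv_cell> & cells) {
    for (int i = (int) cells.size() - 1; i >= 0; --i) {
        if (cells[i].pos >= 0 && !cells[i].seq_id.empty()) {
            return i + 1;
        }
    }
    return 0;
}

int main() {
    std::vector<toy_kv_cell> cells(8);

    cells[0].pos = 0; cells[0].seq_id.insert(0);
    cells[1].pos = 1; cells[1].seq_id.insert(0);
    cells[4].pos = 2; cells[4].seq_id.insert(1); // gaps below the highest used cell do not matter

    printf("cell max = %d\n", toy_cell_max(cells)); // prints: cell max = 5
    return 0;
}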
@@ -2604,12 +2606,13 @@ static struct ggml_cgraph * llm_build_llama(
     const int n_gpu_layers = model.n_gpu_layers;
 
     const int32_t n_tokens = batch.n_tokens;
-    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : llama_kv_cache_cell_max(kv_self);
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.cell_max + n_tokens;
+    const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+    const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
 
     //printf("n_kv = %d\n", n_kv);
 
-    const bool do_rope_shift = kv_self.has_shift || ggml_allocr_is_measure(lctx.alloc);
-
     auto & buf_compute = lctx.buf_compute;
 
     struct ggml_init_params params = {
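A minimal standalone sketch (not library code; the helper functions and toy struct are invented) of the selection that the new n_kv / kv_head lines perform: during ggml-alloc's measure pass the graph is sized for the worst case, the full context, so any later real graph fits into the measured buffers; during a normal build only the used cells plus the incoming tokens are attended.

// standalone sketch, not llama.cpp code
#include <cstdint>
#include <cstdio>

// simplified stand-in for the real KV cache state (assumed for illustration)
struct toy_kv_cache {
    uint32_t head     = 0;   // where the new tokens will be written
    uint32_t cell_max = 0;   // computed before each graph build
};

static int32_t pick_n_kv(bool is_measure, int32_t n_ctx, int32_t n_tokens, const toy_kv_cache & kv) {
    // worst case while measuring, otherwise only the used part of the cache plus the batch
    return is_measure ? n_ctx : (int32_t) kv.cell_max + n_tokens;
}

static int32_t pick_kv_head(bool is_measure, int32_t n_ctx, int32_t n_tokens, const toy_kv_cache & kv) {
    // while measuring, pretend the batch lands at the very end of the context
    return is_measure ? n_ctx - n_tokens : (int32_t) kv.head;
}

int main() {
    const int32_t n_ctx = 512, n_tokens = 32;

    toy_kv_cache kv;
    kv.head     = 96;
    kv.cell_max = 96;

    printf("measure: n_kv = %d, kv_head = %d\n",
           pick_n_kv(true,  n_ctx, n_tokens, kv), pick_kv_head(true,  n_ctx, n_tokens, kv));
    printf("build  : n_kv = %d, kv_head = %d\n",
           pick_n_kv(false, n_ctx, n_tokens, kv), pick_kv_head(false, n_ctx, n_tokens, kv));
    return 0;
}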
@@ -2713,13 +2716,26 @@ static struct ggml_cgraph * llm_build_llama(
         }
     }
 
-    // K_shift
-    struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
-    ggml_allocr_alloc(lctx.alloc, K_shift);
-    if (!ggml_allocr_is_measure(lctx.alloc)) {
-        int * data = (int *) K_shift->data;
-        for (int i = 0; i < n_ctx; ++i) {
-            data[i] = kv_self.cells[i].delta;
+    // shift the entire K-cache if needed
+    if (do_rope_shift) {
+        struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
+        ggml_allocr_alloc(lctx.alloc, K_shift);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            int * data = (int *) K_shift->data;
+            for (int i = 0; i < n_ctx; ++i) {
+                data[i] = kv_self.cells[i].delta;
+            }
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_build_forward_expand(gf,
+                    ggml_rope_custom_inplace(ctx0,
+                        ggml_view_3d(ctx0, kv_self.k,
+                            n_embd_head, n_head_kv, n_ctx,
+                            ggml_element_size(kv_self.k)*n_embd_head,
+                            ggml_element_size(kv_self.k)*n_embd_gqa,
+                            ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
+                        K_shift, n_embd_head, 0, 0, freq_base, freq_scale));
         }
     }
 
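The hoisted block above builds the K-shift ops once, and only when a shift is actually pending (or during the measure pass, so the worst-case graph still contains them). A toy sketch of the underlying idea follows; plain integer positions stand in for the RoPE re-rotation applied to the cached K in the real graph, and every type and name here is invented for illustration.

// standalone toy, not llama.cpp code
#include <cstdio>
#include <vector>

struct toy_cell {
    int pos   = -1;   // absolute position stored in the cell (-1 = empty)
    int delta = 0;    // pending shift for this cell
};

int main() {
    std::vector<toy_cell> cells(4);
    for (int i = 0; i < 4; ++i) { cells[i].pos = i; }

    // pretend the two oldest tokens were discarded and the rest shifted back by 2
    cells[0].pos = -1;
    cells[1].pos = -1;
    cells[2].delta = -2;
    cells[3].delta = -2;

    bool has_shift = false;
    for (const auto & c : cells) has_shift = has_shift || (c.delta != 0);

    if (has_shift) {                 // counterpart of the do_rope_shift guard
        for (auto & c : cells) {
            if (c.pos >= 0) {
                c.pos += c.delta;    // the real graph re-rotates the cached K by this amount
            }
            c.delta = 0;
        }
    }

    for (int i = 0; i < 4; ++i) {
        printf("cell %d: pos = %d\n", i, cells[i].pos);
    }
    return 0;
}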
@@ -2748,18 +2764,6 @@ static struct ggml_cgraph * llm_build_llama(
             ggml_set_name(cur, "attention_norm_0");
         }
 
-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            ggml_build_forward_expand(gf,
-                    ggml_rope_custom_inplace(ctx0,
-                        ggml_view_3d(ctx0, kv_self.k,
-                            n_embd_head, n_head_kv, n_ctx,
-                            ggml_element_size(kv_self.k)*n_embd_head,
-                            ggml_element_size(kv_self.k)*n_embd_gqa,
-                            ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
-                        K_shift, n_embd_head, 0, 0, freq_base, freq_scale));
-        }
-
         // self-attention
         {
             // compute Q and K and RoPE them
@@ -2791,13 +2795,13 @@ static struct ggml_cgraph * llm_build_llama(
                 offload_func_v(Vcur);
                 ggml_set_name(Vcur, "Vcur");
 
-                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_self.head));
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
                 offload_func_kq(k);
                 ggml_set_name(k, "k");
 
                 struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
                         (   n_ctx)*ggml_element_size(kv_self.v),
-                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_self.head*ggml_element_size(kv_self.v));
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
                 offload_func_v(v);
                 ggml_set_name(v, "v");
 
@@ -2988,9 +2992,10 @@ static struct ggml_cgraph * llm_build_baichaun(
     const int n_gpu_layers = model.n_gpu_layers;
 
     const int32_t n_tokens = batch.n_tokens;
-    const int32_t n_kv     = llama_kv_cache_cell_max(kv_self);
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.cell_max + n_tokens;
+    const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
 
-    const bool do_rope_shift = kv_self.has_shift || ggml_allocr_is_measure(lctx.alloc);
+    const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
 
     auto & buf_compute = lctx.buf_compute;
 
@@ -3095,13 +3100,26 @@ static struct ggml_cgraph * llm_build_baichaun(
         }
     }
 
-    // K_shift
-    struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
-    ggml_allocr_alloc(lctx.alloc, K_shift);
-    if (!ggml_allocr_is_measure(lctx.alloc)) {
-        int * data = (int *) K_shift->data;
-        for (int i = 0; i < n_ctx; ++i) {
-            data[i] = kv_self.cells[i].delta;
+    // shift the entire K-cache if needed
+    if (do_rope_shift) {
+        struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
+        ggml_allocr_alloc(lctx.alloc, K_shift);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            int * data = (int *) K_shift->data;
+            for (int i = 0; i < n_ctx; ++i) {
+                data[i] = kv_self.cells[i].delta;
+            }
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_build_forward_expand(gf,
+                    ggml_rope_custom_inplace(ctx0,
+                        ggml_view_3d(ctx0, kv_self.k,
+                            n_embd_head, n_head_kv, n_ctx,
+                            ggml_element_size(kv_self.k)*n_embd_head,
+                            ggml_element_size(kv_self.k)*n_embd_gqa,
+                            ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
+                        K_shift, n_embd_head, 0, 0, freq_base, freq_scale));
         }
     }
 
@@ -3130,18 +3148,6 @@ static struct ggml_cgraph * llm_build_baichaun(
             ggml_set_name(cur, "attention_norm_0");
         }
 
-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            ggml_build_forward_expand(gf,
-                    ggml_rope_custom_inplace(ctx0,
-                        ggml_view_3d(ctx0, kv_self.k,
-                            n_embd_head, n_head_kv, n_ctx,
-                            ggml_element_size(kv_self.k)*n_embd_head,
-                            ggml_element_size(kv_self.k)*n_embd_gqa,
-                            ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
-                        K_shift, n_embd_head, 0, 0, freq_base, freq_scale));
-        }
-
         // self-attention
         {
             // compute Q and K and RoPE them
@@ -3186,13 +3192,13 @@ static struct ggml_cgraph * llm_build_baichaun(
                 offload_func_v(Vcur);
                 ggml_set_name(Vcur, "Vcur");
 
-                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_self.head));
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
                 offload_func_kq(k);
                 ggml_set_name(k, "k");
 
                 struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
                         (   n_ctx)*ggml_element_size(kv_self.v),
-                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_self.head*ggml_element_size(kv_self.v));
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
                 offload_func_v(v);
                 ggml_set_name(v, "v");
 
@@ -3387,9 +3393,13 @@ static struct ggml_cgraph * llm_build_falcon(
     const int n_gpu_layers = model.n_gpu_layers;
 
     const int32_t n_tokens = batch.n_tokens;
-    const int32_t n_kv     = llama_kv_cache_cell_max(kv_self);
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.cell_max + n_tokens;
+    const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
 
-    const bool do_rope_shift = kv_self.has_shift || ggml_allocr_is_measure(lctx.alloc);
+    const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
+
+    //printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n",
+    //        kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift);
 
     auto & buf_compute = lctx.buf_compute;
 
@@ -3494,13 +3504,26 @@ static struct ggml_cgraph * llm_build_falcon(
         }
     }
 
-    // K_shift
-    struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
-    ggml_allocr_alloc(lctx.alloc, K_shift);
-    if (!ggml_allocr_is_measure(lctx.alloc)) {
-        int * data = (int *) K_shift->data;
-        for (int i = 0; i < n_ctx; ++i) {
-            data[i] = kv_self.cells[i].delta;
+    // shift the entire K-cache if needed
+    if (do_rope_shift) {
+        struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
+        ggml_allocr_alloc(lctx.alloc, K_shift);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            int * data = (int *) K_shift->data;
+            for (int i = 0; i < n_ctx; ++i) {
+                data[i] = kv_self.cells[i].delta;
+            }
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_build_forward_expand(gf,
+                    ggml_rope_custom_inplace(ctx0,
+                        ggml_view_3d(ctx0, kv_self.k,
+                            n_embd_head, n_head_kv, n_ctx,
+                            ggml_element_size(kv_self.k)*n_embd_head,
+                            ggml_element_size(kv_self.k)*n_embd_gqa,
+                            ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
+                        K_shift, n_embd_head, 2, 0, freq_base, freq_scale));
         }
     }
 
@@ -3515,18 +3538,6 @@ static struct ggml_cgraph * llm_build_falcon(
         }
 #endif // GGML_USE_CUBLAS
 
-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            ggml_build_forward_expand(gf,
-                    ggml_rope_custom_inplace(ctx0,
-                        ggml_view_3d(ctx0, kv_self.k,
-                            n_embd_head, n_head_kv, n_ctx,
-                            ggml_element_size(kv_self.k)*n_embd_head,
-                            ggml_element_size(kv_self.k)*n_embd_gqa,
-                            ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
-                        K_shift, n_embd_head, 2, 0, freq_base, freq_scale));
-        }
-
         // self-attention
         // TODO: refactor into common function (shared with LLaMA)
         {
@@ -3603,13 +3614,13 @@ static struct ggml_cgraph * llm_build_falcon(
                 offload_func_v(Vcur->src[0]->src[0]);
                 ggml_set_name(Vcur, "Vcur");
 
-                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_self.head));
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
                 offload_func_kq(k);
                 ggml_set_name(k, "k");
 
                 struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
                         (   n_ctx)*ggml_element_size(kv_self.v),
-                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_self.head*ggml_element_size(kv_self.v));
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
                 offload_func_v(v);
 
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
@@ -3741,7 +3752,8 @@ static struct ggml_cgraph * llm_build_starcoder(
     const float norm_eps = hparams.f_norm_eps;
 
     const int32_t n_tokens = batch.n_tokens;
-    const int32_t n_kv     = llama_kv_cache_cell_max(kv_self);
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.cell_max + n_tokens;
+    const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
 
     auto & buf_compute = lctx.buf_compute;
 
@@ -3853,12 +3865,12 @@ static struct ggml_cgraph * llm_build_starcoder(
                 struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
                 ggml_set_name(Vcur, "Vcur");
 
-                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_self.head));
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
                 ggml_set_name(k, "k");
 
                 struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
                         (   n_ctx)*ggml_element_size(kv_self.v),
-                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_self.head*ggml_element_size(kv_self.v));
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
 
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
@@ -4054,8 +4066,15 @@ static bool llama_eval_internal(
         batch.seq_id = seq_id.data();
     }
 
+    // we always start to search for a free slot from the start of the cache
+    // TODO: better strategies can be implemented
     kv_self.head = 0;
 
+    // a heuristic, to avoid attending the full cache if it is not yet utilized
+    // after enough generations, the benefit from this heuristic disappears
+    // if we start defragmenting the cache, the benefit from this will be more important
+    kv_self.cell_max = llama_kv_cache_cell_max(kv_self);
+
     if (!llama_kv_cache_find_slot(kv_self, batch)) {
         return false;
     }
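A minimal toy (invented types, not the llama.cpp API) of the per-decode bookkeeping order that this hunk establishes: reset the search head, recompute cell_max as a cheap upper bound on the used part of the cache, then find a slot for the incoming batch. The graph built afterwards only needs to attend roughly cell_max + n_tokens cells instead of the full context.

// standalone toy, not llama.cpp code
#include <cstdint>
#include <cstdio>
#include <vector>

struct toy_cache {
    uint32_t head     = 0;
    uint32_t cell_max = 0;
    std::vector<int> pos;     // pos[i] < 0 means cell i is free
    explicit toy_cache(uint32_t n) : pos(n, -1) {}
};

static uint32_t toy_cell_max(const toy_cache & c) {
    for (uint32_t i = (uint32_t) c.pos.size(); i > 0; --i) {
        if (c.pos[i - 1] >= 0) return i;
    }
    return 0;
}

// find n_tokens consecutive free cells, searching from c.head
static bool toy_find_slot(toy_cache & c, uint32_t n_tokens, int first_pos) {
    for (uint32_t start = c.head; start + n_tokens <= c.pos.size(); ++start) {
        bool ok = true;
        for (uint32_t i = 0; i < n_tokens; ++i) ok = ok && c.pos[start + i] < 0;
        if (ok) {
            for (uint32_t i = 0; i < n_tokens; ++i) c.pos[start + i] = first_pos + (int) i;
            c.head = start;   // new tokens will be written starting here
            return true;
        }
    }
    return false;
}

int main() {
    toy_cache kv(16);

    for (int step = 0; step < 3; ++step) {
        const uint32_t n_tokens = 4;

        kv.head     = 0;                  // always search from the start of the cache
        kv.cell_max = toy_cell_max(kv);   // heuristic: attend only up to the used part

        if (!toy_find_slot(kv, n_tokens, step * (int) n_tokens)) {
            printf("no slot\n");
            return 1;
        }
        printf("step %d: head = %u, cell_max = %u, n_kv = %u\n",
               step, kv.head, kv.cell_max, kv.cell_max + n_tokens);
    }
    return 0;
}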