mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-30 08:42:00 +00:00)
	server : fix kv cache management (#3588)
@@ -405,6 +405,7 @@ struct llama_server_context
         // compare the evaluated prompt with the new prompt
         n_past = common_part(embd, prompt_tokens);
         embd = prompt_tokens;
+
         if (n_past == num_prompt_tokens)
         {
             // we have to evaluate at least 1 token to generate logits.
@@ -412,6 +413,9 @@ struct llama_server_context
             n_past--;
         }
 
+        // since #3228 we now have to manually manage the KV cache
+        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+
         LOG_VERBOSE("prompt ingested", {
                                            {"n_past", n_past},
                                            {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
@@ -461,9 +465,6 @@ struct llama_server_context
         // compare the evaluated prompt with the new prompt
         n_past = common_part(embd, prompt_tokens);
 
-        // since #3228 we now have to manually manage the KV cache
-        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
-
         embd = prompt_tokens;
         if (n_past == num_prompt_tokens)
         {
@@ -471,6 +472,9 @@ struct llama_server_context
             n_past--;
         }
 
+        // since #3228 we now have to manually manage the KV cache
+        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+
         LOG_VERBOSE("prompt ingested", {
                                            {"n_past", n_past},
                                            {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
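The change adds the llama_kv_cache_seq_rm call to the first prompt-loading path, which previously never trimmed the cache, and in the second path moves it below the n_past-- adjustment, so the cell at the position about to be re-evaluated is also cleared. Below is a minimal, self-contained sketch of the resulting pattern, not the server's actual code: the helper names count_common_prefix and reuse_prompt_cache are illustrative stand-ins for the server's common_part() and prompt-loading logic.

// Sketch only: trim the KV cache after n_past has been finalized.
#include <cstddef>
#include <vector>

#include "llama.h"

// length of the shared prefix between the cached and the new prompt
// (stand-in for the server's common_part() helper)
static size_t count_common_prefix(const std::vector<llama_token> & a,
                                  const std::vector<llama_token> & b) {
    size_t i = 0;
    while (i < a.size() && i < b.size() && a[i] == b[i]) {
        i++;
    }
    return i;
}

// Reuse the already-evaluated prefix, then clear the stale tail of
// sequence 0 from the KV cache. Returns the number of reusable tokens;
// everything from that position on still has to be decoded.
static size_t reuse_prompt_cache(llama_context * ctx,
                                 std::vector<llama_token> & embd,
                                 const std::vector<llama_token> & prompt_tokens) {
    size_t n_past = count_common_prefix(embd, prompt_tokens);
    embd = prompt_tokens;

    if (n_past == prompt_tokens.size() && n_past > 0) {
        // at least one token must be evaluated to produce logits,
        // so step back and re-evaluate the last prompt token
        n_past--;
    }

    // since #3228 the KV cache is managed manually: drop every cell of
    // sequence 0 from position n_past to the end (-1 means "until the end"),
    // including the cell of the token that is about to be re-evaluated
    llama_kv_cache_seq_rm(ctx, 0, (llama_pos) n_past, -1);

    return n_past;
}

A caller would then decode the remaining tokens embd[n_past..] at positions starting from n_past; the sketch stops before that step because only the cache trimming is affected by this commit.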
Author: Georgi Gerganov