server : clear the KV cache beyond n_past before llama_decode
@@ -434,6 +434,10 @@ struct llama_server_context
             {
                 n_eval = params.n_batch;
             }
+
+            // since #3228 we now have to manually manage the KV cache
+            llama_kv_cache_tokens_rm(ctx, n_past, -1);
+
             if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0), params.n_threads))
             {
                 LOG_ERROR("failed to eval", {
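For context, a hedged sketch of the evaluation loop this hunk sits in. Assumptions: the llama.cpp C API of this commit's era (llama_decode still takes an n_threads argument and llama_kv_cache_tokens_rm is still present), embd is the pending token buffer, and n_past counts tokens already in the KV cache. The function name eval_tokens and the parameter plumbing are illustrative, not the server's actual code; only the lines around llama_kv_cache_tokens_rm come from the diff above. The point of the patch is visible in the middle of the loop: before each decode, cache entries at positions >= n_past are removed (the -1 is read here as "up to the last position", an assumption about the API), so a shorter follow-up prompt cannot collide with stale tokens left over from an earlier, longer decode.

#include "llama.h"

#include <vector>

// Sketch of the server's token-evaluation loop with the fix applied.
static bool eval_tokens(llama_context * ctx, std::vector<llama_token> & embd,
                        int & n_past, int n_batch, int n_threads)
{
    while (n_past < (int) embd.size())
    {
        // evaluate at most n_batch tokens per llama_decode call
        int n_eval = (int) embd.size() - n_past;
        if (n_eval > n_batch)
        {
            n_eval = n_batch;
        }

        // since #3228 the KV cache is managed manually: drop any cached tokens
        // at positions >= n_past so stale entries from a previous, longer
        // generation cannot clash with the batch decoded below
        llama_kv_cache_tokens_rm(ctx, n_past, -1);

        if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0), n_threads))
        {
            return false; // decode failed; the server logs and aborts the request
        }

        n_past += n_eval;
    }
    return true;
}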