https://github.com/ggml-org/llama.cpp.git
llama : unified KV cache + batch inference API
@@ -158,7 +158,8 @@ int main(int argc, char ** argv)
     }

     std::cout << std::flush;

-    int n_past = llama_get_kv_cache_token_count(ctx);
+    int n_past = 0;

     if (llama_eval(ctx, tokens_list.data(), tokens_list.size(), n_past, params.n_threads))
     {
         fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
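For context, a minimal sketch of how the caller is expected to track the decoding position after this change: n_past now starts at 0 and is advanced by the example itself, instead of being read back from the context with llama_get_kv_cache_token_count(). This assumes the surrounding setup of the original example (ctx, tokens_list, params) and the pre-batch-API llama_eval call shown in the hunk; the final n_past increment is an assumption about the rest of the loop, not part of this diff.

    // Sketch only (not part of this commit): assumes ctx, tokens_list and
    // params are already set up as in the surrounding example code.
    int n_past = 0;

    // evaluate the whole prompt in one call, starting at position 0
    if (llama_eval(ctx, tokens_list.data(), (int) tokens_list.size(), n_past, params.n_threads))
    {
        fprintf(stderr, "%s : failed to eval prompt.\n", __func__);
        return 1;
    }

    // the caller advances the position itself once the prompt has been evaluated
    n_past += (int) tokens_list.size();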