parallel : process system prompt once + configurable parameters + llama API

2025-11-01 09:01:57 +00:00 · 2023-09-19 17:00:42 +03:00
parent 82e20e9ba0
commit 4b5f3cd6bf
9 changed files with 187 additions and 93 deletions
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -207,7 +207,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
        const auto t_start = std::chrono::high_resolution_clock::now();

        // clear the KV cache
-        llama_kv_cache_keep_seq(ctx, -1);
+        llama_kv_cache_tokens_rm(ctx, -1, -1);

        for (int j = 0; j < num_batches; ++j) {
            const int batch_start = start + j * n_batch;
@@ -335,7 +335,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
        const auto t_start = std::chrono::high_resolution_clock::now();

        // clear the KV cache
-        llama_kv_cache_keep_seq(ctx, -1);
+        llama_kv_cache_tokens_rm(ctx, -1, -1);

        for (int j = 0; j < num_batches; ++j) {
            const int batch_start = start + j * n_batch;
@@ -568,7 +568,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        }

        // clear the KV cache
-        llama_kv_cache_keep_seq(ctx, -1);
+        llama_kv_cache_tokens_rm(ctx, -1, -1);

        auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab, params.n_threads);
        if (logits.empty()) {