llama : unified KV cache + batch inference API

Georgi Gerganov
2023-09-18 10:08:22 +03:00
parent fad56936d4
commit d29e76937c
10 changed files with 315 additions and 236 deletions
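For context on the API this commit introduces (not part of the hunk below): a minimal sketch of decoding two independent sequences in a single llama_decode() call through the new llama_batch struct. The model path, token ids, and the exact llama_batch field layout are assumptions based on the llama.h of this branch; later revisions changed the struct, so treat this as illustrative rather than canonical.

```c
// Hedged sketch of the unified KV cache + batch inference API.
// Assumptions: llama.h from this branch; "model.gguf" and the token
// ids are placeholders; the llama_batch fields (token/pos/seq_id/
// logits) follow the struct added here and may differ later.
#include "llama.h"
#include <stdio.h>

int main(void) {
    llama_backend_init(false); // numa = false

    struct llama_context_params params = llama_context_default_params();
    struct llama_model   * model = llama_load_model_from_file("model.gguf", params);
    struct llama_context * ctx   = llama_new_context_with_model(model, params);

    // Two independent sequences packed into one batch. The unified KV
    // cache keeps them apart via per-token sequence ids, so both are
    // decoded in a single call.
    llama_token  token[4] = { 1, 15043, 1, 3186 }; // placeholder token ids
    llama_pos    pos  [4] = { 0, 1, 0, 1 };        // positions restart per sequence
    llama_seq_id seq  [4] = { 0, 0, 1, 1 };        // tokens 0-1 -> seq 0, 2-3 -> seq 1
    int8_t       want [4] = { 0, 1, 0, 1 };        // logits only for each sequence's last token

    struct llama_batch batch = {
        /*.n_tokens =*/ 4,
        /*.token    =*/ token,
        /*.embd     =*/ NULL,
        /*.pos      =*/ pos,
        /*.seq_id   =*/ seq,
        /*.logits   =*/ want,
        // remaining fields (used by llama_batch_get_one) zero-initialized
    };

    if (llama_decode(ctx, batch) != 0) {
        fprintf(stderr, "llama_decode() failed\n");
        return 1;
    }

    // Logits for batch positions 1 and 3 are now available,
    // e.g. via llama_get_logits_ith(ctx, 1) and llama_get_logits_ith(ctx, 3).

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```

The point of the unified cache is that both sequences share one KV buffer and one compute graph, so batching prompts from multiple users into a single decode call amortizes the per-call overhead.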

@@ -400,7 +400,7 @@ results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
     return {tokens, ppl, logit_history, prob_history};
 }
 
-std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
+std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int> & tokens, int n_past, int n_batch,
     int n_vocab, int n_thread) {
     std::vector<float> result;
     result.reserve(tokens.size() * n_vocab);