More efficient Hellaswag implementation (#2677)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2025-10-30 08:42:00 +00:00 · 2023-08-20 16:44:46 +03:00
parent 1f0bccb279
commit 5e9ff54a67
1 changed files with 70 additions and 22 deletions
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -5,6 +5,7 @@
 #include <cmath>
 #include <ctime>
 #include <sstream>
 #include <cstring>
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -209,17 +210,19 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
    double acc = 0.0f;
    const int n_vocab = llama_n_vocab(ctx);
    std::vector<float> tok_logits(n_vocab);
    for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
        // Tokenize the context to count tokens
        std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, prepend_bos);
        size_t context_size = context_embd.size();
-        for (size_t ending_idx=0;ending_idx<4;ending_idx++) {
+        // Do the 1st ending
-
+        // In this case we include the context when evaluating
-            // Tokenize the query
+        auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], prepend_bos);
-            std::vector<int> query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[ending_idx], prepend_bos);
+        auto query_size = query_embd.size();
-            size_t query_size = query_embd.size();
+        //printf("First query: %d\n",(int)query_size);
        // Stop if the query won't fit in the context window
        if (query_size > (size_t)params.n_ctx) {
@@ -238,19 +241,64 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
            return;
        }
-            const auto query_logits = llama_get_logits(ctx);
+        auto query_logits = llama_get_logits(ctx);
            std::vector<float> logits;
            logits.insert(logits.end(), query_logits, query_logits + query_size * n_vocab);
-            hs_data[task_idx].ending_logprob_count[ending_idx] = 0;
+        std::memcpy(tok_logits.data(), query_logits + (context_size-1)*n_vocab, n_vocab*sizeof(float));
-            hs_data[task_idx].ending_logprob[ending_idx] = 0.0f;
+        const auto first_probs = softmax(tok_logits);
        hs_data[task_idx].ending_logprob_count[0] = 1;
        hs_data[task_idx].ending_logprob[0] = std::log(first_probs[query_embd[context_size]]);
        // Calculate the logprobs over the ending
-            for (size_t j = context_size-1; j < query_size - 1; j++) {
+        for (size_t j = context_size; j < query_size - 1; j++) {
-                // Calculate probability of next token, given the previous ones.
+
-                const std::vector<float> tok_logits(
+            std::memcpy(tok_logits.data(), query_logits + j*n_vocab, n_vocab*sizeof(float));
-                    logits.begin() + (j + 0) * n_vocab,
+
-                    logits.begin() + (j + 1) * n_vocab);
+            const float prob = softmax(tok_logits)[query_embd[j + 1]];
            hs_data[task_idx].ending_logprob[0] += std::log(prob);
            hs_data[task_idx].ending_logprob_count[0]++;
        }
        // Calculate the mean token logprob for acc_norm
        hs_data[task_idx].ending_logprob[0] /= hs_data[task_idx].ending_logprob_count[0];
        // Do the remaining endings
        // For these, we use the bare ending with n_past = context_size
        //
        for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) {
            // Tokenize the query
            query_embd = ::llama_tokenize(ctx, hs_data[task_idx].ending[ending_idx], false);
            query_size = query_embd.size();
            //printf("Second query: %d\n",(int)query_size);
            // Stop if the context plus this ending won't fit in the context window
            if (context_size + query_size > (size_t)params.n_ctx) {
                fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
                return;
            }
            // Speed up small evaluations by evaluating at least 32 tokens
            // No, resizing to 32 is actually slightly slower (at least on CUDA)
            //if (query_size < 32) {
            //    query_embd.resize(32);
            //}
            // Evaluate the query
            if (llama_eval(ctx, query_embd.data(), query_embd.size(), context_size, params.n_threads)) {
                fprintf(stderr, "%s : failed to eval\n", __func__);
                return;
            }
            query_logits = llama_get_logits(ctx);
            hs_data[task_idx].ending_logprob_count[ending_idx] = 1;
            hs_data[task_idx].ending_logprob[ending_idx] = std::log(first_probs[query_embd[0]]);
            // Calculate the logprobs over the ending
            for (size_t j = 0; j < query_size - 1; j++) {
                std::memcpy(tok_logits.data(), query_logits + j*n_vocab, n_vocab*sizeof(float));
                const float prob = softmax(tok_logits)[query_embd[j + 1]];
@@ -267,9 +315,9 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        }
        // Find the ending with maximum logprob
-        size_t ending_logprob_max_idx = -1;
+        size_t ending_logprob_max_idx = 0;
-        double ending_logprob_max_val = -INFINITY;
+        double ending_logprob_max_val = hs_data[task_idx].ending_logprob[0];
-        for (size_t j=0; j < 4; j++) {
+        for (size_t j = 1; j < 4; j++) {
            if (hs_data[task_idx].ending_logprob[j] > ending_logprob_max_val) {
                ending_logprob_max_idx = j;
                ending_logprob_max_val =  hs_data[task_idx].ending_logprob[j];