Merge branch 'master' into compilade/refactor-kv-cache

This commit is contained in:
Francis Couture-Harpin
2024-09-14 16:08:52 -04:00
144 changed files with 11344 additions and 6693 deletions

View File

@@ -1,21 +1,20 @@
+#include "arg.h"
 #include "ggml.h"
-#include "llama.h"
 #include "common.h"
 #include "ngram-cache.h"
+#include "sampling.h"
+#include "llama.h"
-#include <cmath>
 #include <cstdint>
 #include <cstdio>
 #include <fstream>
 #include <string>
 #include <vector>
 #include <unordered_map>
 int main(int argc, char ** argv){
     gpt_params params;
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
         return 1;
     }
@@ -106,7 +105,7 @@ int main(int argc, char ** argv){
     bool has_eos = false;
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
+    struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);
     std::vector<llama_token> draft;
@@ -130,9 +129,9 @@ int main(int argc, char ** argv){
     int i_dft = 0;
     while (true) {
         // sample from the target model
-        llama_token id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_dft);
+        llama_token id = gpt_sampler_sample(smpl, ctx, i_dft);
-        llama_sampling_accept(ctx_sampling, ctx, id, true);
+        gpt_sampler_accept(smpl, id, true);
         const std::string token_str = llama_token_to_piece(ctx, id);
@@ -241,10 +240,11 @@ int main(int argc, char ** argv){
     LOG_TEE("n_accept = %d\n", n_accept);
     LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
-    LOG_TEE("\ntarget:\n");
-    llama_print_timings(ctx);
+    LOG_TEE("\ntarget:\n\n");
+    gpt_perf_print(ctx, smpl);
+    gpt_sampler_free(smpl);
-    llama_sampling_free(ctx_sampling);
     llama_batch_free(batch_tgt);
     llama_free(ctx);