restore simple.cpp for now

2025-10-31 08:51:55 +00:00 · 2023-07-15 12:44:47 +02:00
parent 0d2b66c638
commit 5765d7a587
1 changed file with 87 additions and 142 deletions
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -1,14 +1,46 @@
-#include <stdio.h>
+#ifndef _GNU_SOURCE
 #define _GNU_SOURCE
 #endif
 #include "common.h"
 #include "llama.h"
 #include "build-info.h"
 #include <cassert>
 #include <cinttypes>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
 #include <ctime>
 #include <fstream>
 #include <iostream>
 #include <string>
 #include <vector>
-#include "llama.h"
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
 #include <signal.h>
 #include <unistd.h>
 #elif defined (_WIN32)
 #define WIN32_LEAN_AND_MEAN
 #define NOMINMAX
 #include <windows.h>
 #include <signal.h>
 #endif
-void generate_sequence(llama_context * ctx, int n_ctx, const std::vector<llama_token>& prompt_tokens, float temperature) {
+
-    // print the tokens from the prompt
+int main(int argc, char ** argv)
-    for (llama_token id : prompt_tokens) {
+{
-        printf("%s", llama_token_to_str(ctx, id));
+    gpt_params params;
    //---------------------------------
    // Print help :
    //---------------------------------
    if ( argc == 1 || argv[1][0] == '-' )
    {
        printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
        return 1 ;
    }
    //---------------------------------
@@ -75,164 +107,77 @@ void generate_sequence(llama_context * ctx, int n_ctx, const std::vector<llama_t
    fflush(stdout);
    // the maximum number of tokens to generate at a time
    // TODO: not supported, remove
    const int CUDA_MAX_TOKENS = 1;
    llama_token tokens_out[CUDA_MAX_TOKENS];
-    // current position in the context window
+    //---------------------------------
-    int n_past = 0;
+    // Main prediction loop :
    //---------------------------------
-    // number of tokens to generate
+    // The LLM keeps a contextual cache memory of previous token evaluation.
-    int n_tokens_out;
+    // Usually, once this cache is full, it is required to recompute a compressed context based on previous
    // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
    // example, we will just stop the loop once this cache is full or once an end of stream is detected.
-    // list of tokens to evaluate
+    while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
-    // note that at most llama_context_params::n_batch tokens can be evaluated at a time
+    {
-    std::vector<llama_token> token_list = prompt_tokens;
+        //---------------------------------
        // Evaluate the tokens :
        //---------------------------------
-    while (n_past < n_ctx) {
+        if ( llama_eval( ctx , tokens_list.data() , tokens_list.size() , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
-        // evaluate the tokens
+        {
-
+            fprintf( stderr,  "%s : failed to eval\n" , __func__ );
-        // llama_eval generates one token at a time
+            return 1;
        n_tokens_out = 1;
        // number of threads to use for CPU evaluation - ignored if compiled with CUDA support
        const int n_threads = 4;
        // note: llama_eval is not compatible with GPU sampling
        if (llama_eval(ctx, token_list.data(), token_list.size(), n_past, n_threads)) {
            fprintf(stderr, "%s : failed to eval\n", __func__ );
            exit(1);
        }
-        // perform sampling on the CPU
+        tokens_list.clear();
-        float * logits  = llama_get_logits(ctx);
+
-        auto n_vocab = llama_n_vocab(ctx);
+        //---------------------------------
        // Select the best prediction :
        //---------------------------------
        llama_token new_token_id = 0;
        auto logits  = llama_get_logits( ctx );
        auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
        // initialize candidate array from logits
        std::vector<llama_token_data> candidates;
-        candidates.reserve(n_vocab);
+        candidates.reserve( n_vocab );
-        for(llama_token token_id = 0 ; token_id < n_vocab ; token_id++) {
+
-            candidates.push_back(llama_token_data{ token_id, logits[token_id], 0.0f});
+        for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
        {
            candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
        }
        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-        // sample token
+        // Select it using the "Greedy sampling" method :
-        llama_sample_temperature(ctx, &candidates_p, temperature);
+        new_token_id = llama_sample_token_greedy( ctx , &candidates_p );
        tokens_out[0] = llama_sample_token(ctx, &candidates_p);
        // increment the position in the context window
        n_past += token_list.size() + n_tokens_out - 1;
        token_list.clear();
        // print the new tokens
        for (int i = 0; i < n_tokens_out; i++) {
            llama_token new_token_id = tokens_out[i];
        // is it an end of stream ?
-            if (new_token_id == llama_token_eos()) {
+        if ( new_token_id == llama_token_eos() )
        {
            fprintf(stderr, " [end of text]\n");
-                //return;
+            break;
        }
-            // print the new token :
+        // Print the new token :
-            printf("%s", llama_token_to_str(ctx, new_token_id));
+        printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
-        }
+        fflush( stdout );
        fflush(stdout);
-        // push the last new token for the next evaluation
+        // Push this new token for next evaluation :
-        token_list.push_back(tokens_out[n_tokens_out - 1]);
+        tokens_list.push_back( new_token_id );
    }
 }
-int main(int argc, char ** argv) {
+    } // wend of main loop
    if (argc < 2 || argv[1][0] == '-') {
        printf("usage: %s <model> <n_ctx> <n_gens> <temp> [prompt]\n", argv[0]);
        printf(" note: passing a temp parameter will enable GPU sampling\n");
        return 1 ;
    }
-    std::string model = argv[1];
+    llama_free( ctx );
-    struct llama_context_params lparams = llama_context_default_params();
+    llama_free_model( model );
-    if (argc >= 3) {
+    llama_backend_free();
        lparams.n_ctx = std::stoi(argv[2]);
    } else {
        lparams.n_ctx = 512;
    }
    int n_gens;
    if (argc >= 4) {
        n_gens = std::stoi(argv[3]);
    } else {
        n_gens = 1;
    }
    float temperature;
    if (argc >= 5) {
        temperature = std::stof(argv[4]);
    } else {
        temperature = 0.8f;
    }
    std::string prompt;
    if (argc >= 6) {
        prompt = argv[5];
    } else {
        prompt = "Hello my name is";
    }
    // initialize llama.cpp
    bool numa = false;
    llama_init_backend(numa);
    llama_model * lmodel  = llama_load_model_from_file(model.c_str(), lparams);
    if (lmodel == NULL) {
        fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, model.c_str());
        return 1;
    }
    llama_context * ctx = llama_new_context_with_model(lmodel, lparams);
    if (ctx == NULL) {
        fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, model.c_str());
        llama_free_model(lmodel);
        return 1;
    }
    // tokenize the prompt
    std::vector<llama_token> token_list(lparams.n_ctx);
    int prompt_tokens = llama_tokenize(ctx, prompt.c_str(), token_list.data(), token_list.size(), true);
    if (prompt_tokens <= 0) {
        fprintf(stderr, "%s: error: unable to tokenize prompt\n", __func__);
        return 1;
    }
    token_list.resize(prompt_tokens);
    const int max_context_size     = llama_n_ctx(ctx);
    const int max_tokens_list_size = max_context_size - 4 ;
    if ((int)token_list.size() > max_tokens_list_size) {
        fprintf( stderr, "%s: error: prompt too long (%d tokens, max %d)\n" ,
             __func__, (int)token_list.size(), max_tokens_list_size );
        return 1;
    }
    fprintf(stderr, "\n\n");
    // generate the sequences
    for (int i = 0; i < n_gens; i++) {
        printf("==== GENERATION %d ====\n", i + 1);
        generate_sequence(ctx, max_context_size, token_list, temperature);
        printf("\n\n");
    }
    llama_print_timings(ctx);
    llama_free(ctx);
    llama_backend_free();
    return 0;
 }
 // EOF