simple : minor style changes
@@ -2,17 +2,18 @@

import gguf
import gguf_namemap as tmap

import os
import sys
import struct
import json
import numpy as np
import torch

from typing import Any, List
from pathlib import Path
import torch
from sentencepiece import SentencePieceProcessor


#NDArray = np.ndarray[Any, Any]
# compatible with python < 3.9
NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
@@ -225,7 +226,7 @@ for part_name in part_names:
            sys.exit()

        n_dims = len(data.shape)
        data_dtype = data.dtype 
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data.dtype == np.float16:
@@ -268,7 +269,6 @@ for part_name in part_names:
    for name in model_part.keys():
        data = model_part[name]

    
        old_dtype = data.dtype

        # we don't need these
@@ -295,7 +295,7 @@ for part_name in part_names:
            sys.exit()

        n_dims = len(data.shape)
        data_dtype = data.dtype 
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data.dtype == np.float16:

@@ -6,177 +6,121 @@
#include "gguf-llama.h"
#include "build-info.h"

#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <signal.h>
#endif



int main(int argc, char ** argv)
{
int main(int argc, char ** argv) {
    gpt_params params;

    //---------------------------------
    // Print help :
    //---------------------------------

    if ( argc == 1 || argv[1][0] == '-' )
    {
        printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
    if (argc == 1 || argv[1][0] == '-') {
        printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
        return 1 ;
    }

    //---------------------------------
    // Load parameters :
    //---------------------------------

    if ( argc >= 2 )
    {
    if (argc >= 2) {
        params.model = argv[1];
    }

    if ( argc >= 3 )
    {
    if (argc >= 3) {
        params.prompt = argv[2];
    }

    if ( params.prompt.empty() )
    {
    if (params.prompt.empty()) {
        params.prompt = "Hello my name is";
    }

    //---------------------------------
    // Init LLM :
    //---------------------------------
    // init LLM

    llama_backend_init(params.numa);

    llama_context_params ctx_params = llama_context_default_params();

    llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);
    
    if ( model == NULL )
    {
        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );

    if (model == NULL) {
        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
        return 1;
    }

    llama_context * ctx = llama_new_context_with_model(model, ctx_params);

    //---------------------------------
    // Tokenize the prompt :
    //---------------------------------
    // tokenize the prompt

    std::vector<llama_token> tokens_list;
    tokens_list = ::llama_tokenize( ctx , params.prompt , true );
    tokens_list = ::llama_tokenize(ctx, params.prompt, true);

    const int max_context_size     = llama_n_ctx( ctx );
    const int max_tokens_list_size = max_context_size - 4 ;
    const int max_context_size     = llama_n_ctx(ctx);
    const int max_tokens_list_size = max_context_size - 4;

    if ( (int)tokens_list.size() > max_tokens_list_size )
    {
        fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
             __func__ , (int)tokens_list.size() , max_tokens_list_size );
    if ((int)tokens_list.size() > max_tokens_list_size) {
        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
        return 1;
    }

    fprintf( stderr, "\n\n" );
    fprintf(stderr, "\n\n");

    // Print the tokens from the prompt :

    for( auto id : tokens_list )
    {
        printf( "%s" , llama_token_to_str( ctx , id ) );
    for (auto id : tokens_list) {
        fprintf(stderr, "%s", llama_token_to_str(ctx, id));
    }

    fflush(stdout);
    fflush(stderr);


    //---------------------------------
    // Main prediction loop :
    //---------------------------------
    // main loop

    // The LLM keeps a contextual cache memory of previous token evaluation.
    // Usually, once this cache is full, it is required to recompute a compressed context based on previous
    // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
    // example, we will just stop the loop once this cache is full or once an end of stream is detected.

    while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
    {
        //---------------------------------
        // Evaluate the tokens :
        //---------------------------------
    while (llama_get_kv_cache_token_count(ctx) < max_context_size) {
        // evaluate the transformer

        if ( llama_eval( ctx , tokens_list.data() , int(tokens_list.size()) , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
        {
            fprintf( stderr,  "%s : failed to eval\n" , __func__ );
        if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return 1;
        }

        tokens_list.clear();

        //---------------------------------
        // Select the best prediction :
        //---------------------------------
        // sample the next token

        llama_token new_token_id = 0;

        auto logits  = llama_get_logits( ctx );
        auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
        auto logits  = llama_get_logits(ctx);
        auto n_vocab = llama_n_vocab(ctx);

        std::vector<llama_token_data> candidates;
        candidates.reserve( n_vocab );
        candidates.reserve(n_vocab);

        for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
        {
            candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
        }

        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

        // Select it using the "Greedy sampling" method :
        new_token_id = llama_sample_token_greedy( ctx , &candidates_p );

        new_token_id = llama_sample_token_greedy(ctx , &candidates_p);

        // is it an end of stream ?
        if ( new_token_id == llama_token_eos() )
        {
        if (new_token_id == llama_token_eos()) {
            fprintf(stderr, " [end of text]\n");
            break;
        }

        // Print the new token :
        printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
        fflush( stdout );
        // print the new token :
        printf("%s", llama_token_to_str(ctx, new_token_id));
        fflush(stdout);

        // Push this new token for next evaluation :
        tokens_list.push_back( new_token_id );
        // push this new token for next evaluation
        tokens_list.push_back(new_token_id);

    } // wend of main loop
    }

    llama_free( ctx );
    llama_free_model( model );
    llama_free(ctx);
    llama_free_model(model);

    llama_backend_free();

    return 0;
}

// EOF

@@ -2,180 +2,125 @@
#define _GNU_SOURCE
#endif

#include "common.h"
#include "llama.h"
#include "build-info.h"

#include <cassert>
#include <cinttypes>
#include "common.h"
#include "llama.h"

#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <signal.h>
#endif



int main(int argc, char ** argv)
{
int main(int argc, char ** argv) {
    gpt_params params;

    //---------------------------------
    // Print help :
    //---------------------------------

    if ( argc == 1 || argv[1][0] == '-' )
    {
        printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
    if (argc == 1 || argv[1][0] == '-') {
        printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
        return 1 ;
    }

    //---------------------------------
    // Load parameters :
    //---------------------------------

    if ( argc >= 2 )
    {
    if (argc >= 2) {
        params.model = argv[1];
    }

    if ( argc >= 3 )
    {
    if (argc >= 3) {
        params.prompt = argv[2];
    }

    if ( params.prompt.empty() )
    {
    if (params.prompt.empty()) {
        params.prompt = "Hello my name is";
    }

    //---------------------------------
    // Init LLM :
    //---------------------------------
    // init LLM

    llama_backend_init(params.numa);

    llama_model * model;
    llama_context * ctx;

    std::tie(model, ctx) = llama_init_from_gpt_params( params );
    std::tie(model, ctx) = llama_init_from_gpt_params(params);

    if ( model == NULL )
    {
        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
    if (model == NULL) {
        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        return 1;
    }

    //---------------------------------
    // Tokenize the prompt :
    //---------------------------------
    // tokenize the prompt

    std::vector<llama_token> tokens_list;
    tokens_list = ::llama_tokenize( ctx , params.prompt , true );
    tokens_list = ::llama_tokenize(ctx, params.prompt, true);

    const int max_context_size     = llama_n_ctx( ctx );
    const int max_tokens_list_size = max_context_size - 4 ;
    const int max_context_size     = llama_n_ctx(ctx);
    const int max_tokens_list_size = max_context_size - 4;

    if ( (int)tokens_list.size() > max_tokens_list_size )
    {
        fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
             __func__ , (int)tokens_list.size() , max_tokens_list_size );
    if ((int)tokens_list.size() > max_tokens_list_size) {
        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
        return 1;
    }

    fprintf( stderr, "\n\n" );
    fprintf(stderr, "\n\n");

    // Print the tokens from the prompt :

    for( auto id : tokens_list )
    {
        printf( "%s" , llama_token_to_str( ctx , id ) );
    for (auto id : tokens_list) {
        fprintf(stderr, "%s", llama_token_to_str(ctx, id));
    }

    fflush(stdout);
    fflush(stderr);


    //---------------------------------
    // Main prediction loop :
    //---------------------------------
    // main loop

    // The LLM keeps a contextual cache memory of previous token evaluation.
    // Usually, once this cache is full, it is required to recompute a compressed context based on previous
    // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
    // example, we will just stop the loop once this cache is full or once an end of stream is detected.

    while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
    {
        //---------------------------------
        // Evaluate the tokens :
        //---------------------------------
    while (llama_get_kv_cache_token_count( ctx ) < max_context_size) {
        // evaluate the transformer

        if ( llama_eval( ctx , tokens_list.data() , int(tokens_list.size()) , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
        {
            fprintf( stderr,  "%s : failed to eval\n" , __func__ );
        if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return 1;
        }

        tokens_list.clear();

        //---------------------------------
        // Select the best prediction :
        //---------------------------------
        // sample the next token

        llama_token new_token_id = 0;

        auto logits  = llama_get_logits( ctx );
        auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
        auto logits  = llama_get_logits(ctx);
        auto n_vocab = llama_n_vocab(ctx);

        std::vector<llama_token_data> candidates;
        candidates.reserve( n_vocab );
        candidates.reserve(n_vocab);

        for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
        {
            candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
        }

        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

        // Select it using the "Greedy sampling" method :
        new_token_id = llama_sample_token_greedy( ctx , &candidates_p );

        new_token_id = llama_sample_token_greedy(ctx , &candidates_p);

        // is it an end of stream ?
        if ( new_token_id == llama_token_eos() )
        {
        if (new_token_id == llama_token_eos()) {
            fprintf(stderr, " [end of text]\n");
            break;
        }

        // Print the new token :
        printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
        fflush( stdout );
        // print the new token :
        printf("%s", llama_token_to_str(ctx, new_token_id));
        fflush(stdout);

        // Push this new token for next evaluation :
        tokens_list.push_back( new_token_id );
        // push this new token for next evaluation
        tokens_list.push_back(new_token_id);

    } // wend of main loop
    }

    llama_free( ctx );
    llama_free_model( model );
    llama_free(ctx);
    llama_free_model(model);

    llama_backend_free();

    return 0;
}

// EOF

@@ -5,7 +5,9 @@

#ifndef GGUF_UTIL_H
#define GGUF_UTIL_H

#include "ggml.h"

#include <cstdio>
#include <cstdint>
#include <cerrno>
@@ -62,7 +64,6 @@ static std::string format(const char * fmt, ...) {
    return std::string(buf.data(), size);
}


template<typename T>
static std::string to_string(const T & val) {
    std::stringstream ss;