	simple : minor style changes
Author: Georgi Gerganov
@@ -2,17 +2,18 @@
 
 import gguf
 import gguf_namemap as tmap
 
 import os
 import sys
 import struct
 import json
 import numpy as np
+import torch
 
 from typing import Any, List
 from pathlib import Path
-import torch
 from sentencepiece import SentencePieceProcessor
 
 
 #NDArray = np.ndarray[Any, Any]
 # compatible with python < 3.9
 NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
@@ -225,7 +226,7 @@ for part_name in part_names:
             sys.exit()
 
         n_dims = len(data.shape)
-        data_dtype = data.dtype 
+        data_dtype = data.dtype
 
         # if f32 desired, convert any float16 to float32
         if ftype == 0 and data.dtype == np.float16:
@@ -268,7 +269,6 @@ for part_name in part_names:
     for name in model_part.keys():
         data = model_part[name]
 
-
        old_dtype = data.dtype
 
         # we don't need these
@@ -295,7 +295,7 @@ for part_name in part_names:
             sys.exit()
 
         n_dims = len(data.shape)
-        data_dtype = data.dtype 
+        data_dtype = data.dtype
 
         # if f32 desired, convert any float16 to float32
         if ftype == 0 and data.dtype == np.float16:

@@ -6,177 +6,121 @@
 #include "gguf-llama.h"
 #include "build-info.h"
 
-#include <cassert>
-#include <cinttypes>
 #include <cmath>
 #include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <iostream>
 #include <string>
 #include <vector>
 
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined (_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#define NOMINMAX
-#include <windows.h>
-#include <signal.h>
-#endif
-
-
-
-int main(int argc, char ** argv)
-{
+int main(int argc, char ** argv) {
     gpt_params params;
 
-    //---------------------------------
-    // Print help :
-    //---------------------------------
-
-    if ( argc == 1 || argv[1][0] == '-' )
-    {
-        printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
+    if (argc == 1 || argv[1][0] == '-') {
+        printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
         return 1 ;
     }
 
-    //---------------------------------
-    // Load parameters :
-    //---------------------------------
-
-    if ( argc >= 2 )
-    {
+    if (argc >= 2) {
         params.model = argv[1];
     }
 
-    if ( argc >= 3 )
-    {
+    if (argc >= 3) {
         params.prompt = argv[2];
     }
 
-    if ( params.prompt.empty() )
-    {
+    if (params.prompt.empty()) {
         params.prompt = "Hello my name is";
     }
 
-    //---------------------------------
-    // Init LLM :
-    //---------------------------------
+    // init LLM
 
     llama_backend_init(params.numa);
 
     llama_context_params ctx_params = llama_context_default_params();
 
     llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);
-    
-    if ( model == NULL )
-    {
-        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
+
+    if (model == NULL) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
         return 1;
     }
 
     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
 
-    //---------------------------------
-    // Tokenize the prompt :
-    //---------------------------------
+    // tokenize the prompt
 
     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize( ctx , params.prompt , true );
+    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
 
-    const int max_context_size     = llama_n_ctx( ctx );
-    const int max_tokens_list_size = max_context_size - 4 ;
+    const int max_context_size     = llama_n_ctx(ctx);
+    const int max_tokens_list_size = max_context_size - 4;
 
-    if ( (int)tokens_list.size() > max_tokens_list_size )
-    {
-        fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
-             __func__ , (int)tokens_list.size() , max_tokens_list_size );
+    if ((int)tokens_list.size() > max_tokens_list_size) {
+        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
         return 1;
     }
 
-    fprintf( stderr, "\n\n" );
+    fprintf(stderr, "\n\n");
 
-    // Print the tokens from the prompt :
-
-    for( auto id : tokens_list )
-    {
-        printf( "%s" , llama_token_to_str( ctx , id ) );
+    for (auto id : tokens_list) {
+        fprintf(stderr, "%s", llama_token_to_str(ctx, id));
     }
 
-    fflush(stdout);
+    fflush(stderr);
 
-
-    //---------------------------------
-    // Main prediction loop :
-    //---------------------------------
+    // main loop
 
     // The LLM keeps a contextual cache memory of previous token evaluation.
     // Usually, once this cache is full, it is required to recompute a compressed context based on previous
     // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
     // example, we will just stop the loop once this cache is full or once an end of stream is detected.
 
-    while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
-    {
-        //---------------------------------
-        // Evaluate the tokens :
-        //---------------------------------
+    while (llama_get_kv_cache_token_count(ctx) < max_context_size) {
+        // evaluate the transformer
 
-        if ( llama_eval( ctx , tokens_list.data() , int(tokens_list.size()) , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
-        {
-            fprintf( stderr,  "%s : failed to eval\n" , __func__ );
+        if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
             return 1;
         }
 
         tokens_list.clear();
 
-        //---------------------------------
-        // Select the best prediction :
-        //---------------------------------
+        // sample the next token
 
         llama_token new_token_id = 0;
 
-        auto logits  = llama_get_logits( ctx );
-        auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
+        auto logits  = llama_get_logits(ctx);
+        auto n_vocab = llama_n_vocab(ctx);
 
         std::vector<llama_token_data> candidates;
-        candidates.reserve( n_vocab );
+        candidates.reserve(n_vocab);
 
-        for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
-        {
-            candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
         }
 
         llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
 
-        // Select it using the "Greedy sampling" method :
-        new_token_id = llama_sample_token_greedy( ctx , &candidates_p );
-
+        new_token_id = llama_sample_token_greedy(ctx , &candidates_p);
 
         // is it an end of stream ?
-        if ( new_token_id == llama_token_eos() )
-        {
+        if (new_token_id == llama_token_eos()) {
             fprintf(stderr, " [end of text]\n");
             break;
         }
 
-        // Print the new token :
-        printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
-        fflush( stdout );
+        // print the new token :
+        printf("%s", llama_token_to_str(ctx, new_token_id));
+        fflush(stdout);
 
-        // Push this new token for next evaluation :
-        tokens_list.push_back( new_token_id );
+        // push this new token for next evaluation
+        tokens_list.push_back(new_token_id);
 
-    } // wend of main loop
+    }
 
-    llama_free( ctx );
-    llama_free_model( model );
+    llama_free(ctx);
+    llama_free_model(model);
 
     llama_backend_free();
 
     return 0;
 }
-
-// EOF
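Aside (not part of the commit): the sampling step in the restyled loop above is plain greedy decoding, i.e. an argmax over the vocabulary logits. A minimal standalone C++ sketch of that idea, with logits and n_vocab mirroring the names used in the example:

// sketch only: greedy sampling picks the index of the largest logit
#include <algorithm>

static int greedy_argmax(const float * logits, int n_vocab) {
    // max_element returns a pointer to the largest logit; subtracting the base gives the token id
    return (int) (std::max_element(logits, logits + n_vocab) - logits);
}
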
@@ -2,180 +2,125 @@
 #define _GNU_SOURCE
 #endif
 
-#include "common.h"
-#include "llama.h"
 #include "build-info.h"
 
-#include <cassert>
-#include <cinttypes>
+#include "common.h"
+#include "llama.h"
 
 #include <cmath>
 #include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <iostream>
 #include <string>
 #include <vector>
 
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined (_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#define NOMINMAX
-#include <windows.h>
-#include <signal.h>
-#endif
-
-
-
-int main(int argc, char ** argv)
-{
+int main(int argc, char ** argv) {
     gpt_params params;
 
-    //---------------------------------
-    // Print help :
-    //---------------------------------
-
-    if ( argc == 1 || argv[1][0] == '-' )
-    {
-        printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
+    if (argc == 1 || argv[1][0] == '-') {
+        printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
         return 1 ;
     }
 
-    //---------------------------------
-    // Load parameters :
-    //---------------------------------
-
-    if ( argc >= 2 )
-    {
+    if (argc >= 2) {
         params.model = argv[1];
     }
 
-    if ( argc >= 3 )
-    {
+    if (argc >= 3) {
         params.prompt = argv[2];
     }
 
-    if ( params.prompt.empty() )
-    {
+    if (params.prompt.empty()) {
         params.prompt = "Hello my name is";
     }
 
-    //---------------------------------
-    // Init LLM :
-    //---------------------------------
+    // init LLM
 
     llama_backend_init(params.numa);
 
     llama_model * model;
     llama_context * ctx;
 
-    std::tie(model, ctx) = llama_init_from_gpt_params( params );
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
 
-    if ( model == NULL )
-    {
-        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: unable to load model\n", __func__);
         return 1;
     }
 
-    //---------------------------------
-    // Tokenize the prompt :
-    //---------------------------------
+    // tokenize the prompt
 
     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize( ctx , params.prompt , true );
+    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
 
-    const int max_context_size     = llama_n_ctx( ctx );
-    const int max_tokens_list_size = max_context_size - 4 ;
+    const int max_context_size     = llama_n_ctx(ctx);
+    const int max_tokens_list_size = max_context_size - 4;
 
-    if ( (int)tokens_list.size() > max_tokens_list_size )
-    {
-        fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
-             __func__ , (int)tokens_list.size() , max_tokens_list_size );
+    if ((int)tokens_list.size() > max_tokens_list_size) {
+        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
         return 1;
     }
 
-    fprintf( stderr, "\n\n" );
+    fprintf(stderr, "\n\n");
 
-    // Print the tokens from the prompt :
-
-    for( auto id : tokens_list )
-    {
-        printf( "%s" , llama_token_to_str( ctx , id ) );
+    for (auto id : tokens_list) {
+        fprintf(stderr, "%s", llama_token_to_str(ctx, id));
     }
 
-    fflush(stdout);
+    fflush(stderr);
 
-
-    //---------------------------------
-    // Main prediction loop :
-    //---------------------------------
+    // main loop
 
     // The LLM keeps a contextual cache memory of previous token evaluation.
     // Usually, once this cache is full, it is required to recompute a compressed context based on previous
     // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
     // example, we will just stop the loop once this cache is full or once an end of stream is detected.
 
-    while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
-    {
-        //---------------------------------
-        // Evaluate the tokens :
-        //---------------------------------
+    while (llama_get_kv_cache_token_count( ctx ) < max_context_size) {
+        // evaluate the transformer
 
-        if ( llama_eval( ctx , tokens_list.data() , int(tokens_list.size()) , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
-        {
-            fprintf( stderr,  "%s : failed to eval\n" , __func__ );
+        if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
             return 1;
         }
 
         tokens_list.clear();
 
-        //---------------------------------
-        // Select the best prediction :
-        //---------------------------------
+        // sample the next token
 
         llama_token new_token_id = 0;
 
-        auto logits  = llama_get_logits( ctx );
-        auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
+        auto logits  = llama_get_logits(ctx);
+        auto n_vocab = llama_n_vocab(ctx);
 
         std::vector<llama_token_data> candidates;
-        candidates.reserve( n_vocab );
+        candidates.reserve(n_vocab);
 
-        for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
-        {
-            candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
         }
 
         llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
 
-        // Select it using the "Greedy sampling" method :
-        new_token_id = llama_sample_token_greedy( ctx , &candidates_p );
-
+        new_token_id = llama_sample_token_greedy(ctx , &candidates_p);
 
         // is it an end of stream ?
-        if ( new_token_id == llama_token_eos() )
-        {
+        if (new_token_id == llama_token_eos()) {
             fprintf(stderr, " [end of text]\n");
             break;
         }
 
-        // Print the new token :
-        printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
-        fflush( stdout );
+        // print the new token :
+        printf("%s", llama_token_to_str(ctx, new_token_id));
+        fflush(stdout);
 
-        // Push this new token for next evaluation :
-        tokens_list.push_back( new_token_id );
+        // push this new token for next evaluation
+        tokens_list.push_back(new_token_id);
 
-    } // wend of main loop
+    }
 
-    llama_free( ctx );
-    llama_free_model( model );
+    llama_free(ctx);
+    llama_free_model(model);
 
     llama_backend_free();
 
     return 0;
 }
-
-// EOF
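Note on the std::tie line in the hunk above: llama_init_from_gpt_params hands back the model and the context together, and std::tie assigns both outputs in a single statement. A self-contained sketch of that unpacking pattern (the load_pair helper below is made up for illustration and is not part of llama.cpp):

// sketch of the std::tie unpacking pattern used above
#include <string>
#include <tuple>
#include <utility>

static std::pair<std::string, int> load_pair() {
    return { "model", 42 };
}

int main() {
    std::string name;
    int handle = 0;
    std::tie(name, handle) = load_pair(); // name == "model", handle == 42
    return 0;
}
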
@@ -5,7 +5,9 @@
 
 #ifndef GGUF_UTIL_H
 #define GGUF_UTIL_H
+
 #include "ggml.h"
+
 #include <cstdio>
 #include <cstdint>
 #include <cerrno>
@@ -62,7 +64,6 @@ static std::string format(const char * fmt, ...) {
     return std::string(buf.data(), size);
 }
 
-
 template<typename T>
 static std::string to_string(const T & val) {
     std::stringstream ss;
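For context, the to_string helper whose opening lines appear in the hunk above follows the usual stringstream idiom. A self-contained sketch of that idiom (illustration only, not copied verbatim from gguf-util.h):

// sketch of the stringstream-based to_string idiom
#include <sstream>
#include <string>

template <typename T>
static std::string to_string_sketch(const T & val) {
    std::stringstream ss;
    ss << val; // requires operator<< to be defined for T
    return ss.str();
}

// usage: to_string_sketch(3.5) yields the string "3.5"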