	simple : minor style changes
@@ -2,17 +2,18 @@

import gguf
import gguf_namemap as tmap

import os
import sys
import struct
import json
import numpy as np
import torch

from typing import Any, List
from pathlib import Path
import torch
from sentencepiece import SentencePieceProcessor


#NDArray = np.ndarray[Any, Any]
# compatible with python < 3.9
NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
@@ -225,7 +226,7 @@ for part_name in part_names:
            sys.exit()

        n_dims = len(data.shape)
        data_dtype = data.dtype
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data.dtype == np.float16:
@@ -268,7 +269,6 @@ for part_name in part_names:
    for name in model_part.keys():
        data = model_part[name]


        old_dtype = data.dtype

        # we don't need these
@@ -295,7 +295,7 @@ for part_name in part_names:
            sys.exit()

        n_dims = len(data.shape)
        data_dtype = data.dtype
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data.dtype == np.float16:

@@ -6,177 +6,121 @@
#include "gguf-llama.h"
#include "build-info.h"

#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <signal.h>
#endif



int main(int argc, char ** argv)
{
int main(int argc, char ** argv) {
    gpt_params params;

    //---------------------------------
    // Print help :
    //---------------------------------

    if ( argc == 1 || argv[1][0] == '-' )
    {
        printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
    if (argc == 1 || argv[1][0] == '-') {
        printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
        return 1 ;
    }

    //---------------------------------
    // Load parameters :
    //---------------------------------

    if ( argc >= 2 )
    {
    if (argc >= 2) {
        params.model = argv[1];
    }

    if ( argc >= 3 )
    {
    if (argc >= 3) {
        params.prompt = argv[2];
    }

    if ( params.prompt.empty() )
    {
    if (params.prompt.empty()) {
        params.prompt = "Hello my name is";
    }

    //---------------------------------
    // Init LLM :
    //---------------------------------
    // init LLM

    llama_backend_init(params.numa);

    llama_context_params ctx_params = llama_context_default_params();

    llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);

    if ( model == NULL )
    {
        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );

    if (model == NULL) {
        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
        return 1;
    }

    llama_context * ctx = llama_new_context_with_model(model, ctx_params);

    //---------------------------------
    // Tokenize the prompt :
    //---------------------------------
    // tokenize the prompt

    std::vector<llama_token> tokens_list;
    tokens_list = ::llama_tokenize( ctx , params.prompt , true );
    tokens_list = ::llama_tokenize(ctx, params.prompt, true);

    const int max_context_size     = llama_n_ctx( ctx );
    const int max_tokens_list_size = max_context_size - 4 ;
    const int max_context_size     = llama_n_ctx(ctx);
    const int max_tokens_list_size = max_context_size - 4;

    if ( (int)tokens_list.size() > max_tokens_list_size )
    {
        fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
             __func__ , (int)tokens_list.size() , max_tokens_list_size );
    if ((int)tokens_list.size() > max_tokens_list_size) {
        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
        return 1;
    }

    fprintf( stderr, "\n\n" );
    fprintf(stderr, "\n\n");

    // Print the tokens from the prompt :

    for( auto id : tokens_list )
    {
        printf( "%s" , llama_token_to_str( ctx , id ) );
    for (auto id : tokens_list) {
        fprintf(stderr, "%s", llama_token_to_str(ctx, id));
    }

    fflush(stdout);
    fflush(stderr);


    //---------------------------------
    // Main prediction loop :
    //---------------------------------
    // main loop

    // The LLM keeps a contextual cache memory of previous token evaluation.
    // Usually, once this cache is full, it is required to recompute a compressed context based on previous
    // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
    // example, we will just stop the loop once this cache is full or once an end of stream is detected.

    while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
    {
        //---------------------------------
        // Evaluate the tokens :
        //---------------------------------
    while (llama_get_kv_cache_token_count(ctx) < max_context_size) {
        // evaluate the transformer

        if ( llama_eval( ctx , tokens_list.data() , int(tokens_list.size()) , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
        {
            fprintf( stderr,  "%s : failed to eval\n" , __func__ );
        if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return 1;
        }

        tokens_list.clear();

        //---------------------------------
        // Select the best prediction :
        //---------------------------------
        // sample the next token

        llama_token new_token_id = 0;

        auto logits  = llama_get_logits( ctx );
        auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
        auto logits  = llama_get_logits(ctx);
        auto n_vocab = llama_n_vocab(ctx);

        std::vector<llama_token_data> candidates;
        candidates.reserve( n_vocab );
        candidates.reserve(n_vocab);

        for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
        {
            candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
        }

        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

        // Select it using the "Greedy sampling" method :
        new_token_id = llama_sample_token_greedy( ctx , &candidates_p );

        new_token_id = llama_sample_token_greedy(ctx , &candidates_p);

        // is it an end of stream ?
        if ( new_token_id == llama_token_eos() )
        {
        if (new_token_id == llama_token_eos()) {
            fprintf(stderr, " [end of text]\n");
            break;
        }

        // Print the new token :
        printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
        fflush( stdout );
        // print the new token :
        printf("%s", llama_token_to_str(ctx, new_token_id));
        fflush(stdout);

        // Push this new token for next evaluation :
        tokens_list.push_back( new_token_id );
        // push this new token for next evaluation
        tokens_list.push_back(new_token_id);

    } // wend of main loop
    }

    llama_free( ctx );
    llama_free_model( model );
    llama_free(ctx);
    llama_free_model(model);

    llama_backend_free();

    return 0;
}

// EOF

@@ -2,180 +2,125 @@
#define _GNU_SOURCE
#endif

#include "common.h"
#include "llama.h"
#include "build-info.h"

#include <cassert>
#include <cinttypes>
#include "common.h"
#include "llama.h"

#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <signal.h>
#endif



int main(int argc, char ** argv)
{
int main(int argc, char ** argv) {
    gpt_params params;

    //---------------------------------
    // Print help :
    //---------------------------------

    if ( argc == 1 || argv[1][0] == '-' )
    {
        printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
    if (argc == 1 || argv[1][0] == '-') {
        printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
        return 1 ;
    }

    //---------------------------------
    // Load parameters :
    //---------------------------------

    if ( argc >= 2 )
    {
    if (argc >= 2) {
        params.model = argv[1];
    }

    if ( argc >= 3 )
    {
    if (argc >= 3) {
        params.prompt = argv[2];
    }

    if ( params.prompt.empty() )
    {
    if (params.prompt.empty()) {
        params.prompt = "Hello my name is";
    }

    //---------------------------------
    // Init LLM :
    //---------------------------------
    // init LLM

    llama_backend_init(params.numa);

    llama_model * model;
    llama_context * ctx;

    std::tie(model, ctx) = llama_init_from_gpt_params( params );
    std::tie(model, ctx) = llama_init_from_gpt_params(params);

    if ( model == NULL )
    {
        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
    if (model == NULL) {
        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        return 1;
    }

    //---------------------------------
    // Tokenize the prompt :
    //---------------------------------
    // tokenize the prompt

    std::vector<llama_token> tokens_list;
    tokens_list = ::llama_tokenize( ctx , params.prompt , true );
    tokens_list = ::llama_tokenize(ctx, params.prompt, true);

    const int max_context_size     = llama_n_ctx( ctx );
    const int max_tokens_list_size = max_context_size - 4 ;
    const int max_context_size     = llama_n_ctx(ctx);
    const int max_tokens_list_size = max_context_size - 4;

    if ( (int)tokens_list.size() > max_tokens_list_size )
    {
        fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
             __func__ , (int)tokens_list.size() , max_tokens_list_size );
    if ((int)tokens_list.size() > max_tokens_list_size) {
        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
        return 1;
    }

    fprintf( stderr, "\n\n" );
    fprintf(stderr, "\n\n");

    // Print the tokens from the prompt :

    for( auto id : tokens_list )
    {
        printf( "%s" , llama_token_to_str( ctx , id ) );
    for (auto id : tokens_list) {
        fprintf(stderr, "%s", llama_token_to_str(ctx, id));
    }

    fflush(stdout);
    fflush(stderr);


    //---------------------------------
    // Main prediction loop :
    //---------------------------------
    // main loop

    // The LLM keeps a contextual cache memory of previous token evaluation.
    // Usually, once this cache is full, it is required to recompute a compressed context based on previous
    // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
    // example, we will just stop the loop once this cache is full or once an end of stream is detected.

    while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
    {
        //---------------------------------
        // Evaluate the tokens :
        //---------------------------------
    while (llama_get_kv_cache_token_count( ctx ) < max_context_size) {
        // evaluate the transformer

        if ( llama_eval( ctx , tokens_list.data() , int(tokens_list.size()) , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
        {
            fprintf( stderr,  "%s : failed to eval\n" , __func__ );
        if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return 1;
        }

        tokens_list.clear();

        //---------------------------------
        // Select the best prediction :
        //---------------------------------
        // sample the next token

        llama_token new_token_id = 0;

        auto logits  = llama_get_logits( ctx );
        auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
        auto logits  = llama_get_logits(ctx);
        auto n_vocab = llama_n_vocab(ctx);

        std::vector<llama_token_data> candidates;
        candidates.reserve( n_vocab );
        candidates.reserve(n_vocab);

        for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
        {
            candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
        }

        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

        // Select it using the "Greedy sampling" method :
        new_token_id = llama_sample_token_greedy( ctx , &candidates_p );

        new_token_id = llama_sample_token_greedy(ctx , &candidates_p);

        // is it an end of stream ?
        if ( new_token_id == llama_token_eos() )
        {
        if (new_token_id == llama_token_eos()) {
            fprintf(stderr, " [end of text]\n");
            break;
        }

        // Print the new token :
        printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
        fflush( stdout );
        // print the new token :
        printf("%s", llama_token_to_str(ctx, new_token_id));
        fflush(stdout);

        // Push this new token for next evaluation :
        tokens_list.push_back( new_token_id );
        // push this new token for next evaluation
        tokens_list.push_back(new_token_id);

    } // wend of main loop
    }

    llama_free( ctx );
    llama_free_model( model );
    llama_free(ctx);
    llama_free_model(model);

    llama_backend_free();

    return 0;
}

// EOF

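Because the rendered diff above interleaves the old-style and new-style lines without +/- markers, here is the new-style generation loop of the simple example collected in one piece. This is a sketch assembled from the hunks above: it keeps the llama.cpp calls exactly as they appear in the diff, but drops some of the comments and argument handling, so it is illustrative rather than a verbatim copy of the file.

// condensed sketch of the new-style simple example (not the full source)
#include "common.h"
#include "llama.h"

#include <cstdio>
#include <tuple>
#include <vector>

int main(int argc, char ** argv) {
    gpt_params params;

    if (argc < 2) {
        printf("usage: %s MODEL_PATH [PROMPT]\n", argv[0]);
        return 1;
    }
    params.model  = argv[1];
    params.prompt = argc >= 3 ? argv[2] : "Hello my name is";

    // init LLM
    llama_backend_init(params.numa);

    llama_model   * model = nullptr;
    llama_context * ctx   = nullptr;
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == NULL) {
        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        return 1;
    }

    // tokenize the prompt; the token list doubles as the pending batch
    std::vector<llama_token> tokens_list = ::llama_tokenize(ctx, params.prompt, true);

    // stop once the KV cache is full or an end-of-stream token is sampled
    while (llama_get_kv_cache_token_count(ctx) < llama_n_ctx(ctx)) {
        // evaluate the pending tokens, appending them to the KV cache
        if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return 1;
        }
        tokens_list.clear();

        // greedy sampling: build the candidate list from the logits
        auto logits  = llama_get_logits(ctx);
        auto n_vocab = llama_n_vocab(ctx);

        std::vector<llama_token_data> candidates;
        candidates.reserve(n_vocab);
        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
        }
        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

        const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
        if (new_token_id == llama_token_eos()) {
            break;
        }

        printf("%s", llama_token_to_str(ctx, new_token_id));
        fflush(stdout);

        // the sampled token becomes the next single-token batch
        tokens_list.push_back(new_token_id);
    }

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();

    return 0;
}

The greedy step simply takes the highest-logit candidate on every iteration, which is why the example needs no sampling parameters beyond the model and prompt.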
@@ -5,7 +5,9 @@

#ifndef GGUF_UTIL_H
#define GGUF_UTIL_H

#include "ggml.h"

#include <cstdio>
#include <cstdint>
#include <cerrno>
@@ -62,7 +64,6 @@ static std::string format(const char * fmt, ...) {
    return std::string(buf.data(), size);
}

template<typename T>
static std::string to_string(const T & val) {
    std::stringstream ss;

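The last hunk cuts off after the opening lines of the stringstream-based to_string helper in gguf-util.h. For context, a minimal self-contained sketch of what such a helper does is shown below; the body and the usage are assumptions based on the visible signature, not copied from the file.

#include <cstdio>
#include <sstream>
#include <string>

// assumed completion of the helper: stream the value and return the buffer
template<typename T>
static std::string to_string(const T & val) {
    std::stringstream ss;
    ss << val; // relies on T providing operator<< for streams
    return ss.str();
}

int main() {
    // e.g. turn a tensor dimension count into text for a log message
    printf("n_dims = %s\n", to_string(3).c_str());
    return 0;
}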