Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)

	llama : sync gguf-llama.cpp with latest llama.cpp (#2608)
* llama : sync gguf-llama.cpp with latest llama.cpp
* minor : indentation + assert
* llama : refactor gguf_buffer and gguf_ctx_buffer
* llama : minor
@@ -8,14 +8,19 @@
#include <sstream>
#include <fstream>
#include <vector>
/*

#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

template<typename T>
static std::string to_string(const T & val) {
    std::stringstream ss;
    ss << val;
    return ss.str();
}
*/

void gguf_ex_write_str(std::ofstream & fout, const std::string & val) {
    const int32_t n = val.size();
    fout.write((const char *) &n, sizeof(n));
@@ -377,28 +382,28 @@ bool gguf_ex_read_2(const std::string & fname) {

    struct gguf_file file(fname.c_str(), "rb");
    gguf_mmap data_mmap(&file, 0, false);

    const int n_tensors = gguf_get_n_tensors(ctx);

    for (int i = 0; i < n_tensors; ++i) {
        const char * name             = gguf_get_tensor_name(ctx, i);
        const size_t offset      = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
        const char * name   = gguf_get_tensor_name(ctx, i);
        const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);

        struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);

        cur->data = static_cast<char *>(data_mmap.addr) + offset;

        // print first 10 elements
    const float * data = (const float *) cur->data;
        const float * data = (const float *) cur->data;

        printf("%s data[:10] : ", name);

        for (int j = 0; j < 10; ++j) {
        for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
            printf("%f ", data[j]);
        }

        printf("\n\n");
    }

fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
    fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));

    ggml_free(ctx_data);
    gguf_free(ctx);
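gguf_ex_write_str above stores each string as a 32-bit length followed by the raw bytes, with no terminator. A minimal sketch of the matching read side could look like the following; the helper name gguf_ex_read_str and the use of std::ifstream are illustrative and not part of this commit:

#include <cstdint>
#include <fstream>
#include <string>

// hypothetical counterpart to gguf_ex_write_str: read the int32_t length,
// then that many raw bytes (no terminator is stored in the file)
static std::string gguf_ex_read_str(std::ifstream & fin) {
    int32_t n = 0;
    fin.read((char *) &n, sizeof(n));
    std::string val(n, '\0');
    fin.read(&val[0], n);
    return val;
}
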
@@ -38,6 +38,9 @@ struct ggml_metal_context;
struct ggml_metal_context * ggml_metal_init(int n_cb);
void ggml_metal_free(struct ggml_metal_context * ctx);

void * ggml_metal_host_malloc(size_t n);
void   ggml_metal_host_free  (void * data);

// set the number of command buffers to use
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
							
								
								
									
ggml-metal.m (15 changed lines)
@@ -224,6 +224,21 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    free(ctx);
}

void * ggml_metal_host_malloc(size_t n) {
    void * data = NULL;
    const int result = posix_memalign((void **) &data, getpagesize(), n);
    if (result != 0) {
        fprintf(stderr, "%s: error: posix_memalign failed\n", __func__);
        return NULL;
    }

    return data;
}

void ggml_metal_host_free(void * data) {
    free(data);
}

void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
    ctx->n_cb = n_cb;
}
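The new host allocator returns page-aligned memory from posix_memalign and NULL on failure, and it should be released through the matching ggml_metal_host_free. A minimal usage sketch, assuming a GGML_USE_METAL build; the 16 MiB size is arbitrary:

#include <stdio.h>

#include "ggml-metal.h"

int main(void) {
    const size_t n = 16u*1024u*1024u;        // arbitrary 16 MiB buffer
    void * buf = ggml_metal_host_malloc(n);  // page-aligned, NULL on failure
    if (buf == NULL) {
        fprintf(stderr, "host malloc of %zu bytes failed\n", n);
        return 1;
    }
    // ... fill the buffer / register it with the Metal context ...
    ggml_metal_host_free(buf);               // release with the matching free
    return 0;
}
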
							
								
								
									
gguf-llama.cpp (989 changed lines)
File diff suppressed because it is too large
							
								
								
									
gguf-llama.h (28 changed lines)
@@ -41,10 +41,6 @@
#define LLAMA_SUPPORTS_GPU_OFFLOAD
#endif

#ifndef LLAMA_DEFAULT_RMS_EPS
#define LLAMA_DEFAULT_RMS_EPS 5e-6f
#endif

#ifdef __cplusplus
extern "C" {
#endif
@@ -74,12 +70,23 @@ extern "C" {

    typedef void (*llama_progress_callback)(float progress, void *ctx);

   struct llama_context_params {
    enum llama_log_level {
        LLAMA_LOG_LEVEL_ERROR = 2,
        LLAMA_LOG_LEVEL_WARN  = 3,
        LLAMA_LOG_LEVEL_INFO  = 4
    };

    // Signature for logging events
    // Note that text includes the new line character at the end for most events.
    // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
    // if it exists.
    // It might not exist for progress report where '.' is output repeatedly.
    typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);

    struct llama_context_params {
        uint32_t seed;         // RNG seed, -1 for random
        int32_t  n_ctx;        // text context
        int32_t  n_batch;      // prompt processing batch size
        int32_t  n_gqa;        // grouped-query attention (TEMP - will be moved to model hparams)
        float    rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
        int32_t  n_gpu_layers; // number of layers to store in VRAM
        int32_t  main_gpu;     // the GPU that is used for scratch and small tensors

@@ -96,6 +103,7 @@ extern "C" {

        // Keep the booleans together to avoid misalignment during copy-by-value.
        bool low_vram;   // if true, reduce VRAM usage at the cost of performance
        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
        bool f16_kv;     // use fp16 for KV cache
        bool logits_all; // the llama_eval() call computes all logits, not just the last one
        bool vocab_only; // only load the vocabulary, no weights
@@ -129,7 +137,7 @@ extern "C" {
    // model quantization parameters
    typedef struct llama_model_quantize_params {
        int nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
        enum llama_ftype   ftype;    // quantize to this llama_ftype
        enum llama_ftype ftype;      // quantize to this llama_ftype
        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
        bool quantize_output_tensor; // quantize output.weight
    } llama_model_quantize_params;
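The quantization parameters above form a small configuration struct; a minimal sketch of filling it in follows. It assumes the llama_model_quantize_default_params() and llama_model_quantize() entry points declared elsewhere in this header, and the file names are illustrative:

#include "gguf-llama.h"

// hypothetical helper, not part of this diff: configure and run the quantizer
static int quantize_example(void) {
    llama_model_quantize_params qparams = llama_model_quantize_default_params();

    qparams.nthread                = 0;                       // <= 0: use std::thread::hardware_concurrency()
    qparams.ftype                  = LLAMA_FTYPE_MOSTLY_Q4_0; // target quantization type
    qparams.allow_requantize       = false;                   // refuse already-quantized inputs
    qparams.quantize_output_tensor = true;                    // also quantize output.weight

    return llama_model_quantize("model-f16.gguf", "model-q4_0.gguf", &qparams);
}
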
@@ -182,6 +190,10 @@ extern "C" {
        int32_t n_eval;
    };

    // Set callback for all future logging events.
    // If this is not called, or NULL is supplied, everything is output on stderr.
    LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);

    LLAMA_API int llama_max_devices();

    LLAMA_API struct llama_context_params llama_context_default_params();
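The new logging hooks let callers route llama.cpp output to a custom sink. A minimal sketch of a user-side callback, with an illustrative name, that handles the trailing newline described in the llama_log_callback comment above:

#include <stdio.h>
#include <string.h>

#include "gguf-llama.h"

// hypothetical user-side logger: strip the trailing '\n' that most events
// carry, then tag the message with its level before writing to stderr
static void my_log_callback(enum llama_log_level level, const char * text, void * user_data) {
    (void) user_data;
    size_t len = strlen(text);
    if (len > 0 && text[len - 1] == '\n') {
        len--; // progress reports ('.') may arrive without a trailing newline
    }
    fprintf(stderr, "[llama %d] %.*s\n", (int) level, (int) len, text);
}

// during initialization; passing NULL instead keeps the default stderr output:
//     llama_log_set(my_log_callback, NULL);
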
							
								
								
									
gguf-util.h (97 changed lines)
@@ -64,13 +64,6 @@ static std::string format(const char * fmt, ...) {
    return std::string(buf.data(), size);
}

template<typename T>
static std::string to_string(const T & val) {
    std::stringstream ss;
    ss << val;
    return ss.str();
}

// TODO: can we merge this one and gguf_context?
struct gguf_file {
    // use FILE * so we don't have to re-open the file to mmap
@@ -474,94 +467,4 @@ struct gguf_mlock {
#endif
};

// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
struct gguf_buffer {
    uint8_t * addr = NULL;
    size_t size = 0;

    gguf_buffer() = default;

    void resize(size_t len) {
#ifdef GGML_USE_METAL
        free(addr);
        int result = posix_memalign((void **) &addr, getpagesize(), len);
        if (result == 0) {
            memset(addr, 0, len);
        }
        else {
            addr = NULL;
        }
#else
        delete[] addr;
        addr = new uint8_t[len];
#endif
        size = len;
    }

    ~gguf_buffer() {
#ifdef GGML_USE_METAL
        free(addr);
#else
        delete[] addr;
#endif
        addr = NULL;
    }

    // disable copy and move
    gguf_buffer(const gguf_buffer&) = delete;
    gguf_buffer(gguf_buffer&&) = delete;
    gguf_buffer& operator=(const gguf_buffer&) = delete;
    gguf_buffer& operator=(gguf_buffer&&) = delete;
};

#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
struct gguf_ctx_buffer {
    uint8_t * addr = NULL;
    bool is_cuda;
    size_t size = 0;

    gguf_ctx_buffer() = default;

    void resize(size_t size) {
        free();

        addr = (uint8_t *) ggml_cuda_host_malloc(size);
        if (addr) {
            is_cuda = true;
        }
        else {
            // fall back to pageable memory
            addr = new uint8_t[size];
            is_cuda = false;
        }
        this->size = size;
    }

    void free() {
        if (addr) {
            if (is_cuda) {
                ggml_cuda_host_free(addr);
            }
            else {
                delete[] addr;
            }
        }
        addr = NULL;
    }

    ~gguf_ctx_buffer() {
        free();
    }

    // disable copy and move
    gguf_ctx_buffer(const gguf_ctx_buffer&) = delete;
    gguf_ctx_buffer(gguf_ctx_buffer&&) = delete;
    gguf_ctx_buffer& operator=(const gguf_ctx_buffer&) = delete;
    gguf_ctx_buffer& operator=(gguf_ctx_buffer&&) = delete;
};
#else
typedef gguf_buffer gguf_ctx_buffer;
#endif

#endif
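The removed gguf_buffer/gguf_ctx_buffer (refactored into gguf-llama.cpp by this commit, per the commit message) are small RAII host buffers that prefer pinned memory when CUDA is available and fall back to a plain heap allocation otherwise. A minimal usage sketch, assuming a GGML_USE_CUBLAS build; the size is arbitrary:

// illustrative only: the type now lives in gguf-llama.cpp, not this header
static void ctx_buffer_example(void) {
    gguf_ctx_buffer buf;
    buf.resize(8 * 1024 * 1024);   // tries pinned ggml_cuda_host_malloc first
    if (buf.is_cuda) {
        // buf.addr is page-locked memory -> faster host-to-device copies
    } else {
        // fell back to a pageable new[] allocation
    }
    // ... stage tensor data in buf.addr ...
}   // the destructor releases the memory with the matching deallocator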